aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/.kunitconfig3
-rw-r--r--fs/ext4/Kconfig11
-rw-r--r--fs/ext4/Makefile4
-rw-r--r--fs/ext4/acl.c23
-rw-r--r--fs/ext4/acl.h5
-rw-r--r--fs/ext4/balloc.c93
-rw-r--r--fs/ext4/block_validity.c187
-rw-r--r--fs/ext4/crypto.c246
-rw-r--r--fs/ext4/dir.c125
-rw-r--r--fs/ext4/ext4.h869
-rw-r--r--fs/ext4/ext4_extents.h10
-rw-r--r--fs/ext4/ext4_jbd2.c114
-rw-r--r--fs/ext4/ext4_jbd2.h48
-rw-r--r--fs/ext4/extents.c1647
-rw-r--r--fs/ext4/extents_status.c36
-rw-r--r--fs/ext4/fast_commit.c2301
-rw-r--r--fs/ext4/fast_commit.h186
-rw-r--r--fs/ext4/file.c144
-rw-r--r--fs/ext4/fsmap.c17
-rw-r--r--fs/ext4/fsmap.h4
-rw-r--r--fs/ext4/fsync.c38
-rw-r--r--fs/ext4/hash.c33
-rw-r--r--fs/ext4/ialloc.c438
-rw-r--r--fs/ext4/indirect.c64
-rw-r--r--fs/ext4/inline.c400
-rw-r--r--fs/ext4/inode-test.c320
-rw-r--r--fs/ext4/inode.c1854
-rw-r--r--fs/ext4/ioctl.c920
-rw-r--r--fs/ext4/mballoc.c2307
-rw-r--r--fs/ext4/mballoc.h43
-rw-r--r--fs/ext4/migrate.c53
-rw-r--r--fs/ext4/mmp.c99
-rw-r--r--fs/ext4/move_extent.c54
-rw-r--r--fs/ext4/namei.c1300
-rw-r--r--fs/ext4/orphan.c652
-rw-r--r--fs/ext4/page-io.c60
-rw-r--r--fs/ext4/readpage.c69
-rw-r--r--fs/ext4/resize.c159
-rw-r--r--fs/ext4/super.c4696
-rw-r--r--fs/ext4/symlink.c78
-rw-r--r--fs/ext4/sysfs.c92
-rw-r--r--fs/ext4/truncate.h8
-rw-r--r--fs/ext4/verity.c153
-rw-r--r--fs/ext4/xattr.c265
-rw-r--r--fs/ext4/xattr.h27
-rw-r--r--fs/ext4/xattr_hurd.c52
-rw-r--r--fs/ext4/xattr_security.c1
-rw-r--r--fs/ext4/xattr_trusted.c1
-rw-r--r--fs/ext4/xattr_user.c1
49 files changed, 13843 insertions, 6467 deletions
diff --git a/fs/ext4/.kunitconfig b/fs/ext4/.kunitconfig
new file mode 100644
index 000000000000..bf51da7cd9fc
--- /dev/null
+++ b/fs/ext4/.kunitconfig
@@ -0,0 +1,3 @@
+CONFIG_KUNIT=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_KUNIT_TESTS=y
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 2a592e38cdfe..86699c8cab28 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -99,18 +99,17 @@ config EXT4_DEBUG
Enables run-time debugging support for the ext4 filesystem.
If you select Y here, then you will be able to turn on debugging
- with a command such as:
- echo 1 > /sys/module/ext4/parameters/mballoc_debug
+ using dynamic debug control for mb_debug() / ext_debug() msgs.
config EXT4_KUNIT_TESTS
- tristate "KUnit tests for ext4"
- select EXT4_FS
- depends on KUNIT
+ tristate "KUnit tests for ext4" if !KUNIT_ALL_TESTS
+ depends on EXT4_FS && KUNIT
+ default KUNIT_ALL_TESTS
help
This builds the ext4 KUnit tests.
KUnit tests run during boot and output the results to the debug log
- in TAP format (http://testanything.org/). Only useful for kernel devs
+ in TAP format (https://testanything.org/). Only useful for kernel devs
running KUnit test harness and are not for inclusion into a production
build.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 4ccb3c9189d8..72206a292676 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -9,10 +9,12 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
- super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
+ super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \
+ xattr_user.o fast_commit.o orphan.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
ext4-inode-test-objs += inode-test.o
obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o
ext4-$(CONFIG_FS_VERITY) += verity.o
+ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8c7bbf3e566d..57e82e25f8e2 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -139,16 +139,19 @@ fail:
/*
* Inode operation get_posix_acl().
*
- * inode->i_mutex: don't care
+ * inode->i_rwsem: don't care
*/
struct posix_acl *
-ext4_get_acl(struct inode *inode, int type)
+ext4_get_acl(struct inode *inode, int type, bool rcu)
{
int name_index;
char *value = NULL;
struct posix_acl *acl;
int retval;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -180,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type)
/*
* Set the access or default ACL of an inode.
*
- * inode->i_mutex: down unless called from ext4_new_inode
+ * inode->i_rwsem: down unless called from ext4_new_inode
*/
static int
__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
@@ -215,15 +218,15 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
value, size, xattr_flags);
kfree(value);
- if (!error) {
+ if (!error)
set_cached_acl(inode, type, acl);
- }
return error;
}
int
-ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type)
{
handle_t *handle;
int error, credits, retries = 0;
@@ -245,7 +248,7 @@ retry:
return PTR_ERR(handle);
if ((type == ACL_TYPE_ACCESS) && acl) {
- error = posix_acl_update_mode(inode, &mode, &acl);
+ error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl);
if (error)
goto out_stop;
if (mode != inode->i_mode)
@@ -256,7 +259,7 @@ retry:
if (!error && update_mode) {
inode->i_mode = mode;
inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ error = ext4_mark_inode_dirty(handle, inode);
}
out_stop:
ext4_journal_stop(handle);
@@ -268,8 +271,8 @@ out_stop:
/*
* Initialize the ACLs of a new inode. Called from ext4_new_inode.
*
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
+ * dir->i_rwsem: down
+ * inode->i_rwsem: up (access to inode is still exclusive)
*/
int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9b63f5416a2f..3219669732bf 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -55,8 +55,9 @@ static inline int ext4_acl_count(size_t size)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
/* acl.c */
-struct posix_acl *ext4_get_acl(struct inode *inode, int type);
-int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu);
+int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT4_FS_POSIX_ACL */
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 8fd0b3cdab4c..8ff4b9192a9f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -185,7 +185,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start, tmp;
- J_ASSERT_BH(bh, buffer_locked(bh));
+ ASSERT(buffer_locked(bh));
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
@@ -239,7 +239,7 @@ unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
- return num_clusters_in_group(sb, block_group) -
+ return num_clusters_in_group(sb, block_group) -
ext4_num_overhead_clusters(sb, block_group, gdp);
}
@@ -368,7 +368,12 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
- struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_group_info *grp;
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
+ grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return 0;
@@ -406,14 +411,16 @@ verified:
* ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
+ * @ignore_locked: ignore locked buffers
*
* Read the bitmap for a given block_group,and validate the
* bits for block/inode/inode tables are set in the bitmaps
*
- * Return buffer_head on success or NULL in case of failure.
+ * Return buffer_head on success or an ERR_PTR in case of failure.
*/
struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
{
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -441,6 +448,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
return ERR_PTR(-ENOMEM);
}
+ if (ignore_locked && buffer_locked(bh)) {
+ /* buffer under IO already, return if called for prefetching */
+ put_bh(bh);
+ return NULL;
+ }
+
if (bitmap_uptodate(bh))
goto verify;
@@ -487,10 +500,10 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
* submit the buffer_head for reading
*/
set_buffer_new(bh);
- trace_ext4_read_block_bitmap_load(sb, block_group);
- bh->b_end_io = ext4_end_bitmap_read;
- get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+ trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
+ (ignore_locked ? REQ_RAHEAD : 0),
+ ext4_end_bitmap_read);
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -502,7 +515,7 @@ out:
return ERR_PTR(err);
}
-/* Returns 0 on success, 1 on error */
+/* Returns 0 on success, -errno on error */
int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
struct buffer_head *bh)
{
@@ -516,10 +529,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
wait_on_buffer(bh);
ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
if (!buffer_uptodate(bh)) {
- ext4_set_errno(sb, EIO);
- ext4_error(sb, "Cannot read block bitmap - "
- "block_group = %u, block_bitmap = %llu",
- block_group, (unsigned long long) bh->b_blocknr);
+ ext4_error_err(sb, EIO, "Cannot read block bitmap - "
+ "block_group = %u, block_bitmap = %llu",
+ block_group, (unsigned long long) bh->b_blocknr);
ext4_mark_group_bitmap_corrupted(sb, block_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EIO;
@@ -535,7 +547,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh;
int err;
- bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
if (IS_ERR(bh))
return bh;
err = ext4_wait_block_bitmap(sb, block_group, bh);
@@ -615,27 +627,47 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
/**
* ext4_should_retry_alloc() - check if a block allocation should be retried
- * @sb: super block
- * @retries: number of attemps has been made
+ * @sb: superblock
+ * @retries: number of retry attempts made so far
*
- * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
- * it is profitable to retry the operation, this function will wait
- * for the current or committing transaction to complete, and then
- * return TRUE. We will only retry once.
+ * ext4_should_retry_alloc() is called when ENOSPC is returned while
+ * attempting to allocate blocks. If there's an indication that a pending
+ * journal transaction might free some space and allow another attempt to
+ * succeed, this function will wait for the current or committing transaction
+ * to complete and then return TRUE.
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
- (*retries)++ > 1 ||
- !EXT4_SB(sb)->s_journal)
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!sbi->s_journal)
return 0;
- smp_mb();
- if (EXT4_SB(sb)->s_mb_free_pending == 0)
+ if (++(*retries) > 3) {
+ percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit);
return 0;
+ }
+
+ /*
+ * if there's no indication that blocks are about to be freed it's
+ * possible we just missed a transaction commit that did so
+ */
+ smp_mb();
+ if (sbi->s_mb_free_pending == 0) {
+ if (test_opt(sb, DISCARD)) {
+ atomic_inc(&sbi->s_retry_alloc_pending);
+ flush_work(&sbi->s_discard_work);
+ atomic_dec(&sbi->s_retry_alloc_pending);
+ }
+ return ext4_has_free_clusters(sbi, 1, 0);
+ }
- jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
- jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+ /*
+ * it's possible we've just missed a transaction commit here,
+ * so ignore the returned status
+ */
+ ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id);
+ (void) jbd2_journal_force_commit_nested(sbi->s_journal);
return 1;
}
@@ -904,10 +936,11 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
return bg_start;
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
- colour = (current->pid % 16) *
+ colour = (task_pid_nr(current) % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
else
- colour = (current->pid % 16) * ((last_block - bg_start) / 16);
+ colour = (task_pid_nr(current) % 16) *
+ ((last_block - bg_start) / 16);
return bg_start + colour;
}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 0a734ffb4310..5504f72bbbbe 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -24,6 +24,7 @@ struct ext4_system_zone {
struct rb_node node;
ext4_fsblk_t start_blk;
unsigned int count;
+ u32 ino;
};
static struct kmem_cache *ext4_system_zone_cachep;
@@ -45,7 +46,8 @@ void ext4_exit_system_zone(void)
static inline int can_merge(struct ext4_system_zone *entry1,
struct ext4_system_zone *entry2)
{
- if ((entry1->start_blk + entry1->count) == entry2->start_blk)
+ if ((entry1->start_blk + entry1->count) == entry2->start_blk &&
+ entry1->ino == entry2->ino)
return 1;
return 0;
}
@@ -66,9 +68,9 @@ static void release_system_zone(struct ext4_system_blocks *system_blks)
*/
static int add_system_zone(struct ext4_system_blocks *system_blks,
ext4_fsblk_t start_blk,
- unsigned int count)
+ unsigned int count, u32 ino)
{
- struct ext4_system_zone *new_entry = NULL, *entry;
+ struct ext4_system_zone *new_entry, *entry;
struct rb_node **n = &system_blks->root.rb_node, *node;
struct rb_node *parent = NULL, *new_node = NULL;
@@ -79,30 +81,21 @@ static int add_system_zone(struct ext4_system_blocks *system_blks,
n = &(*n)->rb_left;
else if (start_blk >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
- else {
- if (start_blk + count > (entry->start_blk +
- entry->count))
- entry->count = (start_blk + count -
- entry->start_blk);
- new_node = *n;
- new_entry = rb_entry(new_node, struct ext4_system_zone,
- node);
- break;
- }
+ else /* Unexpected overlap of system zones. */
+ return -EFSCORRUPTED;
}
- if (!new_entry) {
- new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
- GFP_KERNEL);
- if (!new_entry)
- return -ENOMEM;
- new_entry->start_blk = start_blk;
- new_entry->count = count;
- new_node = &new_entry->node;
-
- rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &system_blks->root);
- }
+ new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+ GFP_KERNEL);
+ if (!new_entry)
+ return -ENOMEM;
+ new_entry->start_blk = start_blk;
+ new_entry->count = count;
+ new_entry->ino = ino;
+ new_node = &new_entry->node;
+
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &system_blks->root);
/* Can we merge to the left? */
node = rb_prev(new_node);
@@ -138,7 +131,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
printk(KERN_INFO "System zones: ");
rcu_read_lock();
- system_blks = rcu_dereference(sbi->system_blks);
+ system_blks = rcu_dereference(sbi->s_system_blks);
node = rb_first(&system_blks->root);
while (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
@@ -151,44 +144,6 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
printk(KERN_CONT "\n");
}
-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with filesystem metadata blocks.
- */
-static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
- struct ext4_system_blocks *system_blks,
- ext4_fsblk_t start_blk,
- unsigned int count)
-{
- struct ext4_system_zone *entry;
- struct rb_node *n;
-
- if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
- (start_blk + count < start_blk) ||
- (start_blk + count > ext4_blocks_count(sbi->s_es))) {
- sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
- return 0;
- }
-
- if (system_blks == NULL)
- return 1;
-
- n = system_blks->root.rb_node;
- while (n) {
- entry = rb_entry(n, struct ext4_system_zone, node);
- if (start_blk + count - 1 < entry->start_blk)
- n = n->rb_left;
- else if (start_blk >= (entry->start_blk + entry->count))
- n = n->rb_right;
- else {
- sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
- return 0;
- }
- }
- return 1;
-}
-
static int ext4_protect_reserved_inode(struct super_block *sb,
struct ext4_system_blocks *system_blks,
u32 ino)
@@ -218,17 +173,16 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
if (n == 0) {
i++;
} else {
- if (!ext4_data_block_valid_rcu(sbi, system_blks,
- map.m_pblk, n)) {
- ext4_error(sb, "blocks %llu-%llu from inode %u "
- "overlap system zone", map.m_pblk,
- map.m_pblk + map.m_len - 1, ino);
- err = -EFSCORRUPTED;
+ err = add_system_zone(system_blks, map.m_pblk, n, ino);
+ if (err < 0) {
+ if (err == -EFSCORRUPTED) {
+ EXT4_ERROR_INODE_ERR(inode, -err,
+ "blocks %llu-%llu from inode overlap system zone",
+ map.m_pblk,
+ map.m_pblk + map.m_len - 1);
+ }
break;
}
- err = add_system_zone(system_blks, map.m_pblk, n);
- if (err < 0)
- break;
i += n;
}
}
@@ -250,7 +204,7 @@ static void ext4_destroy_system_zone(struct rcu_head *rcu)
*
* The update of system_blks pointer in this function is protected by
* sb->s_umount semaphore. However we have to be careful as we can be
- * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * racing with ext4_inode_block_valid() calls reading system_blks rbtree
* protected only by RCU. That's why we first build the rbtree and then
* swap it in place.
*/
@@ -264,14 +218,6 @@ int ext4_setup_system_zone(struct super_block *sb)
int flex_size = ext4_flex_bg_size(sbi);
int ret;
- if (!test_opt(sb, BLOCK_VALIDITY)) {
- if (sbi->system_blks)
- ext4_release_system_zone(sb);
- return 0;
- }
- if (sbi->system_blks)
- return 0;
-
system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
if (!system_blks)
return -ENOMEM;
@@ -279,22 +225,25 @@ int ext4_setup_system_zone(struct super_block *sb)
for (i=0; i < ngroups; i++) {
cond_resched();
if (ext4_bg_has_super(sb, i) &&
- ((i < 5) || ((i % flex_size) == 0)))
- add_system_zone(system_blks,
+ ((i < 5) || ((i % flex_size) == 0))) {
+ ret = add_system_zone(system_blks,
ext4_group_first_block_no(sb, i),
- ext4_bg_num_gdb(sb, i) + 1);
+ ext4_bg_num_gdb(sb, i) + 1, 0);
+ if (ret)
+ goto err;
+ }
gdp = ext4_get_group_desc(sb, i, NULL);
ret = add_system_zone(system_blks,
- ext4_block_bitmap(sb, gdp), 1);
+ ext4_block_bitmap(sb, gdp), 1, 0);
if (ret)
goto err;
ret = add_system_zone(system_blks,
- ext4_inode_bitmap(sb, gdp), 1);
+ ext4_inode_bitmap(sb, gdp), 1, 0);
if (ret)
goto err;
ret = add_system_zone(system_blks,
ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group);
+ sbi->s_itb_per_group, 0);
if (ret)
goto err;
}
@@ -307,10 +256,10 @@ int ext4_setup_system_zone(struct super_block *sb)
/*
* System blks rbtree complete, announce it once to prevent racing
- * with ext4_data_block_valid() accessing the rbtree at the same
+ * with ext4_inode_block_valid() accessing the rbtree at the same
* time.
*/
- rcu_assign_pointer(sbi->system_blks, system_blks);
+ rcu_assign_pointer(sbi->s_system_blks, system_blks);
if (test_opt(sb, DEBUG))
debug_print_tree(sbi);
@@ -327,7 +276,7 @@ err:
*
* The update of system_blks pointer in this function is protected by
* sb->s_umount semaphore. However we have to be careful as we can be
- * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * racing with ext4_inode_block_valid() calls reading system_blks rbtree
* protected only by RCU. So we first clear the system_blks pointer and
* then free the rbtree only after RCU grace period expires.
*/
@@ -335,19 +284,27 @@ void ext4_release_system_zone(struct super_block *sb)
{
struct ext4_system_blocks *system_blks;
- system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks,
+ system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks,
lockdep_is_held(&sb->s_umount));
- rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL);
+ rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL);
if (system_blks)
call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
}
-int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
- unsigned int count)
+int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+ ext4_fsblk_t start_blk, unsigned int count)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_system_blocks *system_blks;
- int ret;
+ struct ext4_system_zone *entry;
+ struct rb_node *n;
+ int ret = 1;
+
+ if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
+ (start_blk + count > ext4_blocks_count(sbi->s_es)))
+ return 0;
/*
* Lock the system zone to prevent it being released concurrently
@@ -355,17 +312,43 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
* mount option.
*/
rcu_read_lock();
- system_blks = rcu_dereference(sbi->system_blks);
- ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk,
- count);
+ system_blks = rcu_dereference(sbi->s_system_blks);
+ if (system_blks == NULL)
+ goto out_rcu;
+
+ n = system_blks->root.rb_node;
+ while (n) {
+ entry = rb_entry(n, struct ext4_system_zone, node);
+ if (start_blk + count - 1 < entry->start_blk)
+ n = n->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = n->rb_right;
+ else {
+ ret = 0;
+ if (inode)
+ ret = (entry->ino == inode->i_ino);
+ break;
+ }
+ }
+out_rcu:
rcu_read_unlock();
return ret;
}
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with some other filesystem metadata blocks.
+ */
+int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count);
+}
+
int ext4_check_blockref(const char *function, unsigned int line,
struct inode *inode, __le32 *p, unsigned int max)
{
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
__le32 *bref = p;
unsigned int blk;
@@ -377,9 +360,7 @@ int ext4_check_blockref(const char *function, unsigned int line,
while (bref < p+max) {
blk = le32_to_cpu(*bref++);
if (blk &&
- unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- blk, 1))) {
- es->s_last_error_block = cpu_to_le64(blk);
+ unlikely(!ext4_inode_block_valid(inode, blk, 1))) {
ext4_error_inode(inode, function, line, blk,
"invalid block");
return -EFSCORRUPTED;
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
new file mode 100644
index 000000000000..e20ac0654b3f
--- /dev/null
+++ b/fs/ext4/crypto.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/quotaops.h>
+#include <linux/uuid.h>
+
+#include "ext4.h"
+#include "xattr.h"
+#include "ext4_jbd2.h"
+
+static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
+ const struct fscrypt_name *src)
+{
+ memset(dst, 0, sizeof(*dst));
+
+ dst->usr_fname = src->usr_fname;
+ dst->disk_name = src->disk_name;
+ dst->hinfo.hash = src->hash;
+ dst->hinfo.minor_hash = src->minor_hash;
+ dst->crypto_buf = src->crypto_buf;
+}
+
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+ int err;
+
+ err = fscrypt_setup_filename(dir, iname, lookup, &name);
+ if (err)
+ return err;
+
+ ext4_fname_from_fscrypt_name(fname, &name);
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ err = ext4_fname_setup_ci_filename(dir, iname, fname);
+#endif
+ return err;
+}
+
+int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
+ struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+ int err;
+
+ err = fscrypt_prepare_lookup(dir, dentry, &name);
+ if (err)
+ return err;
+
+ ext4_fname_from_fscrypt_name(fname, &name);
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
+#endif
+ return err;
+}
+
+void ext4_fname_free_filename(struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+
+ name.crypto_buf = fname->crypto_buf;
+ fscrypt_free_filename(&name);
+
+ fname->crypto_buf.name = NULL;
+ fname->usr_fname = NULL;
+ fname->disk_name.name = NULL;
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+#endif
+}
+
+static bool uuid_is_zero(__u8 u[16])
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ if (u[i])
+ return false;
+ return true;
+}
+
+int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg)
+{
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err, err2;
+ handle_t *handle;
+
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+
+ if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto pwsalt_err_exit;
+ }
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto pwsalt_err_journal;
+ lock_buffer(sbi->s_sbh);
+ generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+pwsalt_err_journal:
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
+ err = err2;
+pwsalt_err_exit:
+ mnt_drop_write_file(filp);
+ if (err)
+ return err;
+ }
+
+ if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16))
+ return -EFAULT;
+ return 0;
+}
+
+static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
+}
+
+static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
+ void *fs_data)
+{
+ handle_t *handle = fs_data;
+ int res, res2, credits, retries = 0;
+
+ /*
+ * Encrypting the root directory is not allowed because e2fsck expects
+ * lost+found to exist and be unencrypted, and encrypting the root
+ * directory would imply encrypting the lost+found directory as well as
+ * the filename "lost+found" itself.
+ */
+ if (inode->i_ino == EXT4_ROOT_INO)
+ return -EPERM;
+
+ if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
+ return -EINVAL;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EOPNOTSUPP;
+
+ res = ext4_convert_inline_data(inode);
+ if (res)
+ return res;
+
+ /*
+ * If a journal handle was specified, then the encryption context is
+ * being set on a new inode via inheritance and is part of a larger
+ * transaction to create the inode. Otherwise the encryption context is
+ * being set on an existing inode in its own transaction. Only in the
+ * latter case should the "retry on ENOSPC" logic be used.
+ */
+
+ if (handle) {
+ res = ext4_xattr_set_handle(handle, inode,
+ EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
+ if (!res) {
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ ext4_clear_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA);
+ /*
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
+ */
+ ext4_set_inode_flags(inode, false);
+ }
+ return res;
+ }
+
+ res = dquot_initialize(inode);
+ if (res)
+ return res;
+retry:
+ res = ext4_xattr_set_credits(inode, len, false /* is_create */,
+ &credits);
+ if (res)
+ return res;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
+ if (!res) {
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ /*
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
+ */
+ ext4_set_inode_flags(inode, false);
+ res = ext4_mark_inode_dirty(handle, inode);
+ if (res)
+ EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
+ }
+ res2 = ext4_journal_stop(handle);
+
+ if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ if (!res)
+ res = res2;
+ return res;
+}
+
+static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
+{
+ return EXT4_SB(sb)->s_dummy_enc_policy.policy;
+}
+
+static bool ext4_has_stable_inodes(struct super_block *sb)
+{
+ return ext4_has_feature_stable_inodes(sb);
+}
+
+static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
+ int *ino_bits_ret, int *lblk_bits_ret)
+{
+ *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
+ *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
+}
+
+const struct fscrypt_operations ext4_cryptops = {
+ .key_prefix = "ext4:",
+ .get_context = ext4_get_context,
+ .set_context = ext4_set_context,
+ .get_dummy_policy = ext4_get_dummy_policy,
+ .empty_dir = ext4_empty_dir,
+ .has_stable_inodes = ext4_has_stable_inodes,
+ .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
+};
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9aa1f75409b0..3985f8c33f95 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -55,6 +55,18 @@ static int is_dx_dir(struct inode *inode)
return 0;
}
+static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de)
+{
+ /* Check if . or .. , or skip if namelen is 0 */
+ if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') &&
+ (de->name[1] == '.' || de->name[1] == '\0'))
+ return true;
+ /* Check if this is a csum entry */
+ if (de->file_type == EXT4_FT_DIR_CSUM)
+ return true;
+ return false;
+}
+
/*
* Return 0 if the directory entry is OK, and 1 if there is a problem
*
@@ -73,16 +85,20 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
const int rlen = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
const int next_offset = ((char *) de - buf) + rlen;
+ bool fake = is_fake_dir_entry(de);
+ bool has_csum = ext4_has_metadata_csum(dir->i_sb);
- if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
+ if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
error_msg = "rec_len is smaller than minimal";
else if (unlikely(rlen % 4 != 0))
error_msg = "rec_len % 4 != 0";
- else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
+ else if (unlikely(rlen < ext4_dir_rec_len(de->name_len,
+ fake ? NULL : dir)))
error_msg = "rec_len is too small for name_len";
else if (unlikely(next_offset > size))
error_msg = "directory entry overrun";
- else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) &&
+ else if (unlikely(next_offset > size - ext4_dir_rec_len(1,
+ has_csum ? NULL : dir) &&
next_offset != size))
error_msg = "directory entry too close to block end";
else if (unlikely(le32_to_cpu(de->inode) >
@@ -94,15 +110,15 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
if (filp)
ext4_error_file(filp, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u, "
- "inode=%u, rec_len=%d, name_len=%d, size=%d",
+ "inode=%u, rec_len=%d, size=%d fake=%d",
error_msg, offset, le32_to_cpu(de->inode),
- rlen, de->name_len, size);
+ rlen, size, fake);
else
ext4_error_inode(dir, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u, "
- "inode=%u, rec_len=%d, name_len=%d, size=%d",
+ "inode=%u, rec_len=%d, size=%d fake=%d",
error_msg, offset, le32_to_cpu(de->inode),
- rlen, de->name_len, size);
+ rlen, size, fake);
return 1;
}
@@ -118,17 +134,15 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
struct buffer_head *bh = NULL;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
- if (IS_ENCRYPTED(inode)) {
- err = fscrypt_get_encryption_info(inode);
- if (err)
- return err;
- }
+ err = fscrypt_prepare_readdir(inode);
+ if (err)
+ return err;
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(file, ctx);
- if (err != ERR_BAD_DX_DIR) {
+ if (err != ERR_BAD_DX_DIR)
return err;
- }
+
/* Can we just clear INDEX flag to ignore htree information? */
if (!ext4_has_metadata_csum(sb)) {
/*
@@ -148,7 +162,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
}
if (IS_ENCRYPTED(inode)) {
- err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr);
if (err < 0)
return err;
}
@@ -226,7 +240,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len,
- sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
+ sb->s_blocksize) < ext4_dir_rec_len(1,
+ inode))
break;
i += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
@@ -267,7 +282,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
/* Directory is encrypted */
err = fscrypt_fname_disk_to_usr(inode,
- 0, 0, &de_name, &fstr);
+ EXT4_DIRENT_HASH(de),
+ EXT4_DIRENT_MINOR_HASH(de),
+ &de_name, &fstr);
de_name = fstr;
fstr.len = save_len;
if (err)
@@ -286,7 +303,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
goto done;
brelse(bh);
bh = NULL;
- offset = 0;
}
done:
err = 0;
@@ -392,11 +408,11 @@ struct fname {
__u32 inode;
__u8 name_len;
__u8 file_type;
- char name[0];
+ char name[];
};
/*
- * This functoin implements a non-recursive way of freeing all of the
+ * This function implements a non-recursive way of freeing all of the
* nodes in the red-black tree.
*/
static void free_rb_tree_fname(struct rb_root *root)
@@ -499,7 +515,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
/*
* This is a helper function for ext4_dx_readdir. It calls filldir
- * for all entres on the fname linked list. (Normally there is only
+ * for all entries on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
static int call_filldir(struct file *file, struct dir_context *ctx,
@@ -534,7 +550,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
struct dir_private_info *info = file->private_data;
struct inode *inode = file_inode(file);
struct fname *fname;
- int ret;
+ int ret = 0;
if (!info) {
info = ext4_htree_create_dir_info(file, ctx->pos);
@@ -582,7 +598,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
- return ret;
+ goto finished;
if (ret == 0) {
ctx->pos = ext4_get_htree_eof(file);
break;
@@ -613,14 +629,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
}
finished:
info->last_pos = ctx->pos;
- return 0;
-}
-
-static int ext4_dir_open(struct inode * inode, struct file * filp)
-{
- if (IS_ENCRYPTED(inode))
- return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
- return 0;
+ return ret < 0 ? ret : 0;
}
static int ext4_release_dir(struct inode *inode, struct file *filp)
@@ -639,7 +648,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
unsigned int offset = 0;
char *top;
- de = (struct ext4_dir_entry_2 *)buf;
+ de = buf;
top = buf + buf_size;
while ((char *) de < top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
@@ -664,57 +673,5 @@ const struct file_operations ext4_dir_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.fsync = ext4_sync_file,
- .open = ext4_dir_open,
.release = ext4_release_dir,
};
-
-#ifdef CONFIG_UNICODE
-static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
- const char *str, const struct qstr *name)
-{
- struct qstr qstr = {.name = str, .len = len };
- const struct dentry *parent = READ_ONCE(dentry->d_parent);
- const struct inode *inode = READ_ONCE(parent->d_inode);
-
- if (!inode || !IS_CASEFOLDED(inode) ||
- !EXT4_SB(inode->i_sb)->s_encoding) {
- if (len != name->len)
- return -1;
- return memcmp(str, name->name, len);
- }
-
- return ext4_ci_compare(inode, name, &qstr, false);
-}
-
-static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
-{
- const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
- const struct unicode_map *um = sbi->s_encoding;
- const struct inode *inode = READ_ONCE(dentry->d_inode);
- unsigned char *norm;
- int len, ret = 0;
-
- if (!inode || !IS_CASEFOLDED(inode) || !um)
- return 0;
-
- norm = kmalloc(PATH_MAX, GFP_ATOMIC);
- if (!norm)
- return -ENOMEM;
-
- len = utf8_casefold(um, str, norm, PATH_MAX);
- if (len < 0) {
- if (ext4_has_strict_mode(sbi))
- ret = -EINVAL;
- goto out;
- }
- str->hash = full_name_hash(dentry, norm, len);
-out:
- kfree(norm);
- return ret;
-}
-
-const struct dentry_operations ext4_dentry_ops = {
- .d_hash = ext4_d_hash,
- .d_compare = ext4_d_compare,
-};
-#endif
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 61b37a052052..8d5453852f98 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -17,6 +17,7 @@
#ifndef _EXT4_H
#define _EXT4_H
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
@@ -27,7 +28,6 @@
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
-#include <linux/version.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/blockgroup_lock.h>
@@ -36,6 +36,7 @@
#include <crypto/hash.h>
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
+#include <linux/fiemap.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
@@ -80,16 +81,34 @@
#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
+ /*
+ * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c
+ */
+#define EXT_DEBUG__
+
/*
- * Turn on EXT_DEBUG to get lots of info about extents operations.
+ * Dynamic printk for controlled extents debugging.
*/
-#define EXT_DEBUG__
-#ifdef EXT_DEBUG
-#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#ifdef CONFIG_EXT4_DEBUG
+#define ext_debug(ino, fmt, ...) \
+ pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \
+ current->comm, task_pid_nr(current), \
+ ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \
+ __func__, ##__VA_ARGS__)
#else
-#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
+#define ASSERT(assert) \
+do { \
+ if (unlikely(!(assert))) { \
+ printk(KERN_EMERG \
+ "Assertion failure in %s() at %s:%d: '%s'\n", \
+ __func__, __FILE__, __LINE__, #assert); \
+ BUG(); \
+ } \
+} while (0)
+
/* data type for block offset of block group */
typedef int ext4_grpblk_t;
@@ -142,7 +161,12 @@ enum SHIFT_DIRECTION {
#define EXT4_MB_USE_ROOT_BLOCKS 0x1000
/* Use blocks from reserved pool */
#define EXT4_MB_USE_RESERVED 0x2000
-
+/* Do strict check for free blocks while retrying block allocation */
+#define EXT4_MB_STRICT_CHECK 0x4000
+/* Large fragment size list lookup succeeded at least once for cr = 0 */
+#define EXT4_MB_CR0_OPTIMIZED 0x8000
+/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
+#define EXT4_MB_CR1_OPTIMIZED 0x00010000
struct ext4_allocation_request {
/* target inode for block we're allocating */
struct inode *inode;
@@ -171,10 +195,10 @@ struct ext4_allocation_request {
* well as to store the information returned by ext4_map_blocks(). It
* takes less room on the stack than a struct buffer_head.
*/
-#define EXT4_MAP_NEW (1 << BH_New)
-#define EXT4_MAP_MAPPED (1 << BH_Mapped)
-#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
-#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
+#define EXT4_MAP_NEW BIT(BH_New)
+#define EXT4_MAP_MAPPED BIT(BH_Mapped)
+#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
+#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
@@ -216,7 +240,7 @@ typedef struct ext4_io_end {
struct bio *bio; /* Linked list of completed
* bios covering the extent */
unsigned int flag; /* unwritten or not */
- atomic_t count; /* reference counter */
+ refcount_t count; /* reference counter */
struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
@@ -414,29 +438,51 @@ struct flex_groups {
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
-#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */
+
+#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */
+
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
-#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */
+#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */
-
-/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
-#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
+/* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \
+ EXT4_UNRM_FL | \
+ EXT4_COMPR_FL | \
+ EXT4_SYNC_FL | \
EXT4_IMMUTABLE_FL | \
EXT4_APPEND_FL | \
EXT4_NODUMP_FL | \
EXT4_NOATIME_FL | \
- EXT4_PROJINHERIT_FL)
+ EXT4_JOURNAL_DATA_FL | \
+ EXT4_NOTAIL_FL | \
+ EXT4_DIRSYNC_FL | \
+ EXT4_TOPDIR_FL | \
+ EXT4_EXTENTS_FL | \
+ 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \
+ EXT4_DAX_FL | \
+ EXT4_PROJINHERIT_FL | \
+ EXT4_CASEFOLD_FL)
+
+/* User visible flags */
+#define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \
+ EXT4_DIRTY_FL | \
+ EXT4_COMPRBLK_FL | \
+ EXT4_NOCOMPR_FL | \
+ EXT4_ENCRYPT_FL | \
+ EXT4_INDEX_FL | \
+ EXT4_VERITY_FL | \
+ EXT4_INLINE_DATA_FL)
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
- EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL)
+ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
+ EXT4_DAX_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
@@ -448,6 +494,10 @@ struct flex_groups {
/* The only flags that should be swapped */
#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+/* Flags which are mutually exclusive to DAX */
+#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
+ EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL)
+
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
@@ -487,9 +537,11 @@ enum {
EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
EXT4_INODE_VERITY = 20, /* Verity protected inode */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
- EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+/* 22 was formerly EXT4_INODE_EOFBLOCKS */
+ EXT4_INODE_DAX = 25, /* Inode is DAX */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */
+ EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */
EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
};
@@ -533,9 +585,9 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(EXTENTS);
CHECK_FLAG_VALUE(VERITY);
CHECK_FLAG_VALUE(EA_INODE);
- CHECK_FLAG_VALUE(EOFBLOCKS);
CHECK_FLAG_VALUE(INLINE_DATA);
CHECK_FLAG_VALUE(PROJINHERIT);
+ CHECK_FLAG_VALUE(CASEFOLD);
CHECK_FLAG_VALUE(RESERVED);
}
@@ -610,8 +662,6 @@ enum {
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
/* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
- /* Request will not result in inode size update (user for fallocate) */
-#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
/* Convert written extents to unwritten */
#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100
/* Write zeros to newly created written extents */
@@ -621,6 +671,8 @@ enum {
/* Caller will submit data before dropping transaction handle. This
* allows jbd2 to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
+ /* Caller is in the atomic contex, find extent if it has been cached */
+#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800
/*
* The bit position of these flags must not overlap with any of the
@@ -633,6 +685,7 @@ enum {
*/
#define EXT4_EX_NOCACHE 0x40000000
#define EXT4_EX_FORCE_CACHE 0x20000000
+#define EXT4_EX_NOFAIL 0x10000000
/*
* Flags used by ext4_free_blocks
@@ -648,8 +701,6 @@ enum {
/*
* ioctl commands
*/
-#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
@@ -666,16 +717,13 @@ enum {
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
-#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
-#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
-#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
/* ioctl codes 19--39 are reserved for fscrypt */
#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
-
-#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
-#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
+#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32)
+#define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid)
+#define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid)
#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
@@ -697,12 +745,27 @@ enum {
#define EXT4_STATE_FLAG_NEWENTRY 0x00000004
#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008
+/* flags for ioctl EXT4_IOC_CHECKPOINT */
+#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1
+#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2
+#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4
+#define EXT4_IOC_CHECKPOINT_FLAG_VALID (EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \
+ EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \
+ EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+
+/*
+ * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID
+ */
+struct fsuuid {
+ __u32 fsu_len;
+ __u32 fsu_flags;
+ __u8 fsu_uuid[];
+};
+
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
*/
-#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS
#define EXT4_IOC32_GETVERSION _IOR('f', 3, int)
#define EXT4_IOC32_SETVERSION _IOW('f', 4, int)
#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
@@ -723,7 +786,7 @@ enum {
#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
/* Max logical block we can support */
-#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF
+#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE
/*
* Structure of an inode on the disk
@@ -927,6 +990,7 @@ do { \
#endif /* defined(__KERNEL__) || defined(__linux__) */
#include "extents_status.h"
+#include "fast_commit.h"
/*
* Lock subclasses for i_data_sem in the ext4_inode_info structure.
@@ -975,14 +1039,45 @@ struct ext4_inode_info {
/*
* Extended attributes can be read independently of the main file
- * data. Taking i_mutex even when reading would cause contention
+ * data. Taking i_rwsem even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
*/
struct rw_semaphore xattr_sem;
- struct list_head i_orphan; /* unlinked but open inodes */
+ /*
+ * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise
+ * i_orphan is used.
+ */
+ union {
+ struct list_head i_orphan; /* unlinked but open inodes */
+ unsigned int i_orphan_idx; /* Index in orphan file */
+ };
+
+ /* Fast commit related info */
+
+ /* For tracking dentry create updates */
+ struct list_head i_fc_dilist;
+ struct list_head i_fc_list; /*
+ * inodes that need fast commit
+ * protected by sbi->s_fc_lock.
+ */
+
+ /* Start of lblk range that needs to be committed in this fast commit */
+ ext4_lblk_t i_fc_lblk_start;
+
+ /* End of lblk range that needs to be committed in this fast commit */
+ ext4_lblk_t i_fc_lblk_len;
+
+ /* Number of ongoing updates on this inode */
+ atomic_t i_fc_updates;
+
+ /* Fast commit wait queue for this inode */
+ wait_queue_head_t i_fc_wait;
+
+ /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
+ struct mutex i_fc_lock;
/*
* i_disksize keeps track of what the inode size is ON DISK, not
@@ -1012,15 +1107,6 @@ struct ext4_inode_info {
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
- /*
- * i_mmap_sem is for serializing page faults with truncate / punch hole
- * operations. We have to make sure that new page cannot be faulted in
- * a section of the inode that is being punched. We cannot easily use
- * i_data_sem for this since we need protection for the whole punch
- * operation and i_data_sem ranks below transaction start so we have
- * to occasionally drop it.
- */
- struct rw_semaphore i_mmap_sem;
struct inode vfs_inode;
struct jbd2_inode *jinode;
@@ -1033,6 +1119,7 @@ struct ext4_inode_info {
struct timespec64 i_crtime;
/* mballoc */
+ atomic_t i_prealloc_active;
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
@@ -1103,6 +1190,7 @@ struct ext4_inode_info {
#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
#define EXT4_ERROR_FS 0x0002 /* Errors detected */
#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */
+#define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */
/*
* Misc. filesystem flags
@@ -1124,9 +1212,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#ifdef CONFIG_FS_DAX
-#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
+#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */
#else
-#define EXT4_MOUNT_DAX 0
+#define EXT4_MOUNT_DAX_ALWAYS 0
#endif
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
@@ -1151,6 +1239,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */
+#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -1169,10 +1258,16 @@ struct ext4_inode_info {
blocks */
#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
file systems */
-
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly
specified journal checksum */
+#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
+#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
+#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
+#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group
+ * scanning in mballoc
+ */
+
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
@@ -1197,7 +1292,7 @@ struct ext4_inode_info {
#define ext4_find_next_zero_bit find_next_zero_bit_le
#define ext4_find_next_bit find_next_bit_le
-extern void ext4_set_bits(void *bm, int cur, int len);
+extern void mb_set_bits(void *bm, int cur, int len);
/*
* Maximal mount counts between two filesystem checks
@@ -1216,6 +1311,8 @@ extern void ext4_set_bits(void *bm, int cur, int len);
/* Metadata checksum algorithm codes */
#define EXT4_CRC32C_CHKSUM 1
+#define EXT4_LABEL_MAX 16
+
/*
* Structure of the super block
*/
@@ -1265,7 +1362,7 @@ struct ext4_super_block {
/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
__le32 s_feature_ro_compat; /* readonly-compatible feature set */
/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
-/*78*/ char s_volume_name[16]; /* volume name */
+/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */
/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */
/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
/*
@@ -1345,7 +1442,8 @@ struct ext4_super_block {
__u8 s_last_error_errcode;
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
- __le32 s_reserved[95]; /* Padding to the end of the block */
+ __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */
+ __le32 s_reserved[94]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1353,32 +1451,58 @@ struct ext4_super_block {
#ifdef __KERNEL__
-/*
- * run-time mount flags
- */
-#define EXT4_MF_MNTDIR_SAMPLED 0x0001
-#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
-#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004
-
-#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
- EXT4_MF_TEST_DUMMY_ENCRYPTION))
-#else
-#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
-#endif
-
/* Number of quota types we support */
#define EXT4_MAXQUOTAS 3
#define EXT4_ENC_UTF8_12_1 1
+/* Types of ext4 journal triggers */
+enum ext4_journal_trigger_type {
+ EXT4_JTR_ORPHAN_FILE,
+ EXT4_JTR_NONE /* This must be the last entry for indexing to work! */
+};
+
+#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE
+
+struct ext4_journal_trigger {
+ struct jbd2_buffer_trigger_type tr_triggers;
+ struct super_block *sb;
+};
+
+static inline struct ext4_journal_trigger *EXT4_TRIGGER(
+ struct jbd2_buffer_trigger_type *trigger)
+{
+ return container_of(trigger, struct ext4_journal_trigger, tr_triggers);
+}
+
+#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04
+
+/* Structure at the tail of orphan block */
+struct ext4_orphan_block_tail {
+ __le32 ob_magic;
+ __le32 ob_checksum;
+};
+
+static inline int ext4_inodes_per_orphan_block(struct super_block *sb)
+{
+ return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) /
+ sizeof(u32);
+}
+
+struct ext4_orphan_block {
+ atomic_t ob_free_entries; /* Number of free orphan entries in block */
+ struct buffer_head *ob_bh; /* Buffer for orphan block */
+};
+
/*
- * Flags for ext4_sb_info.s_encoding_flags.
+ * Info about orphan file.
*/
-#define EXT4_ENC_STRICT_MODE_FL (1 << 0)
-
-#define ext4_has_strict_mode(sbi) \
- (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL)
+struct ext4_orphan_info {
+ int of_blocks; /* Number of orphan blocks in a file */
+ __u32 of_csum_seed; /* Checksum seed for orphan file */
+ struct ext4_orphan_block *of_binfo; /* Array with info about orphan
+ * file blocks */
+};
/*
* fourth extended-fs super-block data in memory
@@ -1403,7 +1527,7 @@ struct ext4_sb_info {
struct buffer_head * __rcu *s_group_desc;
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
- unsigned int s_mount_flags;
+ unsigned long s_mount_flags;
unsigned int s_def_mount_opt;
ext4_fsblk_t s_sb_block;
atomic64_t s_resv_clusters;
@@ -1419,37 +1543,37 @@ struct ext4_sb_info {
unsigned int s_inode_goal;
u32 s_hash_seed[4];
int s_def_hash_version;
- int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
+ int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
struct percpu_counter s_freeclusters_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
struct percpu_counter s_dirtyclusters_counter;
+ struct percpu_counter s_sra_exceeded_retry_limit;
struct blockgroup_lock *s_blockgroup_lock;
struct proc_dir_entry *s_proc;
struct kobject s_kobj;
struct completion s_kobj_unregister;
struct super_block *s_sb;
-#ifdef CONFIG_UNICODE
- struct unicode_map *s_encoding;
- __u16 s_encoding_flags;
-#endif
+ struct buffer_head *s_mmp_bh;
/* Journaling */
struct journal_s *s_journal;
- struct list_head s_orphan;
- struct mutex s_orphan_lock;
unsigned long s_ext4_flags; /* Ext4 superblock flags */
+ struct mutex s_orphan_lock; /* Protects on disk list changes */
+ struct list_head s_orphan; /* List of orphaned inodes in on disk
+ list */
+ struct ext4_orphan_info s_orphan_info;
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
- struct block_device *journal_bdev;
+ struct block_device *s_journal_bdev;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
- struct ext4_system_blocks __rcu *system_blks;
+ struct ext4_system_blocks __rcu *s_system_blks;
#ifdef EXTENTS_STATS
/* ext4 extents stats */
@@ -1471,31 +1595,47 @@ struct ext4_sb_info {
unsigned int s_mb_free_pending;
struct list_head s_freed_data_list; /* List of blocks to be freed
after commit completed */
+ struct list_head s_discard_list;
+ struct work_struct s_discard_work;
+ atomic_t s_retry_alloc_pending;
+ struct list_head *s_mb_avg_fragment_size;
+ rwlock_t *s_mb_avg_fragment_size_locks;
+ struct list_head *s_mb_largest_free_orders;
+ rwlock_t *s_mb_largest_free_orders_locks;
/* tunables */
unsigned long s_stripe;
+ unsigned int s_mb_max_linear_groups;
unsigned int s_mb_stream_request;
unsigned int s_mb_max_to_scan;
unsigned int s_mb_min_to_scan;
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
+ unsigned int s_mb_max_inode_prealloc;
unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
+ unsigned int s_mb_prefetch;
+ unsigned int s_mb_prefetch_limit;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
atomic_t s_bal_success; /* we found long enough chunks */
atomic_t s_bal_allocated; /* in blocks */
atomic_t s_bal_ex_scanned; /* total extents scanned */
+ atomic_t s_bal_groups_scanned; /* number of groups scanned */
atomic_t s_bal_goals; /* goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
- spinlock_t s_bal_lock;
- unsigned long s_mb_buddies_generated;
- unsigned long long s_mb_generation_time;
+ atomic_t s_bal_cr0_bad_suggestions;
+ atomic_t s_bal_cr1_bad_suggestions;
+ atomic64_t s_bal_cX_groups_considered[4];
+ atomic64_t s_bal_cX_hits[4];
+ atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */
+ atomic_t s_mb_buddies_generated; /* number of buddies generated */
+ atomic64_t s_mb_generation_time;
atomic_t s_mb_lost_chunks;
atomic_t s_mb_preallocated;
atomic_t s_mb_discarded;
@@ -1530,7 +1670,7 @@ struct ext4_sb_info {
struct task_struct *s_mmp_tsk;
/* record the last minlen when FITRIM is called. */
- atomic_t s_last_trim_minblks;
+ unsigned long s_last_trim_minblks;
/* Reference to checksum algorithm driver via cryptoapi */
struct crypto_shash *s_chksum_driver;
@@ -1547,10 +1687,18 @@ struct ext4_sb_info {
struct mb_cache *s_ea_inode_cache;
spinlock_t s_es_lock ____cacheline_aligned_in_smp;
+ /* Journal triggers for checksum computation */
+ struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT];
+
/* Ratelimit ext4 messages. */
struct ratelimit_state s_err_ratelimit_state;
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
+ atomic_t s_warning_count;
+ atomic_t s_msg_count;
+
+ /* Encryption policy for '-o test_dummy_encryption' */
+ struct fscrypt_dummy_policy s_dummy_enc_policy;
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
@@ -1558,9 +1706,62 @@ struct ext4_sb_info {
*/
struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
+ u64 s_dax_part_off;
#ifdef CONFIG_EXT4_DEBUG
unsigned long s_simulate_fail;
#endif
+ /* Record the errseq of the backing block device */
+ errseq_t s_bdev_wb_err;
+ spinlock_t s_bdev_wb_lock;
+
+ /* Information about errors that happened during this mount */
+ spinlock_t s_error_lock;
+ int s_add_error_count;
+ int s_first_error_code;
+ __u32 s_first_error_line;
+ __u32 s_first_error_ino;
+ __u64 s_first_error_block;
+ const char *s_first_error_func;
+ time64_t s_first_error_time;
+ int s_last_error_code;
+ __u32 s_last_error_line;
+ __u32 s_last_error_ino;
+ __u64 s_last_error_block;
+ const char *s_last_error_func;
+ time64_t s_last_error_time;
+ /*
+ * If we are in a context where we cannot update error information in
+ * the on-disk superblock, we queue this work to do it.
+ */
+ struct work_struct s_error_work;
+
+ /* Ext4 fast commit sub transaction ID */
+ atomic_t s_fc_subtid;
+
+ /*
+ * After commit starts, the main queue gets locked, and the further
+ * updates get added in the staging queue.
+ */
+#define FC_Q_MAIN 0
+#define FC_Q_STAGING 1
+ struct list_head s_fc_q[2]; /* Inodes staged for fast commit
+ * that have data changes in them.
+ */
+ struct list_head s_fc_dentry_q[2]; /* directory entry updates */
+ unsigned int s_fc_bytes;
+ /*
+ * Main fast commit lock. This lock protects accesses to the
+ * following fields:
+ * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
+ */
+ spinlock_t s_fc_lock;
+ struct buffer_head *s_fc_bh;
+ struct ext4_fc_stats s_fc_stats;
+ tid_t s_fc_ineligible_tid;
+#ifdef CONFIG_EXT4_DEBUG
+ int s_fc_debug_max_replay;
+#endif
+ struct ext4_fc_replay_state s_fc_replay_state;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1597,6 +1798,31 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
})
/*
+ * run-time mount flags
+ */
+enum {
+ EXT4_MF_MNTDIR_SAMPLED,
+ EXT4_MF_FS_ABORTED, /* Fatal error detected */
+ EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */
+};
+
+static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
+{
+ set_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+static inline void ext4_clear_mount_flag(struct super_block *sb, int bit)
+{
+ clear_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+static inline int ext4_test_mount_flag(struct super_block *sb, int bit)
+{
+ return test_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+
+/*
* Simulate_fail codes
*/
#define EXT4_SIM_BBITMAP_EIO 1
@@ -1671,6 +1897,8 @@ enum {
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
+ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
+ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1744,7 +1972,6 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */
#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
-#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV
#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV
#define EXT4_GOOD_OLD_INODE_SIZE 128
@@ -1764,7 +1991,16 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
+/*
+ * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes
+ * incompatible only if fast commit blocks are present in the FS. Since we
+ * clear the journal (and thus the fast commit blocks), we don't mark FS as
+ * incompatible. We also have a JBD2 incompat feature, which gets set when
+ * there are fast commit blocks present in the journal.
+ */
+#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800
+#define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
@@ -1785,6 +2021,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000
+#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be
+ non-empty */
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1866,7 +2104,9 @@ EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR)
EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE)
EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
+EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT)
EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES)
+EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE)
@@ -1881,6 +2121,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY)
+EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT)
EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
@@ -1914,7 +2155,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
-#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \
+ EXT4_FEATURE_COMPAT_ORPHAN_FILE)
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_RECOVER| \
EXT4_FEATURE_INCOMPAT_META_BG| \
@@ -1939,7 +2181,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
EXT4_FEATURE_RO_COMPAT_QUOTA |\
EXT4_FEATURE_RO_COMPAT_PROJECT |\
- EXT4_FEATURE_RO_COMPAT_VERITY)
+ EXT4_FEATURE_RO_COMPAT_VERITY |\
+ EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT)
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -1975,18 +2218,20 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
return (EXT4_SB(sb)->s_es->s_feature_incompat != 0);
}
+extern int ext4_feature_set_ok(struct super_block *sb, int readonly);
+
/*
* Superblock flags
*/
#define EXT4_FLAGS_RESIZING 0
#define EXT4_FLAGS_SHUTDOWN 1
+#define EXT4_FLAGS_BDEV_IS_DAX 2
static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
{
return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
}
-
/*
* Default values for user and/or group using reserved blocks
*/
@@ -2033,6 +2278,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
+/*
+ * Base length of the ext4 directory entry excluding the name length
+ */
+#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)
struct ext4_dir_entry {
__le32 inode; /* Inode number */
@@ -2041,6 +2290,17 @@ struct ext4_dir_entry {
char name[EXT4_NAME_LEN]; /* File name */
};
+
+/*
+ * Encrypted Casefolded entries require saving the hash on disk. This structure
+ * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned
+ * boundary.
+ */
+struct ext4_dir_entry_hash {
+ __le32 hash;
+ __le32 minor_hash;
+};
+
/*
* The new version of the directory entry. Since EXT4 structures are
* stored in intel byte order, and the name_len field could never be
@@ -2051,11 +2311,27 @@ struct ext4_dir_entry_2 {
__le32 inode; /* Inode number */
__le16 rec_len; /* Directory entry length */
__u8 name_len; /* Name length */
- __u8 file_type;
+ __u8 file_type; /* See file type macros EXT4_FT_* below */
char name[EXT4_NAME_LEN]; /* File name */
};
/*
+ * Access the hashes at the end of ext4_dir_entry_2
+ */
+#define EXT4_DIRENT_HASHES(entry) \
+ ((struct ext4_dir_entry_hash *) \
+ (((void *)(entry)) + \
+ ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)))
+#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash)
+#define EXT4_DIRENT_MINOR_HASH(entry) \
+ le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash)
+
+static inline bool ext4_hash_in_dirent(const struct inode *inode)
+{
+ return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode);
+}
+
+/*
* This is a bogus directory entry at the end of each leaf block that
* records checksums.
*/
@@ -2096,11 +2372,25 @@ struct ext4_dir_entry_tail {
*/
#define EXT4_DIR_PAD 4
#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1)
-#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \
- ~EXT4_DIR_ROUND)
#define EXT4_MAX_REC_LEN ((1<<16)-1)
/*
+ * The rec_len is dependent on the type of directory. Directories that are
+ * casefolded and encrypted need to store the hash as well, so we add room for
+ * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should
+ * pass NULL for dir, as those entries do not use the extra fields.
+ */
+static inline unsigned int ext4_dir_rec_len(__u8 name_len,
+ const struct inode *dir)
+{
+ int rec_len = (name_len + 8 + EXT4_DIR_ROUND);
+
+ if (dir && ext4_hash_in_dirent(dir))
+ rec_len += sizeof(struct ext4_dir_entry_hash);
+ return (rec_len & ~EXT4_DIR_ROUND);
+}
+
+/*
* If we ever get support for fs block sizes > page_size, we'll need
* to remove the #if statements in the next two functions...
*/
@@ -2120,8 +2410,7 @@ ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
- if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
- BUG();
+ BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3));
#if (PAGE_SIZE >= 65536)
if (len < 65536)
return cpu_to_le16(len);
@@ -2156,6 +2445,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_LEGACY_UNSIGNED 3
#define DX_HASH_HALF_MD4_UNSIGNED 4
#define DX_HASH_TEA_UNSIGNED 5
+#define DX_HASH_SIPHASH 6
static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
const void *address, unsigned int length)
@@ -2204,12 +2494,13 @@ struct ext4_filename {
#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_str crypto_buf;
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
struct fscrypt_str cf_name;
#endif
};
#define fname_name(p) ((p)->disk_name.name)
+#define fname_usr_name(p) ((p)->usr_fname->name)
#define fname_len(p) ((p)->disk_name.len)
/*
@@ -2288,9 +2579,15 @@ struct ext4_lazy_init {
struct mutex li_list_mtx;
};
+enum ext4_li_mode {
+ EXT4_LI_MODE_PREFETCH_BBITMAP,
+ EXT4_LI_MODE_ITABLE,
+};
+
struct ext4_li_request {
struct super_block *lr_super;
- struct ext4_sb_info *lr_sbi;
+ enum ext4_li_mode lr_mode;
+ ext4_group_t lr_first_not_zeroed;
ext4_group_t lr_next_group;
struct list_head lr_request;
unsigned long lr_next_sched;
@@ -2421,7 +2718,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
- ext4_group_t block_group);
+ ext4_group_t block_group,
+ bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh);
@@ -2432,95 +2730,42 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-#ifdef CONFIG_UNICODE
-extern void ext4_fname_setup_ci_filename(struct inode *dir,
+#if IS_ENABLED(CONFIG_UNICODE)
+extern int ext4_fname_setup_ci_filename(struct inode *dir,
const struct qstr *iname,
- struct fscrypt_str *fname);
+ struct ext4_filename *fname);
#endif
+/* ext4 encryption related stuff goes here crypto.c */
#ifdef CONFIG_FS_ENCRYPTION
-static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
- const struct fscrypt_name *src)
-{
- memset(dst, 0, sizeof(*dst));
-
- dst->usr_fname = src->usr_fname;
- dst->disk_name = src->disk_name;
- dst->hinfo.hash = src->hash;
- dst->hinfo.minor_hash = src->minor_hash;
- dst->crypto_buf = src->crypto_buf;
-}
-
-static inline int ext4_fname_setup_filename(struct inode *dir,
- const struct qstr *iname,
- int lookup,
- struct ext4_filename *fname)
-{
- struct fscrypt_name name;
- int err;
-
- err = fscrypt_setup_filename(dir, iname, lookup, &name);
- if (err)
- return err;
+extern const struct fscrypt_operations ext4_cryptops;
- ext4_fname_from_fscrypt_name(fname, &name);
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname);
-#ifdef CONFIG_UNICODE
- ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
-#endif
- return 0;
-}
+int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
+ struct ext4_filename *fname);
-static inline int ext4_fname_prepare_lookup(struct inode *dir,
- struct dentry *dentry,
- struct ext4_filename *fname)
-{
- struct fscrypt_name name;
- int err;
+void ext4_fname_free_filename(struct ext4_filename *fname);
- err = fscrypt_prepare_lookup(dir, dentry, &name);
- if (err)
- return err;
+int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg);
- ext4_fname_from_fscrypt_name(fname, &name);
-
-#ifdef CONFIG_UNICODE
- ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);
-#endif
- return 0;
-}
-
-static inline void ext4_fname_free_filename(struct ext4_filename *fname)
-{
- struct fscrypt_name name;
-
- name.crypto_buf = fname->crypto_buf;
- fscrypt_free_filename(&name);
-
- fname->crypto_buf.name = NULL;
- fname->usr_fname = NULL;
- fname->disk_name.name = NULL;
-
-#ifdef CONFIG_UNICODE
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
-}
#else /* !CONFIG_FS_ENCRYPTION */
static inline int ext4_fname_setup_filename(struct inode *dir,
const struct qstr *iname,
int lookup,
struct ext4_filename *fname)
{
+ int err = 0;
fname->usr_fname = iname;
fname->disk_name.name = (unsigned char *) iname->name;
fname->disk_name.len = iname->len;
-#ifdef CONFIG_UNICODE
- ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
+#if IS_ENABLED(CONFIG_UNICODE)
+ err = ext4_fname_setup_ci_filename(dir, iname, fname);
#endif
- return 0;
+ return err;
}
static inline int ext4_fname_prepare_lookup(struct inode *dir,
@@ -2532,11 +2777,17 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
kfree(fname->cf_name.name);
fname->cf_name.name = NULL;
#endif
}
+
+static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp,
+ void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
#endif /* !CONFIG_FS_ENCRYPTION */
/* dir.c */
@@ -2545,9 +2796,9 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
struct ext4_dir_entry_2 *,
struct buffer_head *, char *, int,
unsigned int);
-#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
+#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
- (de), (bh), (buf), (size), (offset)))
+ (de), (bh), (buf), (size), (offset)))
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
struct ext4_dir_entry_2 *dirent,
@@ -2558,13 +2809,14 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
-void ext4_insert_dentry(struct inode *inode,
+void ext4_insert_dentry(struct inode *dir, struct inode *inode,
struct ext4_dir_entry_2 *de,
int buf_size,
struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
- if (!ext4_has_feature_dir_index(inode->i_sb)) {
+ if (!ext4_has_feature_dir_index(inode->i_sb) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
/* ext4_iget() should have caught this... */
WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
@@ -2592,18 +2844,20 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo);
/* ialloc.c */
-extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
+extern int ext4_mark_inode_used(struct super_block *sb, int ino);
+extern struct inode *__ext4_new_inode(struct user_namespace *, handle_t *,
+ struct inode *, umode_t,
const struct qstr *qstr, __u32 goal,
uid_t *owner, __u32 i_flags,
int handle_type, unsigned int line_no,
int nblocks);
-#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \
- __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
- i_flags, 0, 0, 0)
-#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \
+ __ext4_new_inode(&init_user_ns, (handle), (dir), (mode), (qstr), \
+ (goal), (owner), i_flags, 0, 0, 0)
+#define ext4_new_inode_start_handle(mnt_userns, dir, mode, qstr, goal, owner, \
type, nblocks) \
- __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
+ __ext4_new_inode((mnt_userns), NULL, (dir), (mode), (qstr), (goal), (owner), \
0, (type), __LINE__, (nblocks))
@@ -2617,18 +2871,55 @@ extern int ext4_init_inode_table(struct super_block *sb,
ext4_group_t group, int barrier);
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
+/* fast_commit.c */
+int ext4_fc_info_show(struct seq_file *seq, void *v);
+void ext4_fc_init(struct super_block *sb, journal_t *journal);
+void ext4_fc_init_inode(struct inode *inode);
+void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end);
+void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode,
+ struct dentry *dentry);
+void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
+ struct dentry *dentry);
+void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
+void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+ struct dentry *dentry);
+void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
+void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
+void ext4_fc_start_update(struct inode *inode);
+void ext4_fc_stop_update(struct inode *inode);
+void ext4_fc_del(struct inode *inode);
+bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
+void ext4_fc_replay_cleanup(struct super_block *sb);
+int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
+int __init ext4_fc_init_dentry_cache(void);
+void ext4_fc_destroy_dentry_cache(void);
+int ext4_fc_record_regions(struct super_block *sb, int ino,
+ ext4_lblk_t lblk, ext4_fsblk_t pblk,
+ int len, int replay);
+
/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
+extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
+extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
extern int ext4_mb_reserve_blocks(struct super_block *, int);
-extern void ext4_discard_preallocations(struct inode *);
+extern void ext4_discard_preallocations(struct inode *, unsigned int);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
+extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
+ ext4_group_t group,
+ unsigned int nr, int *cnt);
+extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr);
+
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
unsigned long count, int flags);
@@ -2640,8 +2931,12 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
+extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
+ int len, int state);
/* inode.c */
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
@@ -2654,13 +2949,14 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
+ struct inode *inode,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
- int (*fn)(handle_t *handle,
+ int (*fn)(handle_t *handle, struct inode *inode,
struct buffer_head *bh));
-int do_journal_get_write_access(handle_t *handle,
+int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -2679,21 +2975,27 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
__ext4_iget((sb), (ino), (flags), __func__, __LINE__)
extern int ext4_write_inode(struct inode *, struct writeback_control *);
-extern int ext4_setattr(struct dentry *, struct iattr *);
-extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int ext4_setattr(struct user_namespace *, struct dentry *,
+ struct iattr *);
+extern u32 ext4_dio_alignment(struct inode *inode);
+extern int ext4_getattr(struct user_namespace *, const struct path *,
+ struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
-extern int ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int ext4_file_getattr(struct user_namespace *, const struct path *,
+ struct kstat *, u32, unsigned int);
extern int ext4_sync_inode(handle_t *, struct inode *);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+ struct ext4_iloc *iloc);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
-extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern void ext4_set_inode_flags(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
@@ -2701,7 +3003,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
-extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
@@ -2721,16 +3022,21 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
+int ext4_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
+int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+extern void ext4_reset_inode_seed(struct inode *inode);
+int ext4_update_overhead(struct super_block *sb, bool force);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);
/* namei.c */
+extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+ struct inode *inode);
extern int ext4_dirblock_csum_verify(struct inode *inode,
struct buffer_head *bh);
-extern int ext4_orphan_add(handle_t *, struct inode *);
-extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
extern int ext4_search_dir(struct buffer_head *bh,
@@ -2740,8 +3046,7 @@ extern int ext4_search_dir(struct buffer_head *bh,
struct ext4_filename *fname,
unsigned int offset,
struct ext4_dir_entry_2 **res_dir);
-extern int ext4_generic_delete_entry(handle_t *handle,
- struct inode *dir,
+extern int ext4_generic_delete_entry(struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh,
void *entry_buf,
@@ -2757,12 +3062,25 @@ extern int ext4_group_extend(struct super_block *sb,
struct ext4_super_block *es,
ext4_fsblk_t n_blocks_count);
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
+extern unsigned int ext4_list_backups(struct super_block *sb,
+ unsigned int *three, unsigned int *five,
+ unsigned int *seven);
/* super.c */
extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
- sector_t block, int op_flags);
+ sector_t block, blk_opf_t op_flags);
+extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
+ sector_t block);
+extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io);
+extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io);
+extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
+extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
+extern __le32 ext4_superblock_csum(struct super_block *sb,
+ struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
@@ -2771,23 +3089,19 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno,
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
ext4_group_t block_group,
unsigned int flags);
-extern void ext4_set_errno(struct super_block *sb, int err);
-extern __printf(4, 5)
-void __ext4_error(struct super_block *, const char *, unsigned int,
- const char *, ...);
-extern __printf(5, 6)
-void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
- const char *, ...);
+extern __printf(7, 8)
+void __ext4_error(struct super_block *, const char *, unsigned int, bool,
+ int, __u64, const char *, ...);
+extern __printf(6, 7)
+void __ext4_error_inode(struct inode *, const char *, unsigned int,
+ ext4_fsblk_t, int, const char *, ...);
extern __printf(5, 6)
void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
extern __printf(4, 5)
-void __ext4_abort(struct super_block *, const char *, unsigned int,
- const char *, ...);
-extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
const char *, ...);
extern __printf(4, 5)
@@ -2806,22 +3120,34 @@ void __ext4_grp_locked_error(const char *, unsigned int,
#define EXT4_ERROR_INODE(inode, fmt, a...) \
ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
-#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
- ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
+#define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \
+ __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a)
+
+#define ext4_error_inode_block(inode, block, err, fmt, a...) \
+ __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \
+ (fmt), ## a)
#define EXT4_ERROR_FILE(file, block, fmt, a...) \
ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
+#define ext4_abort(sb, err, fmt, a...) \
+ __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a)
+
#ifdef CONFIG_PRINTK
#define ext4_error_inode(inode, func, line, block, fmt, ...) \
- __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+ __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__)
+#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \
+ __ext4_error_inode((inode), (func), (line), (block), \
+ (err), (fmt), ##__VA_ARGS__)
#define ext4_error_file(file, func, line, block, fmt, ...) \
__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
#define ext4_error(sb, fmt, ...) \
- __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
-#define ext4_abort(sb, fmt, ...) \
- __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+ __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \
+ ##__VA_ARGS__)
+#define ext4_error_err(sb, err, fmt, ...) \
+ __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \
+ ##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...) \
__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning_inode(inode, fmt, ...) \
@@ -2839,7 +3165,12 @@ void __ext4_grp_locked_error(const char *, unsigned int,
#define ext4_error_inode(inode, func, line, block, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
- __ext4_error_inode(inode, "", 0, block, " "); \
+ __ext4_error_inode(inode, "", 0, block, 0, " "); \
+} while (0)
+#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_inode(inode, "", 0, block, err, " "); \
} while (0)
#define ext4_error_file(file, func, line, block, fmt, ...) \
do { \
@@ -2849,12 +3180,12 @@ do { \
#define ext4_error(sb, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
- __ext4_error(sb, "", 0, " "); \
+ __ext4_error(sb, "", 0, false, 0, 0, " "); \
} while (0)
-#define ext4_abort(sb, fmt, ...) \
+#define ext4_error_err(sb, err, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
- __ext4_abort(sb, "", 0, " "); \
+ __ext4_error(sb, "", 0, false, err, 0, " "); \
} while (0)
#define ext4_warning(sb, fmt, ...) \
do { \
@@ -2881,12 +3212,6 @@ do { \
#endif
-extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
- __u32 compat);
-extern int ext4_update_rocompat_feature(handle_t *handle,
- struct super_block *sb, __u32 rocompat);
-extern int ext4_update_incompat_feature(handle_t *handle,
- struct super_block *sb, __u32 incompat);
extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -2937,22 +3262,24 @@ static inline int ext4_has_group_desc_csum(struct super_block *sb)
return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
}
+#define ext4_read_incompat_64bit_val(es, name) \
+ (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \
+ ? (ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \
+ le32_to_cpu(es->name##_lo))
+
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_blocks_count);
}
static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_r_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_r_blocks_count);
}
static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_free_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_free_blocks_count);
}
static inline void ext4_blocks_count_set(struct ext4_super_block *es,
@@ -3046,7 +3373,7 @@ do { \
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif
-/* Update i_disksize. Requires i_mutex to avoid races with truncate */
+/* Update i_disksize. Requires i_rwsem to avoid races with truncate */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
@@ -3057,7 +3384,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
up_write(&EXT4_I(inode)->i_data_sem);
}
-/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
+/* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
int changed = 0;
@@ -3078,16 +3405,24 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
struct ext4_group_info {
unsigned long bb_state;
+#ifdef AGGRESSIVE_CHECK
+ unsigned long bb_check_counter;
+#endif
struct rb_root bb_free_root;
ext4_grpblk_t bb_first_free; /* first free block */
ext4_grpblk_t bb_free; /* total free blocks */
ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
+ int bb_avg_fragment_size_order; /* order of average
+ fragment in BG */
ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
+ ext4_group_t bb_group; /* Group number */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
+ struct list_head bb_avg_fragment_size_node;
+ struct list_head bb_largest_free_order_node;
ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
* regions, index is order.
* bb_counters[3] = 5 means
@@ -3102,6 +3437,7 @@ struct ext4_group_info {
(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
@@ -3116,6 +3452,8 @@ struct ext4_group_info {
(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
+ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
#define EXT4_MAX_CONTENTION 8
#define EXT4_CONTENTION_THRESHOLD 2
@@ -3161,6 +3499,22 @@ static inline void ext4_unlock_group(struct super_block *sb,
spin_unlock(ext4_group_lock_ptr(sb, group));
}
+#ifdef CONFIG_QUOTA
+static inline bool ext4_quota_capable(struct super_block *sb)
+{
+ return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb));
+}
+
+static inline bool ext4_is_quota_journalled(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ return (ext4_has_feature_quota(sb) ||
+ sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]);
+}
+int ext4_enable_quotas(struct super_block *sb);
+#endif
+
/*
* Block validity checking
*/
@@ -3181,10 +3535,6 @@ static inline void ext4_unlock_group(struct super_block *sb,
/* dir.c */
extern const struct file_operations ext4_dir_operations;
-#ifdef CONFIG_UNICODE
-extern const struct dentry_operations ext4_dentry_ops;
-#endif
-
/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
@@ -3201,7 +3551,6 @@ extern int ext4_readpage_inline(struct inode *inode, struct page *page);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep);
extern int ext4_write_inline_data_end(struct inode *inode,
loff_t pos, unsigned len,
@@ -3214,12 +3563,8 @@ ext4_journalled_write_inline_data(struct inode *inode,
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep,
void **fsdata);
-extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
- unsigned len, unsigned copied,
- struct page *page);
extern int ext4_try_add_inline_entry(handle_t *handle,
struct ext4_filename *fname,
struct inode *dir, struct inode *inode);
@@ -3247,9 +3592,7 @@ extern bool empty_inline_dir(struct inode *dir, int *has_inline_data);
extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
struct ext4_dir_entry_2 **parent_de,
int *retval);
-extern int ext4_inline_data_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- int *has_inline, __u64 start, __u64 len);
+extern void *ext4_read_inline_link(struct inode *inode);
struct iomap;
extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);
@@ -3276,9 +3619,10 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
-extern int ext4_ci_compare(const struct inode *parent,
- const struct qstr *fname,
- const struct qstr *entry, bool quick);
+extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
+ struct inode *inode);
+extern int __ext4_link(struct inode *dir, struct inode *inode,
+ struct dentry *dentry);
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
@@ -3299,9 +3643,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
}
/* readpages.c */
-extern int ext4_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead);
+extern int ext4_mpage_readpages(struct inode *inode,
+ struct readahead_control *rac, struct page *page);
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);
@@ -3311,6 +3654,7 @@ extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;
/* sysfs.c */
+extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi);
extern int ext4_register_sysfs(struct super_block *sb);
extern void ext4_unregister_sysfs(struct super_block *sb);
extern int __init ext4_init_sysfs(void);
@@ -3321,11 +3665,14 @@ extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
extern int __init ext4_init_system_zone(void);
extern void ext4_exit_system_zone(void);
-extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
- ext4_fsblk_t start_blk,
- unsigned int count);
+extern int ext4_inode_block_valid(struct inode *inode,
+ ext4_fsblk_t start_blk,
+ unsigned int count);
extern int ext4_check_blockref(const char *, unsigned int,
struct inode *, __le32 *, unsigned int);
+extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+ ext4_fsblk_t start_blk, unsigned int count);
+
/* extents.c */
struct ext4_ext_path;
@@ -3337,7 +3684,7 @@ struct ext4_extent;
*/
#define EXT_MAX_BLOCKS 0xffffffff
-extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
+extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
@@ -3363,7 +3710,7 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *,
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
struct ext4_ext_path **,
int flags);
-extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern void ext4_free_ext_path(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -3380,6 +3727,11 @@ extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
int check_cred, int restart_cred,
int revoke_cred);
+extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
+extern int ext4_ext_replay_set_iblocks(struct inode *inode);
+extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
+ int len, int unwritten, ext4_fsblk_t pblk);
+extern int ext4_ext_clear_bb(struct inode *inode);
/* move_extent.c */
@@ -3405,7 +3757,6 @@ extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
int len,
- struct writeback_control *wbc,
bool keep_towrite);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
@@ -3413,9 +3764,25 @@ extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+/* mmp.c */
+extern void ext4_stop_mmpd(struct ext4_sb_info *sbi);
+
/* verity.c */
extern const struct fsverity_operations ext4_verityops;
+/* orphan.c */
+extern int ext4_orphan_add(handle_t *, struct inode *);
+extern int ext4_orphan_del(handle_t *, struct inode *);
+extern void ext4_orphan_cleanup(struct super_block *sb,
+ struct ext4_super_block *es);
+extern void ext4_release_orphan_info(struct super_block *sb);
+extern int ext4_init_orphan_info(struct super_block *sb);
+extern int ext4_orphan_file_empty(struct super_block *sb);
+extern void ext4_orphan_file_block_trigger(
+ struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size);
+
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
@@ -3442,7 +3809,7 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
extern int ext4_resize_begin(struct super_block *sb);
-extern void ext4_resize_end(struct super_block *sb);
+extern int ext4_resize_end(struct super_block *sb, bool update_backups);
static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
@@ -3477,7 +3844,7 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
* have to read the block because we may read the old data
* successfully.
*/
- if (!buffer_uptodate(bh) && buffer_write_io_error(bh))
+ if (buffer_write_io_error(bh))
set_buffer_uptodate(bh);
return buffer_uptodate(bh);
}
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 1c216fcc202a..26435f3a3094 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -170,10 +170,14 @@ struct partial_cluster {
(EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
#define EXT_LAST_INDEX(__hdr__) \
(EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
-#define EXT_MAX_EXTENT(__hdr__) \
- (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+#define EXT_MAX_EXTENT(__hdr__) \
+ ((le16_to_cpu((__hdr__)->eh_max)) ? \
+ ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \
+ : NULL)
#define EXT_MAX_INDEX(__hdr__) \
- (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+ ((le16_to_cpu((__hdr__)->eh_max)) ? \
+ ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \
+ : NULL)
static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
{
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 1f53d64e42a5..8e1fb18f465e 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -80,8 +80,7 @@ static int ext4_journal_check_start(struct super_block *sb)
* take the FS itself readonly cleanly.
*/
if (journal && is_journal_aborted(journal)) {
- ext4_set_errno(sb, -journal->j_errno);
- ext4_abort(sb, "Detected aborted journal");
+ ext4_abort(sb, -journal->j_errno, "Detected aborted journal");
return -EROFS;
}
return 0;
@@ -101,7 +100,7 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
return ERR_PTR(err);
journal = EXT4_SB(sb)->s_journal;
- if (!journal)
+ if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return ext4_get_nojournal();
return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
GFP_NOFS, type, line);
@@ -163,6 +162,8 @@ int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
{
if (!ext4_handle_valid(handle))
return 0;
+ if (is_handle_aborted(handle))
+ return -EROFS;
if (jbd2_handle_buffer_credits(handle) >= check_cred &&
handle->h_revoke_credits >= revoke_cred)
return 0;
@@ -196,20 +197,54 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line,
jbd2_journal_abort_handle(handle);
}
+static void ext4_check_bdev_write_error(struct super_block *sb)
+{
+ struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ /*
+ * If the block device has write error flag, it may have failed to
+ * async write out metadata buffers in the background. In this case,
+ * we could read old data from disk and write it out again, which
+ * may lead to on-disk filesystem inconsistency.
+ */
+ if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) {
+ spin_lock(&sbi->s_bdev_wb_lock);
+ err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err);
+ spin_unlock(&sbi->s_bdev_wb_lock);
+ if (err)
+ ext4_error_err(sb, -err,
+ "Error while async write back metadata");
+ }
+}
+
int __ext4_journal_get_write_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh)
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type)
{
- int err = 0;
+ int err;
might_sleep();
+ if (bh->b_bdev->bd_super)
+ ext4_check_bdev_write_error(bh->b_bdev->bd_super);
+
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
- if (err)
+ if (err) {
ext4_journal_abort_handle(where, line, __func__, bh,
handle, err);
+ return err;
+ }
}
- return err;
+ if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
+ return 0;
+ BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
+ jbd2_journal_set_triggers(bh,
+ &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers);
+ return 0;
}
/*
@@ -220,9 +255,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
* "bh" may be NULL: a metadata block may have been freed from memory
* but there may still be a record of it in the journal, and that record
* still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling, but we still need to
- * call into ext4_journal_revoke() to put the buffer head.
*/
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
int is_metadata, struct inode *inode,
@@ -235,8 +267,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
trace_ext4_forget(inode, is_metadata, blocknr);
BUFFER_TRACE(bh, "enter");
- jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
- "data mode %x\n",
+ ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n",
bh, is_metadata, inode->i_mode,
test_opt(inode->i_sb, DATA_FLAGS));
@@ -272,26 +303,35 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
if (err) {
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
- ext4_set_errno(inode->i_sb, -err);
- __ext4_abort(inode->i_sb, where, line,
- "error %d when attempting revoke", err);
+ __ext4_error(inode->i_sb, where, line, true, -err, 0,
+ "error %d when attempting revoke", err);
}
BUFFER_TRACE(bh, "exit");
return err;
}
int __ext4_journal_get_create_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh)
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type)
{
- int err = 0;
+ int err;
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_get_create_access(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, line, __func__,
- bh, handle, err);
+ if (!ext4_handle_valid(handle))
+ return 0;
+
+ err = jbd2_journal_get_create_access(handle, bh);
+ if (err) {
+ ext4_journal_abort_handle(where, line, __func__, bh, handle,
+ err);
+ return err;
}
- return err;
+ if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
+ return 0;
+ BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
+ jbd2_journal_set_triggers(bh,
+ &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers);
+ return 0;
}
int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
@@ -304,6 +344,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
set_buffer_meta(bh);
set_buffer_prio(bh);
+ set_buffer_uptodate(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
/* Errors can only happen due to aborted journal or a nasty bug */
@@ -339,14 +380,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
if (inode && inode_needs_sync(inode)) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
- struct ext4_super_block *es;
-
- es = EXT4_SB(inode->i_sb)->s_es;
- es->s_last_error_block =
- cpu_to_le64(bh->b_blocknr);
- ext4_set_errno(inode->i_sb, EIO);
- ext4_error_inode(inode, where, line,
- bh->b_blocknr,
+ ext4_error_inode_err(inode, where, line,
+ bh->b_blocknr, EIO,
"IO error syncing itable block");
err = -EIO;
}
@@ -354,20 +389,3 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
}
return err;
}
-
-int __ext4_handle_dirty_super(const char *where, unsigned int line,
- handle_t *handle, struct super_block *sb)
-{
- struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
- int err = 0;
-
- ext4_superblock_csum_set(sb);
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_dirty_metadata(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, line, __func__,
- bh, handle, err);
- } else
- mark_buffer_dirty(bh);
- return err;
-}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 7ea4f6fa173b..db2ae4a2b38d 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -86,17 +86,14 @@
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only data block */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- ext4_has_feature_quota(sb)) ? 1 : 0)
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
-#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- ext4_has_feature_quota(sb)) ?\
+#define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
(DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_INIT_REWRITE) : 0)
-#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- ext4_has_feature_quota(sb)) ?\
+#define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
(DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_DEL_REWRITE) : 0)
#else
@@ -222,7 +219,10 @@ ext4_mark_iloc_dirty(handle_t *handle,
int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc);
-int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
+#define ext4_mark_inode_dirty(__h, __i) \
+ __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__)
+int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
+ const char *func, unsigned int line);
int ext4_expand_extra_isize(struct inode *inode,
unsigned int new_extra_isize,
@@ -231,34 +231,35 @@ int ext4_expand_extra_isize(struct inode *inode,
* Wrapper functions with which ext4 calls into JBD.
*/
int __ext4_journal_get_write_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh);
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type);
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
int __ext4_journal_get_create_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh);
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type);
int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle_t *handle, struct inode *inode,
struct buffer_head *bh);
-int __ext4_handle_dirty_super(const char *where, unsigned int line,
- handle_t *handle, struct super_block *sb);
-
-#define ext4_journal_get_write_access(handle, bh) \
- __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
+#define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \
+ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \
+ (bh), (trigger_type))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
__ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
(bh), (block_nr))
-#define ext4_journal_get_create_access(handle, bh) \
- __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
+#define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \
+ __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \
+ (bh), (trigger_type))
#define ext4_handle_dirty_metadata(handle, inode, bh) \
__ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
(bh))
-#define ext4_handle_dirty_super(handle, sb) \
- __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
int type, int blocks, int rsv_blocks,
@@ -335,12 +336,6 @@ static inline handle_t *__ext4_journal_start(struct inode *inode,
handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
int type);
-static inline void ext4_journal_free_reserved(handle_t *handle)
-{
- if (ext4_handle_valid(handle))
- jbd2_journal_free_reserved(handle);
-}
-
static inline handle_t *ext4_journal_current_handle(void)
{
return journal_current_handle();
@@ -496,7 +491,7 @@ static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
- * i_mutex for direct I/O reads. This only works for extent-based
+ * i_rwsem for direct I/O reads. This only works for extent-based
* files, and it doesn't work if data journaling is enabled, since the
* dioread_nolock code uses b_private to pass information back to the
* I/O completion handler, and this conflicts with the jbd's use of
@@ -512,6 +507,9 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
return 0;
if (ext4_should_journal_data(inode))
return 0;
+ /* temporary fix to prevent generic/422 test failures */
+ if (!test_opt(inode->i_sb, DELALLOC))
+ return 0;
return 1;
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 954013d6076b..f1956288307f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -27,7 +27,8 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
-#include <linux/backing-dev.h>
+#include <linux/iomap.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
@@ -83,13 +84,6 @@ static void ext4_extent_block_csum_set(struct inode *inode,
et->et_checksum = ext4_extent_block_csum(inode, eh);
}
-static int ext4_split_extent(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- struct ext4_map_blocks *map,
- int split_flag,
- int flags);
-
static int ext4_split_extent_at(handle_t *handle,
struct inode *inode,
struct ext4_ext_path **ppath,
@@ -97,24 +91,40 @@ static int ext4_split_extent_at(handle_t *handle,
int split_flag,
int flags);
-static int ext4_find_delayed_extent(struct inode *inode,
- struct extent_status *newes);
-
static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
/*
* Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
* moment, get_block can be called only for blocks inside i_size since
* page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
+ * i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
}
+static void ext4_ext_drop_refs(struct ext4_ext_path *path)
+{
+ int depth, i;
+
+ if (!path)
+ return;
+ depth = path->p_depth;
+ for (i = 0; i <= depth; i++, path++) {
+ brelse(path->p_bh);
+ path->p_bh = NULL;
+ }
+}
+
+void ext4_free_ext_path(struct ext4_ext_path *path)
+{
+ ext4_ext_drop_refs(path);
+ kfree(path);
+}
+
/*
* Make sure 'handle' has at least 'check_cred' credits. If not, restart
* transaction with 'restart_cred' credits. The function drops i_data_sem
@@ -145,14 +155,25 @@ int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
{
+ int err = 0;
+
if (path->p_bh) {
/* path points to block */
BUFFER_TRACE(path->p_bh, "get_write_access");
- return ext4_journal_get_write_access(handle, path->p_bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ path->p_bh, EXT4_JTR_NONE);
+ /*
+ * The extent buffer's verified bit will be set again in
+ * __ext4_ext_dirty(). We could leave an inconsistent
+ * buffer if the extents updating procudure break off du
+ * to some error happens, force to check it again.
+ */
+ if (!err)
+ clear_buffer_verified(path->p_bh);
}
/* path points to leaf/index in inode body */
/* we use in-core data, no need to protect them */
- return 0;
+ return err;
}
/*
@@ -173,6 +194,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
/* path points to block */
err = __ext4_handle_dirty_metadata(where, line, handle,
inode, path->p_bh);
+ /* Extents updating done, re-set verified flag */
+ if (!err)
+ set_buffer_verified(path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
@@ -306,11 +330,14 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
{
struct ext4_ext_path *path = *ppath;
int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
+ int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
+
+ if (nofail)
+ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
- EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
- (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
+ flags);
}
static int
@@ -346,7 +373,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
*/
if (lblock + len <= lblock)
return 0;
- return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
+ return ext4_inode_block_valid(inode, block, len);
}
static int ext4_valid_extent_idx(struct inode *inode,
@@ -354,14 +381,18 @@ static int ext4_valid_extent_idx(struct inode *inode,
{
ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
- return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
+ return ext4_inode_block_valid(inode, block, 1);
}
static int ext4_valid_extent_entries(struct inode *inode,
- struct ext4_extent_header *eh,
- int depth)
+ struct ext4_extent_header *eh,
+ ext4_lblk_t lblk, ext4_fsblk_t *pblk,
+ int depth)
{
unsigned short entries;
+ ext4_lblk_t lblock = 0;
+ ext4_lblk_t cur = 0;
+
if (eh->eh_entries == 0)
return 1;
@@ -370,34 +401,51 @@ static int ext4_valid_extent_entries(struct inode *inode,
if (depth == 0) {
/* leaf entries */
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
- ext4_fsblk_t pblock = 0;
- ext4_lblk_t lblock = 0;
- ext4_lblk_t prev = 0;
- int len = 0;
+
+ /*
+ * The logical block in the first entry should equal to
+ * the number in the index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext->ee_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
/* Check for overlapping extents */
lblock = le32_to_cpu(ext->ee_block);
- len = ext4_ext_get_actual_len(ext);
- if ((lblock <= prev) && prev) {
- pblock = ext4_ext_pblock(ext);
- es->s_last_error_block = cpu_to_le64(pblock);
+ if (lblock < cur) {
+ *pblk = ext4_ext_pblock(ext);
return 0;
}
+ cur = lblock + ext4_ext_get_actual_len(ext);
ext++;
entries--;
- prev = lblock + len - 1;
}
} else {
struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
+
+ /*
+ * The logical block in the first entry should equal to
+ * the number in the parent index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext_idx->ei_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
+
+ /* Check for overlapping index extents */
+ lblock = le32_to_cpu(ext_idx->ei_block);
+ if (lblock < cur) {
+ *pblk = ext4_idx_pblock(ext_idx);
+ return 0;
+ }
ext_idx++;
entries--;
+ cur = lblock + 1;
}
}
return 1;
@@ -405,7 +453,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
- int depth, ext4_fsblk_t pblk)
+ int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
{
const char *error_msg;
int max = 0, err = -EFSCORRUPTED;
@@ -431,7 +479,11 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid eh_entries";
goto corrupted;
}
- if (!ext4_valid_extent_entries(inode, eh, depth)) {
+ if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
+ error_msg = "eh_entries is 0 but eh_depth is > 0";
+ goto corrupted;
+ }
+ if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
}
@@ -449,19 +501,19 @@ static int __ext4_ext_check(const char *function, unsigned int line,
return 0;
corrupted:
- ext4_set_errno(inode->i_sb, -err);
- ext4_error_inode(inode, function, line, 0,
- "pblk %llu bad header/extent: %s - magic %x, "
- "entries %u, max %u(%u), depth %u(%u)",
- (unsigned long long) pblk, error_msg,
- le16_to_cpu(eh->eh_magic),
- le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
- max, le16_to_cpu(eh->eh_depth), depth);
+ ext4_error_inode_err(inode, function, line, 0, -err,
+ "pblk %llu bad header/extent: %s - magic %x, "
+ "entries %u, max %u(%u), depth %u(%u)",
+ (unsigned long long) pblk, error_msg,
+ le16_to_cpu(eh->eh_magic),
+ le16_to_cpu(eh->eh_entries),
+ le16_to_cpu(eh->eh_max),
+ max, le16_to_cpu(eh->eh_depth), depth);
return err;
}
#define ext4_ext_check(inode, eh, depth, pblk) \
- __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
int ext4_ext_check_inode(struct inode *inode)
{
@@ -494,32 +546,34 @@ static void ext4_cache_extents(struct inode *inode,
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
- struct inode *inode, ext4_fsblk_t pblk, int depth,
- int flags)
+ struct inode *inode, struct ext4_extent_idx *idx,
+ int depth, int flags)
{
struct buffer_head *bh;
int err;
+ gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS;
+ ext4_fsblk_t pblk;
- bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS);
+ if (flags & EXT4_EX_NOFAIL)
+ gfp_flags |= __GFP_NOFAIL;
+
+ pblk = ext4_idx_pblock(idx);
+ bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
if (!bh_uptodate_or_lock(bh)) {
trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
- err = bh_submit_read(bh);
+ err = ext4_read_bh(bh, 0, NULL);
if (err < 0)
goto errout;
}
if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh;
- if (!ext4_has_feature_journal(inode->i_sb) ||
- (inode->i_ino !=
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
- err = __ext4_ext_check(function, line, inode,
- ext_block_hdr(bh), depth, pblk);
- if (err)
- goto errout;
- }
+ err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
+ depth, pblk, le32_to_cpu(idx->ei_block));
+ if (err)
+ goto errout;
set_buffer_verified(bh);
/*
* If this is a leaf block, cache all of its entries
@@ -535,8 +589,8 @@ errout:
}
-#define read_extent_tree_block(inode, pblk, depth, flags) \
- __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+#define read_extent_tree_block(inode, idx, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \
(depth), (flags))
/*
@@ -556,6 +610,12 @@ int ext4_ext_precache(struct inode *inode)
down_read(&ei->i_data_sem);
depth = ext_depth(inode);
+ /* Don't cache anything if there are no external extent blocks */
+ if (!depth) {
+ up_read(&ei->i_data_sem);
+ return ret;
+ }
+
path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
GFP_NOFS);
if (path == NULL) {
@@ -563,9 +623,6 @@ int ext4_ext_precache(struct inode *inode)
return -ENOMEM;
}
- /* Don't cache anything if there are no external extent blocks */
- if (depth == 0)
- goto out;
path[0].p_hdr = ext_inode_hdr(inode);
ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
if (ret)
@@ -583,8 +640,7 @@ int ext4_ext_precache(struct inode *inode)
i--;
continue;
}
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx++),
+ bh = read_extent_tree_block(inode, path[i].p_idx++,
depth - i - 1,
EXT4_EX_FORCE_CACHE);
if (IS_ERR(bh)) {
@@ -599,8 +655,7 @@ int ext4_ext_precache(struct inode *inode)
ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
up_read(&ei->i_data_sem);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -609,22 +664,22 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
int k, l = path->p_depth;
- ext_debug("path:");
+ ext_debug(inode, "path:");
for (k = 0; k <= l; k++, path++) {
if (path->p_idx) {
- ext_debug(" %d->%llu",
+ ext_debug(inode, " %d->%llu",
le32_to_cpu(path->p_idx->ei_block),
ext4_idx_pblock(path->p_idx));
} else if (path->p_ext) {
- ext_debug(" %d:[%d]%d:%llu ",
+ ext_debug(inode, " %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext),
ext4_ext_pblock(path->p_ext));
} else
- ext_debug(" []");
+ ext_debug(inode, " []");
}
- ext_debug("\n");
+ ext_debug(inode, "\n");
}
static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
@@ -640,14 +695,14 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
eh = path[depth].p_hdr;
ex = EXT_FIRST_EXTENT(eh);
- ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+ ext_debug(inode, "Displaying leaf extents\n");
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
- ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+ ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
}
- ext_debug("\n");
+ ext_debug(inode, "\n");
}
static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
@@ -660,10 +715,9 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
struct ext4_extent_idx *idx;
idx = path[level].p_idx;
while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
- ext_debug("%d: move %d:%llu in new index %llu\n", level,
- le32_to_cpu(idx->ei_block),
- ext4_idx_pblock(idx),
- newblock);
+ ext_debug(inode, "%d: move %d:%llu in new index %llu\n",
+ level, le32_to_cpu(idx->ei_block),
+ ext4_idx_pblock(idx), newblock);
idx++;
}
@@ -672,7 +726,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
ex = path[depth].p_ext;
while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
- ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+ ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n",
le32_to_cpu(ex->ee_block),
ext4_ext_pblock(ex),
ext4_ext_is_unwritten(ex),
@@ -688,21 +742,6 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
#define ext4_ext_show_move(inode, path, newblock, level)
#endif
-void ext4_ext_drop_refs(struct ext4_ext_path *path)
-{
- int depth, i;
-
- if (!path)
- return;
- depth = path->p_depth;
- for (i = 0; i <= depth; i++, path++) {
- if (path->p_bh) {
- brelse(path->p_bh);
- path->p_bh = NULL;
- }
- }
-}
-
/*
* ext4_ext_binsearch_idx:
* binary search for the closest index of the given block
@@ -716,23 +755,24 @@ ext4_ext_binsearch_idx(struct inode *inode,
struct ext4_extent_idx *r, *l, *m;
- ext_debug("binsearch for %u(idx): ", block);
+ ext_debug(inode, "binsearch for %u(idx): ", block);
l = EXT_FIRST_INDEX(eh) + 1;
r = EXT_LAST_INDEX(eh);
while (l <= r) {
m = l + (r - l) / 2;
+ ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
+ le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
+ r, le32_to_cpu(r->ei_block));
+
if (block < le32_to_cpu(m->ei_block))
r = m - 1;
else
l = m + 1;
- ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
- m, le32_to_cpu(m->ei_block),
- r, le32_to_cpu(r->ei_block));
}
path->p_idx = l - 1;
- ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
+ ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
ext4_idx_pblock(path->p_idx));
#ifdef CHECK_BINSEARCH
@@ -783,24 +823,25 @@ ext4_ext_binsearch(struct inode *inode,
return;
}
- ext_debug("binsearch for %u: ", block);
+ ext_debug(inode, "binsearch for %u: ", block);
l = EXT_FIRST_EXTENT(eh) + 1;
r = EXT_LAST_EXTENT(eh);
while (l <= r) {
m = l + (r - l) / 2;
+ ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
+ le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
+ r, le32_to_cpu(r->ee_block));
+
if (block < le32_to_cpu(m->ee_block))
r = m - 1;
else
l = m + 1;
- ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
- m, le32_to_cpu(m->ee_block),
- r, le32_to_cpu(r->ee_block));
}
path->p_ext = l - 1;
- ext_debug(" -> %d:%llu:[%d]%d ",
+ ext_debug(inode, " -> %d:%llu:[%d]%d ",
le32_to_cpu(path->p_ext->ee_block),
ext4_ext_pblock(path->p_ext),
ext4_ext_is_unwritten(path->p_ext),
@@ -825,7 +866,7 @@ ext4_ext_binsearch(struct inode *inode,
}
-int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
+void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
struct ext4_extent_header *eh;
@@ -834,8 +875,8 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_entries = 0;
eh->eh_magic = EXT4_EXT_MAGIC;
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
+ eh->eh_generation = 0;
ext4_mark_inode_dirty(handle, inode);
- return 0;
}
struct ext4_ext_path *
@@ -847,6 +888,10 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
short int depth, i, ppos = 0;
int ret;
+ gfp_t gfp_flags = GFP_NOFS;
+
+ if (flags & EXT4_EX_NOFAIL)
+ gfp_flags |= __GFP_NOFAIL;
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
@@ -867,7 +912,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
if (!path) {
/* account possible depth increase */
path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
- GFP_NOFS);
+ gfp_flags);
if (unlikely(!path))
return ERR_PTR(-ENOMEM);
path[0].p_maxdepth = depth + 1;
@@ -880,7 +925,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
ext4_cache_extents(inode, eh);
/* walk through the tree */
while (i) {
- ext_debug("depth %d: num %d, max %d\n",
+ ext_debug(inode, "depth %d: num %d, max %d\n",
ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
ext4_ext_binsearch_idx(inode, path + ppos, block);
@@ -888,8 +933,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
- flags);
+ bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
goto err;
@@ -916,8 +960,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
return path;
err:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (orig_path)
*orig_path = NULL;
return ERR_PTR(ret);
@@ -957,18 +1000,20 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
/* insert after */
- ext_debug("insert new index %d after: %llu\n", logical, ptr);
+ ext_debug(inode, "insert new index %d after: %llu\n",
+ logical, ptr);
ix = curp->p_idx + 1;
} else {
/* insert before */
- ext_debug("insert new index %d before: %llu\n", logical, ptr);
+ ext_debug(inode, "insert new index %d before: %llu\n",
+ logical, ptr);
ix = curp->p_idx;
}
len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
BUG_ON(len < 0);
if (len > 0) {
- ext_debug("insert new index %d: "
+ ext_debug(inode, "insert new index %d: "
"move %d indices from 0x%p to 0x%p\n",
logical, len, ix, ix + 1);
memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
@@ -1017,9 +1062,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock, oldblock;
__le32 border;
ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
+ gfp_t gfp_flags = GFP_NOFS;
int err = 0;
size_t ext_size = 0;
+ if (flags & EXT4_EX_NOFAIL)
+ gfp_flags |= __GFP_NOFAIL;
+
/* make decision: where to split? */
/* FIXME: now decision is simplest: at current extent */
@@ -1031,12 +1080,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
}
if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
border = path[depth].p_ext[1].ee_block;
- ext_debug("leaf will be split."
+ ext_debug(inode, "leaf will be split."
" next leaf starts at %d\n",
le32_to_cpu(border));
} else {
border = newext->ee_block;
- ext_debug("leaf will be added."
+ ext_debug(inode, "leaf will be added."
" next leaf starts at %d\n",
le32_to_cpu(border));
}
@@ -1053,12 +1102,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
* We need this to handle errors and free blocks
* upon them.
*/
- ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), GFP_NOFS);
+ ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
if (!ablocks)
return -ENOMEM;
/* allocate all needed blocks */
- ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
+ ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
newblock = ext4_ext_new_meta_block(handle, inode, path,
newext, &err, flags);
@@ -1081,7 +1130,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
}
lock_buffer(bh);
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto cleanup;
@@ -1090,6 +1140,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
+ neh->eh_generation = 0;
/* move remainder of path[depth] to the new leaf */
if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -1144,7 +1195,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
goto cleanup;
}
if (k)
- ext_debug("create %d intermediate indices\n", k);
+ ext_debug(inode, "create %d intermediate indices\n", k);
/* insert new index into current index block */
/* current depth stored in i var */
i = depth - 1;
@@ -1158,7 +1209,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
}
lock_buffer(bh);
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto cleanup;
@@ -1167,11 +1219,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
neh->eh_depth = cpu_to_le16(depth - i);
+ neh->eh_generation = 0;
fidx = EXT_FIRST_INDEX(neh);
fidx->ei_block = border;
ext4_idx_store_pblock(fidx, oldblock);
- ext_debug("int.index at %d (block %llu): %u -> %llu\n",
+ ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n",
i, newblock, le32_to_cpu(border), oldblock);
/* move remainder of path[i] to the new index block */
@@ -1185,7 +1238,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
}
/* start copy indexes */
m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
- ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+ ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
EXT_MAX_INDEX(path[i].p_hdr));
ext4_ext_show_move(inode, path, newblock, i);
if (m) {
@@ -1283,7 +1336,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
return -ENOMEM;
lock_buffer(bh);
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err) {
unlock_buffer(bh);
goto out;
@@ -1306,6 +1360,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
neh->eh_magic = EXT4_EXT_MAGIC;
ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, inode, bh);
@@ -1322,13 +1377,13 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
EXT_FIRST_INDEX(neh)->ei_block =
EXT_FIRST_EXTENT(neh)->ee_block;
}
- ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
+ ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n",
le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
le16_add_cpu(&neh->eh_depth, 1);
- ext4_mark_inode_dirty(handle, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
out:
brelse(bh);
@@ -1449,8 +1504,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT4_ERROR_INODE(inode,
"ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
- EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
- le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
+ le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block),
depth);
return -EFSCORRUPTED;
}
@@ -1471,22 +1525,21 @@ static int ext4_ext_search_left(struct inode *inode,
}
/*
- * search the closest allocated block to the right for *logical
- * and returns it at @logical + it's physical address at @phys
- * if *logical is the largest allocated block, the function
- * returns 0 at @phys
- * return value contains 0 (success) or error code
+ * Search the closest allocated block to the right for *logical
+ * and returns it at @logical + it's physical address at @phys.
+ * If not exists, return 0 and @phys is set to 0. We will return
+ * 1 which means we found an allocated block and ret_ex is valid.
+ * Or return a (< 0) error code.
*/
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t *logical, ext4_fsblk_t *phys,
- struct ext4_extent **ret_ex)
+ struct ext4_extent *ret_ex)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
- ext4_fsblk_t block;
int depth; /* Note, NOT eh_depth; depth from top of tree */
int ee_len;
@@ -1553,20 +1606,17 @@ got_index:
* follow it and find the closest allocated
* block to the right */
ix++;
- block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, block,
- path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
ix = EXT_FIRST_INDEX(eh);
- block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -1574,10 +1624,11 @@ got_index:
found_extent:
*logical = le32_to_cpu(ex->ee_block);
*phys = ext4_ext_pblock(ex);
- *ret_ex = ex;
+ if (ret_ex)
+ *ret_ex = *ex;
if (bh)
put_bh(bh);
- return 0;
+ return 1;
}
/*
@@ -1909,7 +1960,7 @@ out:
/*
* ext4_ext_insert_extent:
- * tries to merge requsted extent into the existing extent or
+ * tries to merge requested extent into the existing extent or
* inserts requested extent as new one into the tree,
* creating new leaf in the no-space case.
*/
@@ -1964,7 +2015,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
/* Try to append newex to the ex */
if (ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append [%d]%d block to %u:[%d]%d"
+ ext_debug(inode, "append [%d]%d block to %u:[%d]%d"
"(from %llu)\n",
ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
@@ -1981,7 +2032,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+ ext4_ext_get_actual_len(newext));
if (unwritten)
ext4_ext_mark_unwritten(ex);
- eh = path[depth].p_hdr;
nearex = ex;
goto merge;
}
@@ -1989,7 +2039,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
prepend:
/* Try to prepend newex to the ex */
if (ext4_can_extents_be_merged(inode, newext, ex)) {
- ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+ ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d"
"(from %llu)\n",
le32_to_cpu(newext->ee_block),
ext4_ext_is_unwritten(newext),
@@ -2010,7 +2060,6 @@ prepend:
+ ext4_ext_get_actual_len(newext));
if (unwritten)
ext4_ext_mark_unwritten(ex);
- eh = path[depth].p_hdr;
nearex = ex;
goto merge;
}
@@ -2027,20 +2076,20 @@ prepend:
if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
next = ext4_ext_next_leaf_block(path);
if (next != EXT_MAX_BLOCKS) {
- ext_debug("next leaf block - %u\n", next);
+ ext_debug(inode, "next leaf block - %u\n", next);
BUG_ON(npath != NULL);
- npath = ext4_find_extent(inode, next, NULL, 0);
+ npath = ext4_find_extent(inode, next, NULL, gb_flags);
if (IS_ERR(npath))
return PTR_ERR(npath);
BUG_ON(npath->p_depth != path->p_depth);
eh = npath[depth].p_hdr;
if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
- ext_debug("next leaf isn't full(%d)\n",
+ ext_debug(inode, "next leaf isn't full(%d)\n",
le16_to_cpu(eh->eh_entries));
path = npath;
goto has_space;
}
- ext_debug("next leaf has no free space(%d,%d)\n",
+ ext_debug(inode, "next leaf has no free space(%d,%d)\n",
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
}
@@ -2066,7 +2115,7 @@ has_space:
if (!nearex) {
/* there is no extent in this leaf, create first one */
- ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
+ ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
ext4_ext_is_unwritten(newext),
@@ -2076,7 +2125,7 @@ has_space:
if (le32_to_cpu(newext->ee_block)
> le32_to_cpu(nearex->ee_block)) {
/* Insert after */
- ext_debug("insert %u:%llu:[%d]%d before: "
+ ext_debug(inode, "insert %u:%llu:[%d]%d before: "
"nearest %p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
@@ -2087,7 +2136,7 @@ has_space:
} else {
/* Insert before */
BUG_ON(newext->ee_block == nearex->ee_block);
- ext_debug("insert %u:%llu:[%d]%d after: "
+ ext_debug(inode, "insert %u:%llu:[%d]%d after: "
"nearest %p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
@@ -2097,7 +2146,7 @@ has_space:
}
len = EXT_LAST_EXTENT(eh) - nearex + 1;
if (len > 0) {
- ext_debug("insert %u:%llu:[%d]%d: "
+ ext_debug(inode, "insert %u:%llu:[%d]%d: "
"move %d extents from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
@@ -2129,157 +2178,7 @@ merge:
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
cleanup:
- ext4_ext_drop_refs(npath);
- kfree(npath);
- return err;
-}
-
-static int ext4_fill_fiemap_extents(struct inode *inode,
- ext4_lblk_t block, ext4_lblk_t num,
- struct fiemap_extent_info *fieinfo)
-{
- struct ext4_ext_path *path = NULL;
- struct ext4_extent *ex;
- struct extent_status es;
- ext4_lblk_t next, next_del, start = 0, end = 0;
- ext4_lblk_t last = block + num;
- int exists, depth = 0, err = 0;
- unsigned int flags = 0;
- unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
-
- while (block < last && block != EXT_MAX_BLOCKS) {
- num = last - block;
- /* find extent for this block */
- down_read(&EXT4_I(inode)->i_data_sem);
-
- path = ext4_find_extent(inode, block, &path, 0);
- if (IS_ERR(path)) {
- up_read(&EXT4_I(inode)->i_data_sem);
- err = PTR_ERR(path);
- path = NULL;
- break;
- }
-
- depth = ext_depth(inode);
- if (unlikely(path[depth].p_hdr == NULL)) {
- up_read(&EXT4_I(inode)->i_data_sem);
- EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- err = -EFSCORRUPTED;
- break;
- }
- ex = path[depth].p_ext;
- next = ext4_ext_next_allocated_block(path);
-
- flags = 0;
- exists = 0;
- if (!ex) {
- /* there is no extent yet, so try to allocate
- * all requested space */
- start = block;
- end = block + num;
- } else if (le32_to_cpu(ex->ee_block) > block) {
- /* need to allocate space before found extent */
- start = block;
- end = le32_to_cpu(ex->ee_block);
- if (block + num < end)
- end = block + num;
- } else if (block >= le32_to_cpu(ex->ee_block)
- + ext4_ext_get_actual_len(ex)) {
- /* need to allocate space after found extent */
- start = block;
- end = block + num;
- if (end >= next)
- end = next;
- } else if (block >= le32_to_cpu(ex->ee_block)) {
- /*
- * some part of requested space is covered
- * by found extent
- */
- start = block;
- end = le32_to_cpu(ex->ee_block)
- + ext4_ext_get_actual_len(ex);
- if (block + num < end)
- end = block + num;
- exists = 1;
- } else {
- BUG();
- }
- BUG_ON(end <= start);
-
- if (!exists) {
- es.es_lblk = start;
- es.es_len = end - start;
- es.es_pblk = 0;
- } else {
- es.es_lblk = le32_to_cpu(ex->ee_block);
- es.es_len = ext4_ext_get_actual_len(ex);
- es.es_pblk = ext4_ext_pblock(ex);
- if (ext4_ext_is_unwritten(ex))
- flags |= FIEMAP_EXTENT_UNWRITTEN;
- }
-
- /*
- * Find delayed extent and update es accordingly. We call
- * it even in !exists case to find out whether es is the
- * last existing extent or not.
- */
- next_del = ext4_find_delayed_extent(inode, &es);
- if (!exists && next_del) {
- exists = 1;
- flags |= (FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- }
- up_read(&EXT4_I(inode)->i_data_sem);
-
- if (unlikely(es.es_len == 0)) {
- EXT4_ERROR_INODE(inode, "es.es_len == 0");
- err = -EFSCORRUPTED;
- break;
- }
-
- /*
- * This is possible iff next == next_del == EXT_MAX_BLOCKS.
- * we need to check next == EXT_MAX_BLOCKS because it is
- * possible that an extent is with unwritten and delayed
- * status due to when an extent is delayed allocated and
- * is allocated by fallocate status tree will track both of
- * them in a extent.
- *
- * So we could return a unwritten and delayed extent, and
- * its block is equal to 'next'.
- */
- if (next == next_del && next == EXT_MAX_BLOCKS) {
- flags |= FIEMAP_EXTENT_LAST;
- if (unlikely(next_del != EXT_MAX_BLOCKS ||
- next != EXT_MAX_BLOCKS)) {
- EXT4_ERROR_INODE(inode,
- "next extent == %u, next "
- "delalloc extent = %u",
- next, next_del);
- err = -EFSCORRUPTED;
- break;
- }
- }
-
- if (exists) {
- err = fiemap_fill_next_extent(fieinfo,
- (__u64)es.es_lblk << blksize_bits,
- (__u64)es.es_pblk << blksize_bits,
- (__u64)es.es_len << blksize_bits,
- flags);
- if (err < 0)
- break;
- if (err == 1) {
- err = 0;
- break;
- }
- }
-
- block = es.es_lblk + es.es_len;
- }
-
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(npath);
return err;
}
@@ -2390,7 +2289,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
return;
hole_len = min(es.es_lblk - hole_start, hole_len);
}
- ext_debug(" -> %u:%u\n", hole_start, hole_len);
+ ext_debug(inode, " -> %u:%u\n", hole_start, hole_len);
ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
EXTENT_STATUS_HOLE);
}
@@ -2427,7 +2326,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
err = ext4_ext_dirty(handle, inode, path);
if (err)
return err;
- ext_debug("index is empty, remove it, free block %llu\n", leaf);
+ ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf);
trace_ext4_ext_rm_idx(inode, leaf);
ext4_free_blocks(handle, inode, NULL, leaf, 1,
@@ -2706,7 +2605,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ext4_fsblk_t pblk;
/* the header must be checked already in ext4_ext_remove_space() */
- ext_debug("truncate since %u in leaf to %u\n", start, end);
+ ext_debug(inode, "truncate since %u in leaf to %u\n", start, end);
if (!path[depth].p_hdr)
path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
eh = path[depth].p_hdr;
@@ -2732,7 +2631,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
else
unwritten = 0;
- ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+ ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block,
unwritten, ex_ee_len);
path[depth].p_ext = ex;
@@ -2740,7 +2639,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
b = ex_ee_block+ex_ee_len - 1 < end ?
ex_ee_block+ex_ee_len - 1 : end;
- ext_debug(" border %u:%u\n", a, b);
+ ext_debug(inode, " border %u:%u\n", a, b);
/* If this extent is beyond the end of the hole, skip it */
if (end < ex_ee_block) {
@@ -2849,7 +2748,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (err)
goto out;
- ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
+ ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num,
ext4_ext_pblock(ex));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2926,7 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
partial.lblk = 0;
partial.state = initial;
- ext_debug("truncate since %u to %u\n", start, end);
+ ext_debug(inode, "truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
@@ -2951,7 +2850,8 @@ again:
ext4_fsblk_t pblk;
/* find extent for or closest extent to this block */
- path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
+ path = ext4_find_extent(inode, end, NULL,
+ EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -2986,7 +2886,7 @@ again:
* in use to avoid freeing it when removing blocks.
*/
if (sbi->s_cluster_ratio > 1) {
- pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
+ pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
partial.pclu = EXT4_B2C(sbi, pblk);
partial.state = nofree;
}
@@ -3016,8 +2916,8 @@ again:
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
- &ex);
- if (err)
+ NULL);
+ if (err < 0)
goto out;
if (pblk) {
partial.pclu = EXT4_B2C(sbi, pblk);
@@ -3037,7 +2937,7 @@ again:
le16_to_cpu(path[k].p_hdr->eh_entries)+1;
} else {
path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
- GFP_NOFS);
+ GFP_NOFS | __GFP_NOFAIL);
if (path == NULL) {
ext4_journal_stop(handle);
return -ENOMEM;
@@ -3067,7 +2967,7 @@ again:
/* this is index block */
if (!path[i].p_hdr) {
- ext_debug("initialize header\n");
+ ext_debug(inode, "initialize header\n");
path[i].p_hdr = ext_block_hdr(path[i].p_bh);
}
@@ -3075,7 +2975,7 @@ again:
/* this level hasn't been touched yet */
path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
- ext_debug("init index ptr: hdr 0x%p, num %d\n",
+ ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
path[i].p_hdr,
le16_to_cpu(path[i].p_hdr->eh_entries));
} else {
@@ -3083,18 +2983,18 @@ again:
path[i].p_idx--;
}
- ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
+ ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
i, EXT_FIRST_INDEX(path[i].p_hdr),
path[i].p_idx);
if (ext4_ext_more_to_rm(path + i)) {
struct buffer_head *bh;
/* go to the next level */
- ext_debug("move to level %d (block %llu)\n",
+ ext_debug(inode, "move to level %d (block %llu)\n",
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx), depth - i - 1,
- EXT4_EX_NOCACHE);
+ bh = read_extent_tree_block(inode, path[i].p_idx,
+ depth - i - 1,
+ EXT4_EX_NOCACHE);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -3125,7 +3025,7 @@ again:
brelse(path[i].p_bh);
path[i].p_bh = NULL;
i--;
- ext_debug("return to level %d\n", i);
+ ext_debug(inode, "return to level %d\n", i);
}
}
@@ -3164,8 +3064,7 @@ again:
}
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
path = NULL;
if (err == -EAGAIN)
goto again;
@@ -3267,7 +3166,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
*
*
* Splits extent [a, b] into two extents [a, @split) and [@split, b], states
- * of which are deterimined by split_flag.
+ * of which are determined by split_flag.
*
* There are two cases:
* a> the extent are splitted into two extent.
@@ -3293,8 +3192,7 @@ static int ext4_split_extent_at(handle_t *handle,
BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
(EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
- ext_debug("ext4_split_extents_at: inode %lu, logical"
- "block %llu\n", inode->i_ino, (unsigned long long)split);
+ ext_debug(inode, "logical block %llu\n", (unsigned long long)split);
ext4_ext_show_leaf(inode, path);
@@ -3354,7 +3252,10 @@ static int ext4_split_extent_at(handle_t *handle,
ext4_ext_mark_unwritten(ex2);
err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
- if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ if (err != -ENOSPC && err != -EDQUOT)
+ goto out;
+
+ if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
if (split_flag & EXT4_EXT_DATA_VALID1) {
err = ext4_ext_zeroout(inode, ex2);
@@ -3380,30 +3281,34 @@ static int ext4_split_extent_at(handle_t *handle,
ext4_ext_pblock(&orig_ex));
}
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_len = cpu_to_le16(ee_len);
- ext4_ext_try_to_merge(handle, inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + path->p_depth);
- if (err)
- goto fix_extent_len;
-
- /* update extent status tree */
- err = ext4_zeroout_es(inode, &zero_ex);
-
- goto out;
- } else if (err)
- goto fix_extent_len;
-
-out:
- ext4_ext_show_leaf(inode, path);
- return err;
+ if (!err) {
+ /* update the extent length and mark as initialized */
+ ex->ee_len = cpu_to_le16(ee_len);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ if (!err)
+ /* update extent status tree */
+ err = ext4_zeroout_es(inode, &zero_ex);
+ /* If we failed at this point, we don't know in which
+ * state the extent tree exactly is so don't try to fix
+ * length of the original extent as it may do even more
+ * damage.
+ */
+ goto out;
+ }
+ }
fix_extent_len:
ex->ee_len = orig_ex.ee_len;
+ /*
+ * Ignore ext4_ext_dirty return value since we are already in error path
+ * and err is a non-zero error code.
+ */
ext4_ext_dirty(handle, inode, path + path->p_depth);
return err;
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
}
/*
@@ -3458,7 +3363,7 @@ static int ext4_split_extent(handle_t *handle,
* Update path is required because previous ext4_split_extent_at() may
* result in split of original leaf or extent zeroout.
*/
- path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+ path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
if (IS_ERR(path))
return PTR_ERR(path);
depth = ext_depth(inode);
@@ -3469,7 +3374,6 @@ static int ext4_split_extent(handle_t *handle,
return -EFSCORRUPTED;
}
unwritten = ext4_ext_is_unwritten(ex);
- split_flag1 = 0;
if (map->m_lblk >= ee_block) {
split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
@@ -3527,13 +3431,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
int err = 0;
int split_flag = EXT4_EXT_DATA_VALID2;
- ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
- "block %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)map->m_lblk, map_len);
+ ext_debug(inode, "logical block %llu, max_blocks %u\n",
+ (unsigned long long)map->m_lblk, map_len);
sbi = EXT4_SB(inode->i_sb);
- eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
- inode->i_sb->s_blocksize_bits;
+ eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
+ >> inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map_len)
eof_block = map->m_lblk + map_len;
@@ -3661,7 +3564,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
}
if (allocated) {
/* Mark the block containing both extents as dirty */
- ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + depth);
/* Update path to point to the right extent */
path[depth].p_ext = abut_ex;
@@ -3707,7 +3610,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
split_map.m_len - ee_block);
err = ext4_ext_zeroout(inode, &zero_ex1);
if (err)
- goto out;
+ goto fallback;
split_map.m_len = allocated;
}
if (split_map.m_lblk - ee_block + split_map.m_len <
@@ -3721,7 +3624,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_pblock(ex));
err = ext4_ext_zeroout(inode, &zero_ex2);
if (err)
- goto out;
+ goto fallback;
}
split_map.m_len += split_map.m_lblk - ee_block;
@@ -3730,6 +3633,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
}
}
+fallback:
err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
flags);
if (err > 0)
@@ -3781,17 +3685,16 @@ static int ext4_split_convert_extents(handle_t *handle,
unsigned int ee_len;
int split_flag = 0, depth;
- ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
- __func__, inode->i_ino,
+ ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)map->m_lblk, map->m_len);
- eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
- inode->i_sb->s_blocksize_bits;
+ eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
+ >> inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
eof_block = map->m_lblk + map->m_len;
/*
* It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully insde i_size or new_size.
+ * zeroout only if extent is fully inside i_size or new_size.
*/
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -3828,8 +3731,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
- "block %llu, max_blocks %u\n", inode->i_ino,
+ ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)ee_block, ee_len);
/* If extent is larger than requested it is a clear sign that we still
@@ -3874,64 +3776,11 @@ out:
return err;
}
-/*
- * Handle EOFBLOCKS_FL flag, clearing it if necessary
- */
-static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
- ext4_lblk_t lblk,
- struct ext4_ext_path *path,
- unsigned int len)
-{
- int i, depth;
- struct ext4_extent_header *eh;
- struct ext4_extent *last_ex;
-
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
- return 0;
-
- depth = ext_depth(inode);
- eh = path[depth].p_hdr;
-
- /*
- * We're going to remove EOFBLOCKS_FL entirely in future so we
- * do not care for this case anymore. Simply remove the flag
- * if there are no extents.
- */
- if (unlikely(!eh->eh_entries))
- goto out;
- last_ex = EXT_LAST_EXTENT(eh);
- /*
- * We should clear the EOFBLOCKS_FL flag if we are writing the
- * last block in the last extent in the file. We test this by
- * first checking to see if the caller to
- * ext4_ext_get_blocks() was interested in the last block (or
- * a block beyond the last block) in the current extent. If
- * this turns out to be false, we can bail out from this
- * function immediately.
- */
- if (lblk + len < le32_to_cpu(last_ex->ee_block) +
- ext4_ext_get_actual_len(last_ex))
- return 0;
- /*
- * If the caller does appear to be planning to write at or
- * beyond the end of the current extent, we then test to see
- * if the current extent is the last extent in the file, by
- * checking to make sure it was reached via the rightmost node
- * at each level of the tree.
- */
- for (i = depth-1; i >= 0; i--)
- if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
- return 0;
-out:
- ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
- return ext4_mark_inode_dirty(handle, inode);
-}
-
static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path **ppath,
- unsigned int allocated)
+ unsigned int *allocated)
{
struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
@@ -3952,8 +3801,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- ext_debug("%s: inode %lu, logical"
- "block %llu, max_blocks %u\n", __func__, inode->i_ino,
+ ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)ee_block, ee_len);
if (ee_block != map->m_lblk || ee_len > map->m_len) {
@@ -3991,14 +3839,12 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
ext4_ext_show_leaf(inode, path);
ext4_update_inode_fsync_trans(handle, inode, 1);
- err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
- if (err)
- return err;
+
map->m_flags |= EXT4_MAP_UNWRITTEN;
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_len = allocated;
- return allocated;
+ if (*allocated > map->m_len)
+ *allocated = map->m_len;
+ map->m_len = *allocated;
+ return 0;
}
static int
@@ -4007,14 +3853,13 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_ext_path **ppath, int flags,
unsigned int allocated, ext4_fsblk_t newblock)
{
- struct ext4_ext_path *path = *ppath;
+ struct ext4_ext_path __maybe_unused *path = *ppath;
int ret = 0;
int err = 0;
- ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
- "block %llu, max_blocks %u, flags %x, allocated %u\n",
- inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
- flags, allocated);
+ ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
+ (unsigned long long)map->m_lblk, map->m_len, flags,
+ allocated);
ext4_ext_show_leaf(inode, path);
/*
@@ -4026,41 +3871,38 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
allocated, newblock);
- /* get_block() before submit the IO, split the extent */
+ /* get_block() before submitting IO, split the extent */
if (flags & EXT4_GET_BLOCKS_PRE_IO) {
ret = ext4_split_convert_extents(handle, inode, map, ppath,
flags | EXT4_GET_BLOCKS_CONVERT);
- if (ret <= 0)
- goto out;
+ if (ret < 0) {
+ err = ret;
+ goto out2;
+ }
+ /*
+ * shouldn't get a 0 return when splitting an extent unless
+ * m_len is 0 (bug) or extent has been corrupted
+ */
+ if (unlikely(ret == 0)) {
+ EXT4_ERROR_INODE(inode,
+ "unexpected ret == 0, m_len = %u",
+ map->m_len);
+ err = -EFSCORRUPTED;
+ goto out2;
+ }
map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
if (flags & EXT4_GET_BLOCKS_CONVERT) {
- if (flags & EXT4_GET_BLOCKS_ZERO) {
- if (allocated > map->m_len)
- allocated = map->m_len;
- err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
- allocated);
- if (err < 0)
- goto out2;
- }
- ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
+ err = ext4_convert_unwritten_extents_endio(handle, inode, map,
ppath);
- if (ret >= 0) {
- ext4_update_inode_fsync_trans(handle, inode, 1);
- err = check_eofblocks_fl(handle, inode, map->m_lblk,
- path, map->m_len);
- } else
- err = ret;
- map->m_flags |= EXT4_MAP_MAPPED;
- map->m_pblk = newblock;
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_len = allocated;
- goto out2;
+ if (err < 0)
+ goto out2;
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ goto map_out;
}
- /* buffered IO case */
+ /* buffered IO cases */
/*
* repeat fallocate creation request
* we already have an unwritten extent
@@ -4083,35 +3925,39 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
goto out1;
}
- /* buffered write, writepage time, convert*/
+ /*
+ * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1.
+ * For buffered writes, at writepage time, etc. Convert a
+ * discovered unwritten extent to written.
+ */
ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out:
- if (ret <= 0) {
+ if (ret < 0) {
err = ret;
goto out2;
- } else
- allocated = ret;
- map->m_flags |= EXT4_MAP_NEW;
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_len = allocated;
+ }
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ /*
+ * shouldn't get a 0 return when converting an unwritten extent
+ * unless m_len is 0 (bug) or extent has been corrupted
+ */
+ if (unlikely(ret == 0)) {
+ EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
+ map->m_len);
+ err = -EFSCORRUPTED;
+ goto out2;
+ }
+out:
+ allocated = ret;
+ map->m_flags |= EXT4_MAP_NEW;
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
- if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
- err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
- map->m_len);
- if (err < 0)
- goto out2;
- }
out1:
+ map->m_pblk = newblock;
if (allocated > map->m_len)
allocated = map->m_len;
- ext4_ext_show_leaf(inode, path);
- map->m_pblk = newblock;
map->m_len = allocated;
+ ext4_ext_show_leaf(inode, path);
out2:
return err ? err : allocated;
}
@@ -4227,7 +4073,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
* (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
*
- * return > 0, number of of blocks already mapped/allocated
+ * return > 0, number of blocks already mapped/allocated
* if create == 0 and these are pre-allocated blocks
* buffer head is unmapped
* otherwise blocks are mapped
@@ -4241,18 +4087,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
struct ext4_ext_path *path = NULL;
- struct ext4_extent newex, *ex, *ex2;
+ struct ext4_extent newex, *ex, ex2;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- ext4_fsblk_t newblock = 0;
- int free_on_err = 0, err = 0, depth, ret;
+ ext4_fsblk_t newblock = 0, pblk;
+ int err = 0, depth, ret;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
ext4_lblk_t cluster_offset;
- bool map_from_cluster = false;
- ext_debug("blocks %u/%u requested for inode %lu\n",
- map->m_lblk, map->m_len, inode->i_ino);
+ ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len);
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
@@ -4260,7 +4104,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
- goto out2;
+ goto out;
}
depth = ext_depth(inode);
@@ -4276,7 +4120,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
(unsigned long) map->m_lblk, depth,
path[depth].p_block);
err = -EFSCORRUPTED;
- goto out2;
+ goto out;
}
ex = path[depth].p_ext;
@@ -4299,8 +4143,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
newblock = map->m_lblk - ee_block + ee_start;
/* number of remaining blocks in the extent */
allocated = ee_len - (map->m_lblk - ee_block);
- ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
- ee_block, ee_len, newblock);
+ ext_debug(inode, "%u fit into %u:%d -> %llu\n",
+ map->m_lblk, ee_block, ee_len, newblock);
/*
* If the extent is initialized check whether the
@@ -4308,12 +4152,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
*/
if ((!ext4_ext_is_unwritten(ex)) &&
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
- allocated = convert_initialized_extent(
- handle, inode, map, &path,
- allocated);
- goto out2;
- } else if (!ext4_ext_is_unwritten(ex))
+ err = convert_initialized_extent(handle,
+ inode, map, &path, &allocated);
goto out;
+ } else if (!ext4_ext_is_unwritten(ex)) {
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = newblock;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_len = allocated;
+ ext4_ext_show_leaf(inode, path);
+ goto out;
+ }
ret = ext4_ext_handle_unwritten_extents(
handle, inode, map, &path, flags,
@@ -4322,7 +4172,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
err = ret;
else
allocated = ret;
- goto out2;
+ goto out;
}
}
@@ -4347,7 +4197,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
map->m_pblk = 0;
map->m_len = min_t(unsigned int, map->m_len, hole_len);
- goto out2;
+ goto out;
}
/*
@@ -4364,7 +4214,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map_from_cluster = true;
goto got_allocated_blocks;
}
@@ -4372,20 +4221,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ar.lleft = map->m_lblk;
err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
if (err)
- goto out2;
+ goto out;
ar.lright = map->m_lblk;
- ex2 = NULL;
err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
- if (err)
- goto out2;
+ if (err < 0)
+ goto out;
/* Check if the extent after searching to the right implies a
* cluster we can use. */
- if ((sbi->s_cluster_ratio > 1) && ex2 &&
- get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
+ if ((sbi->s_cluster_ratio > 1) && err &&
+ get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map_from_cluster = true;
goto got_allocated_blocks;
}
@@ -4439,51 +4286,44 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ar.flags |= EXT4_MB_USE_RESERVED;
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
- goto out2;
- ext_debug("allocate new block: goal %llu, found %llu/%u\n",
- ar.goal, newblock, allocated);
- free_on_err = 1;
+ goto out;
allocated_clusters = ar.len;
ar.len = EXT4_C2B(sbi, ar.len) - offset;
+ ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n",
+ ar.goal, newblock, ar.len, allocated);
if (ar.len > allocated)
ar.len = allocated;
got_allocated_blocks:
/* try to insert new extent into found leaf and return */
- ext4_ext_store_pblock(&newex, newblock + offset);
+ pblk = newblock + offset;
+ ext4_ext_store_pblock(&newex, pblk);
newex.ee_len = cpu_to_le16(ar.len);
/* Mark unwritten */
- if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
+ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
ext4_ext_mark_unwritten(&newex);
map->m_flags |= EXT4_MAP_UNWRITTEN;
}
- err = 0;
- if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
- err = check_eofblocks_fl(handle, inode, map->m_lblk,
- path, ar.len);
- if (!err)
- err = ext4_ext_insert_extent(handle, inode, &path,
- &newex, flags);
-
- if (err && free_on_err) {
- int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
- EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
- /* free data blocks we just allocated */
- /* not a good idea to call discard here directly,
- * but otherwise we'd need to call it every free() */
- ext4_discard_preallocations(inode);
- ext4_free_blocks(handle, inode, NULL, newblock,
- EXT4_C2B(sbi, allocated_clusters), fb_flags);
- goto out2;
- }
+ err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
+ if (err) {
+ if (allocated_clusters) {
+ int fb_flags = 0;
- /* previous routine could use block we allocated */
- newblock = ext4_ext_pblock(&newex);
- allocated = ext4_ext_get_actual_len(&newex);
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_flags |= EXT4_MAP_NEW;
+ /*
+ * free data blocks we just allocated.
+ * not a good idea to call discard here directly,
+ * but otherwise we'd need to call it every free().
+ */
+ ext4_discard_preallocations(inode, 0);
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
+ ext4_free_blocks(handle, inode, NULL, newblock,
+ EXT4_C2B(sbi, allocated_clusters),
+ fb_flags);
+ }
+ goto out;
+ }
/*
* Reduce the reserved cluster count to reflect successful deferred
@@ -4491,7 +4331,7 @@ got_allocated_blocks:
* clusters discovered to be delayed allocated. Once allocated, a
* cluster is not included in the reserved count.
*/
- if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
+ if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
/*
* When allocating delayed allocated clusters, simply
@@ -4530,16 +4370,14 @@ got_allocated_blocks:
ext4_update_inode_fsync_trans(handle, inode, 1);
else
ext4_update_inode_fsync_trans(handle, inode, 0);
-out:
- if (allocated > map->m_len)
- allocated = map->m_len;
+
+ map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED);
+ map->m_pblk = pblk;
+ map->m_len = ar.len;
+ allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
- map->m_flags |= EXT4_MAP_MAPPED;
- map->m_pblk = newblock;
- map->m_len = allocated;
-out2:
- ext4_ext_drop_refs(path);
- kfree(path);
+out:
+ ext4_free_ext_path(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
err ? err : allocated);
@@ -4570,13 +4408,18 @@ retry:
err = ext4_es_remove_extent(inode, last_block,
EXT_MAX_BLOCKS - last_block);
if (err == -ENOMEM) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ memalloc_retry_wait(GFP_ATOMIC);
goto retry;
}
if (err)
return err;
- return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+retry_remove_space:
+ err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+ if (err == -ENOMEM) {
+ memalloc_retry_wait(GFP_ATOMIC);
+ goto retry_remove_space;
+ }
+ return err;
}
static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
@@ -4585,8 +4428,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
{
struct inode *inode = file_inode(file);
handle_t *handle;
- int ret = 0;
- int ret2 = 0;
+ int ret = 0, ret2 = 0, ret3 = 0;
int retries = 0;
int depth = 0;
struct ext4_map_blocks map;
@@ -4611,7 +4453,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
depth = ext_depth(inode);
retry:
- while (ret >= 0 && len) {
+ while (len) {
/*
* Recalculate credits when extent tree depth changes.
*/
@@ -4633,9 +4475,13 @@ retry:
inode->i_ino, map.m_lblk,
map.m_len, ret);
ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
+ ext4_journal_stop(handle);
break;
}
+ /*
+ * allow a full retry cycle for any remaining allocations
+ */
+ retries = 0;
map.m_lblk += ret;
map.m_len = len = len - ret;
epos = (loff_t)map.m_lblk << inode->i_blkbits;
@@ -4645,34 +4491,29 @@ retry:
epos = new_size;
if (ext4_update_inode_size(inode, epos) & 0x1)
inode->i_mtime = inode->i_ctime;
- } else {
- if (epos > inode->i_size)
- ext4_set_inode_flag(inode,
- EXT4_INODE_EOFBLOCKS);
}
- ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
- ret2 = ext4_journal_stop(handle);
- if (ret2)
+ ret3 = ext4_journal_stop(handle);
+ ret2 = ret3 ? ret3 : ret2;
+ if (unlikely(ret2))
break;
}
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries)) {
- ret = 0;
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
- }
return ret > 0 ? ret2 : ret;
}
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
handle_t *handle = NULL;
unsigned int max_blocks;
loff_t new_size = 0;
@@ -4694,7 +4535,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
/*
- * Round up offset. This is not fallocate, we neet to zero out
+ * Round up offset. This is not fallocate, we need to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
* range.
@@ -4717,7 +4558,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
inode_lock(inode);
/*
- * Indirect files do not support unwritten extnets
+ * Indirect files do not support unwritten extents
*/
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
ret = -EOPNOTSUPP;
@@ -4734,12 +4575,14 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
- if (mode & FALLOC_FL_KEEP_SIZE)
- flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
- /* Wait all existing dio workers, newcomers will block on i_mutex */
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
@@ -4761,17 +4604,17 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* Prevent page faults from reinstantiating pages we have
* released from page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
goto out_mutex;
}
ret = ext4_update_disksize_before_punch(inode, offset, len);
if (ret) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
goto out_mutex;
}
/* Now release the pages and zero block aligned part of pages */
@@ -4780,7 +4623,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags);
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
if (ret)
goto out_mutex;
}
@@ -4802,18 +4645,11 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
inode->i_mtime = inode->i_ctime = current_time(inode);
- if (new_size) {
+ if (new_size)
ext4_update_inode_size(inode, new_size);
- } else {
- /*
- * Mark that we allocate beyond EOF so the subsequent truncate
- * can proceed even if the new size is the same as i_size.
- */
- if (offset + len > inode->i_size)
- ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
- }
- ext4_mark_inode_dirty(handle, inode);
-
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret))
+ goto out_handle;
/* Zero out partial block at the edges of the range */
ret = ext4_zero_partial_blocks(handle, inode, offset, len);
if (ret >= 0)
@@ -4822,6 +4658,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (file->f_flags & O_SYNC)
ext4_handle_sync(handle);
+out_handle:
ext4_journal_stop(handle);
out_mutex:
inode_unlock(inode);
@@ -4861,29 +4698,36 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- return ext4_punch_hole(inode, offset, len);
-
+ inode_lock(inode);
ret = ext4_convert_inline_data(inode);
+ inode_unlock(inode);
if (ret)
- return ret;
+ goto exit;
- if (mode & FALLOC_FL_COLLAPSE_RANGE)
- return ext4_collapse_range(inode, offset, len);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ ret = ext4_punch_hole(file, offset, len);
+ goto exit;
+ }
- if (mode & FALLOC_FL_INSERT_RANGE)
- return ext4_insert_range(inode, offset, len);
+ if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ ret = ext4_collapse_range(file, offset, len);
+ goto exit;
+ }
- if (mode & FALLOC_FL_ZERO_RANGE)
- return ext4_zero_range(file, offset, len, mode);
+ if (mode & FALLOC_FL_INSERT_RANGE) {
+ ret = ext4_insert_range(file, offset, len);
+ goto exit;
+ }
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = ext4_zero_range(file, offset, len, mode);
+ goto exit;
+ }
trace_ext4_fallocate_enter(inode, offset, len, mode);
lblk = offset >> blkbits;
max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
- if (mode & FALLOC_FL_KEEP_SIZE)
- flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
inode_lock(inode);
@@ -4904,20 +4748,25 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
}
- /* Wait all existing dio workers, newcomers will block on i_mutex */
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
if (ret)
goto out;
if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
- ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
- EXT4_I(inode)->i_sync_tid);
+ ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
}
out:
inode_unlock(inode);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+exit:
return ret;
}
@@ -4935,8 +4784,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len)
{
unsigned int max_blocks;
- int ret = 0;
- int ret2 = 0;
+ int ret = 0, ret2 = 0, ret3 = 0;
struct ext4_map_blocks map;
unsigned int blkbits = inode->i_blkbits;
unsigned int credits = 0;
@@ -4969,9 +4817,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
"ext4_ext_map_blocks returned %d",
inode->i_ino, map.m_lblk,
map.m_len, ret);
- ext4_mark_inode_dirty(handle, inode);
- if (credits)
- ret2 = ext4_journal_stop(handle);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+ if (credits) {
+ ret3 = ext4_journal_stop(handle);
+ if (unlikely(ret3))
+ ret2 = ret3;
+ }
+
if (ret <= 0 || ret2)
break;
}
@@ -4980,7 +4832,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
{
- int ret, err = 0;
+ int ret = 0, err = 0;
struct ext4_io_end_vec *io_end_vec;
/*
@@ -5009,64 +4861,13 @@ int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
return ret < 0 ? ret : err;
}
-/*
- * If newes is not existing extent (newes->ec_pblk equals zero) find
- * delayed extent at start of newes and update newes accordingly and
- * return start of the next delayed extent.
- *
- * If newes is existing extent (newes->ec_pblk is not equal zero)
- * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
- * extent found. Leave newes unmodified.
- */
-static int ext4_find_delayed_extent(struct inode *inode,
- struct extent_status *newes)
-{
- struct extent_status es;
- ext4_lblk_t block, next_del;
-
- if (newes->es_pblk == 0) {
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- newes->es_lblk,
- newes->es_lblk + newes->es_len - 1,
- &es);
-
- /*
- * No extent in extent-tree contains block @newes->es_pblk,
- * then the block may stay in 1)a hole or 2)delayed-extent.
- */
- if (es.es_len == 0)
- /* A hole found. */
- return 0;
-
- if (es.es_lblk > newes->es_lblk) {
- /* A hole found. */
- newes->es_len = min(es.es_lblk - newes->es_lblk,
- newes->es_len);
- return 0;
- }
-
- newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
- }
-
- block = newes->es_lblk + newes->es_len;
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
- EXT_MAX_BLOCKS, &es);
- if (es.es_len == 0)
- next_del = EXT_MAX_BLOCKS;
- else
- next_del = es.es_lblk;
-
- return next_del;
-}
-
-static int ext4_xattr_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo)
+static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap)
{
__u64 physical = 0;
- __u64 length;
- __u32 flags = FIEMAP_EXTENT_LAST;
+ __u64 length = 0;
int blockbits = inode->i_sb->s_blocksize_bits;
int error = 0;
+ u16 iomap_type;
/* in-inode? */
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
@@ -5081,40 +4882,69 @@ static int ext4_xattr_fiemap(struct inode *inode,
EXT4_I(inode)->i_extra_isize;
physical += offset;
length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
- flags |= FIEMAP_EXTENT_DATA_INLINE;
brelse(iloc.bh);
- } else { /* external block */
+ iomap_type = IOMAP_INLINE;
+ } else if (EXT4_I(inode)->i_file_acl) { /* external block */
physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
length = inode->i_sb->s_blocksize;
+ iomap_type = IOMAP_MAPPED;
+ } else {
+ /* no in-inode or external block for xattr, so return -ENOENT */
+ error = -ENOENT;
+ goto out;
}
- if (physical)
- error = fiemap_fill_next_extent(fieinfo, 0, physical,
- length, flags);
- return (error < 0 ? error : 0);
+ iomap->addr = physical;
+ iomap->offset = 0;
+ iomap->length = length;
+ iomap->type = iomap_type;
+ iomap->flags = 0;
+out:
+ return error;
}
-static int _ext4_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len,
- int (*fill)(struct inode *, ext4_lblk_t,
- ext4_lblk_t,
- struct fiemap_extent_info *))
+static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
- ext4_lblk_t start_blk;
- u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR;
+ int error;
- int error = 0;
+ error = ext4_iomap_xattr_fiemap(inode, iomap);
+ if (error == 0 && (offset >= iomap->length))
+ error = -ENOENT;
+ return error;
+}
- if (ext4_has_inline_data(inode)) {
- int has_inline = 1;
+static const struct iomap_ops ext4_iomap_xattr_ops = {
+ .iomap_begin = ext4_iomap_xattr_begin,
+};
- error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
- start, len);
+static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
+{
+ u64 maxbytes;
- if (has_inline)
- return error;
- }
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ maxbytes = inode->i_sb->s_maxbytes;
+ else
+ maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+
+ if (*len == 0)
+ return -EINVAL;
+ if (start > maxbytes)
+ return -EFBIG;
+
+ /*
+ * Shrink request scope to what the fs can actually handle.
+ */
+ if (*len > maxbytes || (maxbytes - *len) < start)
+ *len = maxbytes - start;
+ return 0;
+}
+
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ int error = 0;
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
error = ext4_ext_precache(inode);
@@ -5123,48 +4953,31 @@ static int _ext4_fiemap(struct inode *inode,
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
- /* fallback to generic here if not in extents fmt */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
- fill == ext4_fill_fiemap_extents)
- return generic_block_fiemap(inode, fieinfo, start, len,
- ext4_get_block);
-
- if (fill == ext4_fill_es_cache_info)
- ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
- if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
- return -EBADR;
+ /*
+ * For bitmap files the maximum size limit could be smaller than
+ * s_maxbytes, so check len here manually instead of just relying on the
+ * generic check.
+ */
+ error = ext4_fiemap_check_ranges(inode, start, &len);
+ if (error)
+ return error;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
- error = ext4_xattr_fiemap(inode, fieinfo);
- } else {
- ext4_lblk_t len_blks;
- __u64 last_blk;
-
- start_blk = start >> inode->i_sb->s_blocksize_bits;
- last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
- if (last_blk >= EXT_MAX_BLOCKS)
- last_blk = EXT_MAX_BLOCKS-1;
- len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
-
- /*
- * Walk the extent tree gathering extent information
- * and pushing extents back to the user.
- */
- error = fill(inode, start_blk, len_blks, fieinfo);
+ fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
+ return iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_xattr_ops);
}
- return error;
-}
-int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
-{
- return _ext4_fiemap(inode, fieinfo, start, len,
- ext4_fill_fiemap_extents);
+ return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
}
int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
+ ext4_lblk_t start_blk, len_blks;
+ __u64 last_blk;
+ int error = 0;
+
if (ext4_has_inline_data(inode)) {
int has_inline;
@@ -5175,39 +4988,32 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
return 0;
}
- return _ext4_fiemap(inode, fieinfo, start, len,
- ext4_fill_es_cache_info);
-}
+ if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ error = ext4_ext_precache(inode);
+ if (error)
+ return error;
+ fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
+ }
+ error = fiemap_prep(inode, fieinfo, start, &len, 0);
+ if (error)
+ return error;
-/*
- * ext4_access_path:
- * Function to access the path buffer for marking it dirty.
- * It also checks if there are sufficient credits left in the journal handle
- * to update path.
- */
-static int
-ext4_access_path(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
-{
- int credits, err;
+ error = ext4_fiemap_check_ranges(inode, start, &len);
+ if (error)
+ return error;
- if (!ext4_handle_valid(handle))
- return 0;
+ start_blk = start >> inode->i_sb->s_blocksize_bits;
+ last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
+ if (last_blk >= EXT_MAX_BLOCKS)
+ last_blk = EXT_MAX_BLOCKS-1;
+ len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
/*
- * Check if need to extend journal credits
- * 3 for leaf, sb, and inode plus 2 (bmap and group
- * descriptor) for each block group; assume two block
- * groups
+ * Walk the extent tree gathering extent information
+ * and pushing extents back to the user.
*/
- credits = ext4_writepage_trans_blocks(inode);
- err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
- if (err < 0)
- return err;
-
- err = ext4_ext_get_access(handle, inode, path);
- return err;
+ return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo);
}
/*
@@ -5224,6 +5030,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
int depth, err = 0;
struct ext4_extent *ex_start, *ex_last;
bool update = false;
+ int credits, restart_credits;
depth = path->p_depth;
while (depth >= 0) {
@@ -5233,13 +5040,26 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
return -EFSCORRUPTED;
ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+ /* leaf + sb + inode */
+ credits = 3;
+ if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+ update = true;
+ /* extent tree + sb + inode */
+ credits = depth + 2;
+ }
- err = ext4_access_path(handle, inode, path + depth);
- if (err)
+ restart_credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_datasem_ensure_credits(handle, inode, credits,
+ restart_credits, 0);
+ if (err) {
+ if (err > 0)
+ err = -EAGAIN;
goto out;
+ }
- if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
- update = true;
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
while (ex_start <= ex_last) {
if (SHIFT == SHIFT_LEFT) {
@@ -5270,7 +5090,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
}
/* Update index too */
- err = ext4_access_path(handle, inode, path + depth);
+ err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
@@ -5309,6 +5129,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
int ret = 0, depth;
struct ext4_extent *extent;
ext4_lblk_t stop, *iterator, ex_start, ex_end;
+ ext4_lblk_t tmp = EXT_MAX_BLOCKS;
/* Let path point to the last extent */
path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
@@ -5362,11 +5183,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
* till we reach stop. In case of right shift, iterator points to stop
* and it is decreased till we reach start.
*/
+again:
if (SHIFT == SHIFT_LEFT)
iterator = &start;
else
iterator = &stop;
+ if (tmp != EXT_MAX_BLOCKS)
+ *iterator = tmp;
+
/*
* Its safe to start updating extents. Start and stop are unsigned, so
* in case of right shift if extent with 0 block is reached, iterator
@@ -5395,6 +5220,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
}
}
+ tmp = *iterator;
if (SHIFT == SHIFT_LEFT) {
extent = EXT_LAST_EXTENT(path[depth].p_hdr);
*iterator = le32_to_cpu(extent->ee_block) +
@@ -5413,12 +5239,14 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
}
ret = ext4_ext_shift_path_extents(path, shift, inode,
handle, SHIFT);
+ /* iterator can be NULL which means we should break */
+ if (ret == -EAGAIN)
+ goto again;
if (ret)
break;
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -5427,9 +5255,11 @@ out:
* This implements the fallocate's collapse range functionality for ext4
* Returns: 0 and non-zero on error.
*/
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
ext4_lblk_t punch_start, punch_stop;
handle_t *handle;
unsigned int credits;
@@ -5479,11 +5309,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
@@ -5498,15 +5332,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
* Write tail of the last page before removed range since it will get
* removed from the page cache below.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+ ret = filemap_write_and_wait_range(mapping, ioffset, offset);
if (ret)
goto out_mmap;
/*
* Write data that will be shifted to preserve them when discarding
* page cache below. We are also protected from pages becoming dirty
- * by i_mmap_sem.
+ * by i_rwsem and invalidate_lock.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+ ret = filemap_write_and_wait_range(mapping, offset + len,
LLONG_MAX);
if (ret)
goto out_mmap;
@@ -5518,9 +5352,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ret = PTR_ERR(handle);
goto out_mmap;
}
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_es_remove_extent(inode, punch_start,
EXT_MAX_BLOCKS - punch_start);
@@ -5534,7 +5369,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_ext_shift_extents(inode, handle, punch_stop,
punch_stop - punch_start, SHIFT_LEFT);
@@ -5551,13 +5386,13 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
inode->i_mtime = inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ ret = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
ext4_journal_stop(handle);
out_mmap:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
@@ -5571,9 +5406,11 @@ out_mutex:
* by len bytes.
* Returns 0 on success, error otherwise.
*/
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
handle_t *handle;
struct ext4_ext_path *path;
struct ext4_extent *extent;
@@ -5628,11 +5465,15 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
@@ -5656,6 +5497,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
ret = PTR_ERR(handle);
goto out_mmap;
}
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
@@ -5666,7 +5508,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
path = ext4_find_extent(inode, offset_lblk, NULL, 0);
if (IS_ERR(path)) {
@@ -5696,15 +5538,13 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
EXT4_GET_BLOCKS_METADATA_NOFAIL);
}
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (ret < 0) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
} else {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
ret = ext4_es_remove_extent(inode, offset_lblk,
@@ -5731,7 +5571,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
out_mmap:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
@@ -5752,7 +5592,7 @@ out_mutex:
* stuff such as page-cache locking consistency, bh mapping consistency or
* extent's data copying must be performed by caller.
* Locking:
- * i_mutex is held for both inodes
+ * i_rwsem is held for both inodes
* i_data_sem is locked for write for both inodes
* Assumptions:
* All pages from requested range are locked for both inodes
@@ -5800,7 +5640,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
}
ex1 = path1[path1->p_depth].p_ext;
ex2 = path2[path2->p_depth].p_ext;
- /* Do we have somthing to swap ? */
+ /* Do we have something to swap ? */
if (unlikely(!ex2 || !ex1))
goto finish;
@@ -5924,10 +5764,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
count -= len;
repeat:
- ext4_ext_drop_refs(path1);
- kfree(path1);
- ext4_ext_drop_refs(path2);
- kfree(path2);
+ ext4_free_ext_path(path1);
+ ext4_free_ext_path(path2);
path1 = path2 = NULL;
}
return replaced_count;
@@ -6006,8 +5844,269 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return err ? err : mapped;
}
+
+/*
+ * Updates physical block address and unwritten status of extent
+ * starting at lblk start and of len. If such an extent doesn't exist,
+ * this function splits the extent tree appropriately to create an
+ * extent like this. This function is called in the fast commit
+ * replay path. Returns 0 on success and error on failure.
+ */
+int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
+ int len, int unwritten, ext4_fsblk_t pblk)
+{
+ struct ext4_ext_path *path = NULL, *ppath;
+ struct ext4_extent *ex;
+ int ret;
+
+ path = ext4_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+
+ if (le32_to_cpu(ex->ee_block) != start ||
+ ext4_ext_get_actual_len(ex) != len) {
+ /* We need to split this extent to match our extent first */
+ ppath = path;
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out;
+ kfree(path);
+ path = ext4_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return -1;
+ ppath = path;
+ ex = path[path->p_depth].p_ext;
+ WARN_ON(le32_to_cpu(ex->ee_block) != start);
+ if (ext4_ext_get_actual_len(ex) != len) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_force_split_extent_at(NULL, inode, &ppath,
+ start + len, 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out;
+ kfree(path);
+ path = ext4_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return -EINVAL;
+ ex = path[path->p_depth].p_ext;
+ }
+ }
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+ ext4_ext_store_pblock(ex, pblk);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
+ up_write(&EXT4_I(inode)->i_data_sem);
+out:
+ ext4_free_ext_path(path);
+ ext4_mark_inode_dirty(NULL, inode);
+ return ret;
+}
+
+/* Try to shrink the extent tree */
+void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t old_cur, cur = 0;
+
+ while (cur < end) {
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ return;
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_free_ext_path(path);
+ ext4_mark_inode_dirty(NULL, inode);
+ return;
+ }
+ old_cur = cur;
+ cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+ if (cur <= old_cur)
+ cur = old_cur + 1;
+ ext4_ext_try_to_merge(NULL, inode, path, ex);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_mark_inode_dirty(NULL, inode);
+ ext4_free_ext_path(path);
+ }
+}
+
+/* Check if *cur is a hole and if it is, skip it */
+static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
+{
+ int ret;
+ struct ext4_map_blocks map;
+
+ map.m_lblk = *cur;
+ map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ if (ret != 0)
+ return 0;
+ *cur = *cur + map.m_len;
+ return 0;
+}
+
+/* Count number of blocks used by this inode and update i_blocks */
+int ext4_ext_replay_set_iblocks(struct inode *inode)
+{
+ struct ext4_ext_path *path = NULL, *path2 = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t cur = 0, end;
+ int numblks = 0, i, ret = 0;
+ ext4_fsblk_t cmp1, cmp2;
+ struct ext4_map_blocks map;
+
+ /* Determin the size of the file first */
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_free_ext_path(path);
+ goto out;
+ }
+ end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+ ext4_free_ext_path(path);
+
+ /* Count the number of data blocks */
+ cur = 0;
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0)
+ numblks += ret;
+ cur = cur + map.m_len;
+ }
+
+ /*
+ * Count the number of extent tree blocks. We do it by looking up
+ * two successive extents and determining the difference between
+ * their paths. When path is different for 2 successive extents
+ * we compare the blocks in the path at each level and increment
+ * iblocks by total number of differences found.
+ */
+ cur = 0;
+ ret = skip_hole(inode, &cur);
+ if (ret < 0)
+ goto out;
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ goto out;
+ numblks += path->p_depth;
+ ext4_free_ext_path(path);
+ while (cur < end) {
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ break;
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_free_ext_path(path);
+ return 0;
+ }
+ cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
+ ext4_ext_get_actual_len(ex));
+ ret = skip_hole(inode, &cur);
+ if (ret < 0) {
+ ext4_free_ext_path(path);
+ break;
+ }
+ path2 = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path2)) {
+ ext4_free_ext_path(path);
+ break;
+ }
+ for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
+ cmp1 = cmp2 = 0;
+ if (i <= path->p_depth)
+ cmp1 = path[i].p_bh ?
+ path[i].p_bh->b_blocknr : 0;
+ if (i <= path2->p_depth)
+ cmp2 = path2[i].p_bh ?
+ path2[i].p_bh->b_blocknr : 0;
+ if (cmp1 != cmp2 && cmp2 != 0)
+ numblks++;
+ }
+ ext4_free_ext_path(path);
+ ext4_free_ext_path(path2);
+ }
+
+out:
+ inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
+ ext4_mark_inode_dirty(NULL, inode);
+ return 0;
+}
+
+int ext4_ext_clear_bb(struct inode *inode)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t cur = 0, end;
+ int j, ret = 0;
+ struct ext4_map_blocks map;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ return 0;
+
+ /* Determin the size of the file first */
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_free_ext_path(path);
+ return 0;
+ }
+ end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+ ext4_free_ext_path(path);
+
+ cur = 0;
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
+ if (!IS_ERR_OR_NULL(path)) {
+ for (j = 0; j < path->p_depth; j++) {
+
+ ext4_mb_mark_bb(inode->i_sb,
+ path[j].p_block, 1, 0);
+ ext4_fc_record_regions(inode->i_sb, inode->i_ino,
+ 0, path[j].p_block, 1, 1);
+ }
+ ext4_free_ext_path(path);
+ }
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_fc_record_regions(inode->i_sb, inode->i_ino,
+ map.m_lblk, map.m_pblk, map.m_len, 1);
+ }
+ cur = cur + map.m_len;
+ }
+
+ return 0;
+}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index d996b44d2265..cd0a861853e3 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -311,6 +311,9 @@ void ext4_es_find_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
trace_ext4_es_find_extent_range_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
@@ -361,6 +364,9 @@ bool ext4_es_scan_range(struct inode *inode,
{
bool ret;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return false;
+
read_lock(&EXT4_I(inode)->i_es_lock);
ret = __es_scan_range(inode, matching_fn, lblk, end);
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -404,6 +410,9 @@ bool ext4_es_scan_clu(struct inode *inode,
{
bool ret;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return false;
+
read_lock(&EXT4_I(inode)->i_es_lock);
ret = __es_scan_clu(inode, matching_fn, lblk);
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -658,8 +667,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
}
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
static void ext4_es_insert_extent_ind_check(struct inode *inode,
@@ -812,6 +820,9 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
int err = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
@@ -873,6 +884,9 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
@@ -908,6 +922,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct rb_node *node;
int found = 0;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
trace_ext4_es_lookup_extent_enter(inode, lblk);
es_debug("lookup extent in block %u\n", lblk);
@@ -1054,7 +1071,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
/* record the first block of the first delonly extent seen */
- if (rc->first_do_lblk_found == false) {
+ if (!rc->first_do_lblk_found) {
rc->first_do_lblk = i;
rc->first_do_lblk_found = true;
}
@@ -1419,6 +1436,9 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
int err = 0;
int reserved = 0;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
@@ -1553,11 +1573,9 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
- if (!nr_to_scan)
- return ret;
-
nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
+ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
return nr_shrunk;
}
@@ -1635,7 +1653,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
- err = register_shrinker(&sbi->s_es_shrinker);
+ err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
+ sbi->s_sb->s_id);
if (err)
goto err4;
@@ -1969,6 +1988,9 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
struct extent_status newes;
int err = 0;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
lblk, inode->i_ino);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
new file mode 100644
index 000000000000..0f6d0a80467d
--- /dev/null
+++ b/fs/ext4/fast_commit.c
@@ -0,0 +1,2301 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * fs/ext4/fast_commit.c
+ *
+ * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+ *
+ * Ext4 fast commits routines.
+ */
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "mballoc.h"
+
+/*
+ * Ext4 Fast Commits
+ * -----------------
+ *
+ * Ext4 fast commits implement fine grained journalling for Ext4.
+ *
+ * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
+ * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
+ * TLV during the recovery phase. For the scenarios for which we currently
+ * don't have replay code, fast commit falls back to full commits.
+ * Fast commits record delta in one of the following three categories.
+ *
+ * (A) Directory entry updates:
+ *
+ * - EXT4_FC_TAG_UNLINK - records directory entry unlink
+ * - EXT4_FC_TAG_LINK - records directory entry link
+ * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
+ *
+ * (B) File specific data range updates:
+ *
+ * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
+ * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
+ *
+ * (C) Inode metadata (mtime / ctime etc):
+ *
+ * - EXT4_FC_TAG_INODE - record the inode that should be replayed
+ * during recovery. Note that iblocks field is
+ * not replayed and instead derived during
+ * replay.
+ * Commit Operation
+ * ----------------
+ * With fast commits, we maintain all the directory entry operations in the
+ * order in which they are issued in an in-memory queue. This queue is flushed
+ * to disk during the commit operation. We also maintain a list of inodes
+ * that need to be committed during a fast commit in another in memory queue of
+ * inodes. During the commit operation, we commit in the following order:
+ *
+ * [1] Lock inodes for any further data updates by setting COMMITTING state
+ * [2] Submit data buffers of all the inodes
+ * [3] Wait for [2] to complete
+ * [4] Commit all the directory entry updates in the fast commit space
+ * [5] Commit all the changed inode structures
+ * [6] Write tail tag (this tag ensures the atomicity, please read the following
+ * section for more details).
+ * [7] Wait for [4], [5] and [6] to complete.
+ *
+ * All the inode updates must call ext4_fc_start_update() before starting an
+ * update. If such an ongoing update is present, fast commit waits for it to
+ * complete. The completion of such an update is marked by
+ * ext4_fc_stop_update().
+ *
+ * Fast Commit Ineligibility
+ * -------------------------
+ *
+ * Not all operations are supported by fast commits today (e.g extended
+ * attributes). Fast commit ineligibility is marked by calling
+ * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back
+ * to full commit.
+ *
+ * Atomicity of commits
+ * --------------------
+ * In order to guarantee atomicity during the commit operation, fast commit
+ * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
+ * tag contains CRC of the contents and TID of the transaction after which
+ * this fast commit should be applied. Recovery code replays fast commit
+ * logs only if there's at least 1 valid tail present. For every fast commit
+ * operation, there is 1 tail. This means, we may end up with multiple tails
+ * in the fast commit space. Here's an example:
+ *
+ * - Create a new file A and remove existing file B
+ * - fsync()
+ * - Append contents to file A
+ * - Truncate file A
+ * - fsync()
+ *
+ * The fast commit space at the end of above operations would look like this:
+ * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
+ * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
+ *
+ * Replay code should thus check for all the valid tails in the FC area.
+ *
+ * Fast Commit Replay Idempotence
+ * ------------------------------
+ *
+ * Fast commits tags are idempotent in nature provided the recovery code follows
+ * certain rules. The guiding principle that the commit path follows while
+ * committing is that it stores the result of a particular operation instead of
+ * storing the procedure.
+ *
+ * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
+ * was associated with inode 10. During fast commit, instead of storing this
+ * operation as a procedure "rename a to b", we store the resulting file system
+ * state as a "series" of outcomes:
+ *
+ * - Link dirent b to inode 10
+ * - Unlink dirent a
+ * - Inode <10> with valid refcount
+ *
+ * Now when recovery code runs, it needs "enforce" this state on the file
+ * system. This is what guarantees idempotence of fast commit replay.
+ *
+ * Let's take an example of a procedure that is not idempotent and see how fast
+ * commits make it idempotent. Consider following sequence of operations:
+ *
+ * rm A; mv B A; read A
+ * (x) (y) (z)
+ *
+ * (x), (y) and (z) are the points at which we can crash. If we store this
+ * sequence of operations as is then the replay is not idempotent. Let's say
+ * while in replay, we crash at (z). During the second replay, file A (which was
+ * actually created as a result of "mv B A" operation) would get deleted. Thus,
+ * file named A would be absent when we try to read A. So, this sequence of
+ * operations is not idempotent. However, as mentioned above, instead of storing
+ * the procedure fast commits store the outcome of each procedure. Thus the fast
+ * commit log for above procedure would be as follows:
+ *
+ * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
+ * inode 11 before the replay)
+ *
+ * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
+ * (w) (x) (y) (z)
+ *
+ * If we crash at (z), we will have file A linked to inode 11. During the second
+ * replay, we will remove file A (inode 11). But we will create it back and make
+ * it point to inode 11. We won't find B, so we'll just skip that step. At this
+ * point, the refcount for inode 11 is not reliable, but that gets fixed by the
+ * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
+ * similarly. Thus, by converting a non-idempotent procedure into a series of
+ * idempotent outcomes, fast commits ensured idempotence during the replay.
+ *
+ * TODOs
+ * -----
+ *
+ * 0) Fast commit replay path hardening: Fast commit replay code should use
+ * journal handles to make sure all the updates it does during the replay
+ * path are atomic. With that if we crash during fast commit replay, after
+ * trying to do recovery again, we will find a file system where fast commit
+ * area is invalid (because new full commit would be found). In order to deal
+ * with that, fast commit replay code should ensure that the "FC_REPLAY"
+ * superblock state is persisted before starting the replay, so that after
+ * the crash, fast commit recovery code can look at that flag and perform
+ * fast commit recovery even if that area is invalidated by later full
+ * commits.
+ *
+ * 1) Fast commit's commit path locks the entire file system during fast
+ * commit. This has significant performance penalty. Instead of that, we
+ * should use ext4_fc_start/stop_update functions to start inode level
+ * updates from ext4_journal_start/stop. Once we do that we can drop file
+ * system locking during commit path.
+ *
+ * 2) Handle more ineligible cases.
+ */
+
+#include <trace/events/ext4.h>
+static struct kmem_cache *ext4_fc_dentry_cachep;
+
+static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+ BUFFER_TRACE(bh, "");
+ if (uptodate) {
+ ext4_debug("%s: Block %lld up-to-date",
+ __func__, bh->b_blocknr);
+ set_buffer_uptodate(bh);
+ } else {
+ ext4_debug("%s: Block %lld not up-to-date",
+ __func__, bh->b_blocknr);
+ clear_buffer_uptodate(bh);
+ }
+
+ unlock_buffer(bh);
+}
+
+static inline void ext4_fc_reset_inode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ei->i_fc_lblk_start = 0;
+ ei->i_fc_lblk_len = 0;
+}
+
+void ext4_fc_init_inode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ext4_fc_reset_inode(inode);
+ ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+ INIT_LIST_HEAD(&ei->i_fc_list);
+ INIT_LIST_HEAD(&ei->i_fc_dilist);
+ init_waitqueue_head(&ei->i_fc_wait);
+ atomic_set(&ei->i_fc_updates, 0);
+}
+
+/* This function must be called with sbi->s_fc_lock held. */
+static void ext4_fc_wait_committing_inode(struct inode *inode)
+__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
+{
+ wait_queue_head_t *wq;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+}
+
+static bool ext4_fc_disabled(struct super_block *sb)
+{
+ return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
+}
+
+/*
+ * Inform Ext4's fast about start of an inode update
+ *
+ * This function is called by the high level call VFS callbacks before
+ * performing any inode update. This function blocks if there's an ongoing
+ * fast commit on the inode in question.
+ */
+void ext4_fc_start_update(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+restart:
+ spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ if (list_empty(&ei->i_fc_list))
+ goto out;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+ ext4_fc_wait_committing_inode(inode);
+ goto restart;
+ }
+out:
+ atomic_inc(&ei->i_fc_updates);
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+}
+
+/*
+ * Stop inode update and wake up waiting fast commits if any.
+ */
+void ext4_fc_stop_update(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ if (atomic_dec_and_test(&ei->i_fc_updates))
+ wake_up_all(&ei->i_fc_wait);
+}
+
+/*
+ * Remove inode from fast commit list. If the inode is being committed
+ * we wait until inode commit is done.
+ */
+void ext4_fc_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_fc_dentry_update *fc_dentry;
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+restart:
+ spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ return;
+ }
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+ ext4_fc_wait_committing_inode(inode);
+ goto restart;
+ }
+
+ if (!list_empty(&ei->i_fc_list))
+ list_del_init(&ei->i_fc_list);
+
+ /*
+ * Since this inode is getting removed, let's also remove all FC
+ * dentry create references, since it is not needed to log it anyways.
+ */
+ if (list_empty(&ei->i_fc_dilist)) {
+ spin_unlock(&sbi->s_fc_lock);
+ return;
+ }
+
+ fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
+ WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
+ list_del_init(&fc_dentry->fcd_list);
+ list_del_init(&fc_dentry->fcd_dilist);
+
+ WARN_ON(!list_empty(&ei->i_fc_dilist));
+ spin_unlock(&sbi->s_fc_lock);
+
+ if (fc_dentry->fcd_name.name &&
+ fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
+ kfree(fc_dentry->fcd_name.name);
+ kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
+
+ return;
+}
+
+/*
+ * Mark file system as fast commit ineligible, and record latest
+ * ineligible transaction tid. This means until the recorded
+ * transaction, commit operation would result in a full jbd2 commit.
+ */
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ tid_t tid;
+
+ if (ext4_fc_disabled(sb))
+ return;
+
+ ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ if (handle && !IS_ERR(handle))
+ tid = handle->h_transaction->t_tid;
+ else {
+ read_lock(&sbi->s_journal->j_state_lock);
+ tid = sbi->s_journal->j_running_transaction ?
+ sbi->s_journal->j_running_transaction->t_tid : 0;
+ read_unlock(&sbi->s_journal->j_state_lock);
+ }
+ spin_lock(&sbi->s_fc_lock);
+ if (sbi->s_fc_ineligible_tid < tid)
+ sbi->s_fc_ineligible_tid = tid;
+ spin_unlock(&sbi->s_fc_lock);
+ WARN_ON(reason >= EXT4_FC_REASON_MAX);
+ sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
+}
+
+/*
+ * Generic fast commit tracking function. If this is the first time this we are
+ * called after a full commit, we initialize fast commit fields and then call
+ * __fc_track_fn() with update = 0. If we have already been called after a full
+ * commit, we pass update = 1. Based on that, the track function can determine
+ * if it needs to track a field for the first time or if it needs to just
+ * update the previously tracked value.
+ *
+ * If enqueue is set, this function enqueues the inode in fast commit list.
+ */
+static int ext4_fc_track_template(
+ handle_t *handle, struct inode *inode,
+ int (*__fc_track_fn)(struct inode *, void *, bool),
+ void *args, int enqueue)
+{
+ bool update = false;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ tid_t tid = 0;
+ int ret;
+
+ tid = handle->h_transaction->t_tid;
+ mutex_lock(&ei->i_fc_lock);
+ if (tid == ei->i_sync_tid) {
+ update = true;
+ } else {
+ ext4_fc_reset_inode(inode);
+ ei->i_sync_tid = tid;
+ }
+ ret = __fc_track_fn(inode, args, update);
+ mutex_unlock(&ei->i_fc_lock);
+
+ if (!enqueue)
+ return ret;
+
+ spin_lock(&sbi->s_fc_lock);
+ if (list_empty(&EXT4_I(inode)->i_fc_list))
+ list_add_tail(&EXT4_I(inode)->i_fc_list,
+ (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+ sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
+ &sbi->s_fc_q[FC_Q_STAGING] :
+ &sbi->s_fc_q[FC_Q_MAIN]);
+ spin_unlock(&sbi->s_fc_lock);
+
+ return ret;
+}
+
+struct __track_dentry_update_args {
+ struct dentry *dentry;
+ int op;
+};
+
+/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
+static int __track_dentry_update(struct inode *inode, void *arg, bool update)
+{
+ struct ext4_fc_dentry_update *node;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct __track_dentry_update_args *dentry_update =
+ (struct __track_dentry_update_args *)arg;
+ struct dentry *dentry = dentry_update->dentry;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ mutex_unlock(&ei->i_fc_lock);
+ node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
+ if (!node) {
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
+ mutex_lock(&ei->i_fc_lock);
+ return -ENOMEM;
+ }
+
+ node->fcd_op = dentry_update->op;
+ node->fcd_parent = dentry->d_parent->d_inode->i_ino;
+ node->fcd_ino = inode->i_ino;
+ if (dentry->d_name.len > DNAME_INLINE_LEN) {
+ node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
+ if (!node->fcd_name.name) {
+ kmem_cache_free(ext4_fc_dentry_cachep, node);
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_NOMEM, NULL);
+ mutex_lock(&ei->i_fc_lock);
+ return -ENOMEM;
+ }
+ memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
+ dentry->d_name.len);
+ } else {
+ memcpy(node->fcd_iname, dentry->d_name.name,
+ dentry->d_name.len);
+ node->fcd_name.name = node->fcd_iname;
+ }
+ node->fcd_name.len = dentry->d_name.len;
+ INIT_LIST_HEAD(&node->fcd_dilist);
+ spin_lock(&sbi->s_fc_lock);
+ if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+ sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
+ list_add_tail(&node->fcd_list,
+ &sbi->s_fc_dentry_q[FC_Q_STAGING]);
+ else
+ list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
+
+ /*
+ * This helps us keep a track of all fc_dentry updates which is part of
+ * this ext4 inode. So in case the inode is getting unlinked, before
+ * even we get a chance to fsync, we could remove all fc_dentry
+ * references while evicting the inode in ext4_fc_del().
+ * Also with this, we don't need to loop over all the inodes in
+ * sbi->s_fc_q to get the corresponding inode in
+ * ext4_fc_commit_dentry_updates().
+ */
+ if (dentry_update->op == EXT4_FC_TAG_CREAT) {
+ WARN_ON(!list_empty(&ei->i_fc_dilist));
+ list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+ mutex_lock(&ei->i_fc_lock);
+
+ return 0;
+}
+
+void __ext4_fc_track_unlink(handle_t *handle,
+ struct inode *inode, struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_UNLINK;
+
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
+}
+
+void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_unlink(handle, inode, dentry);
+}
+
+void __ext4_fc_track_link(handle_t *handle,
+ struct inode *inode, struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_LINK;
+
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_link(handle, inode, dentry, ret);
+}
+
+void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_link(handle, inode, dentry);
+}
+
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+ struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_CREAT;
+
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_create(handle, inode, dentry, ret);
+}
+
+void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_create(handle, inode, dentry);
+}
+
+/* __track_fn for inode tracking */
+static int __track_inode(struct inode *inode, void *arg, bool update)
+{
+ if (update)
+ return -EEXIST;
+
+ EXT4_I(inode)->i_fc_lblk_len = 0;
+
+ return 0;
+}
+
+void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
+{
+ int ret;
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ if (ext4_should_journal_data(inode)) {
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
+ return;
+ }
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
+ trace_ext4_fc_track_inode(handle, inode, ret);
+}
+
+struct __track_range_args {
+ ext4_lblk_t start, end;
+};
+
+/* __track_fn for tracking data updates */
+static int __track_range(struct inode *inode, void *arg, bool update)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_lblk_t oldstart;
+ struct __track_range_args *__arg =
+ (struct __track_range_args *)arg;
+
+ if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
+ ext4_debug("Special inode %ld being modified\n", inode->i_ino);
+ return -ECANCELED;
+ }
+
+ oldstart = ei->i_fc_lblk_start;
+
+ if (update && ei->i_fc_lblk_len > 0) {
+ ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
+ ei->i_fc_lblk_len =
+ max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
+ ei->i_fc_lblk_start + 1;
+ } else {
+ ei->i_fc_lblk_start = __arg->start;
+ ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
+ }
+
+ return 0;
+}
+
+void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
+{
+ struct __track_range_args args;
+ int ret;
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ args.start = start;
+ args.end = end;
+
+ ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
+
+ trace_ext4_fc_track_range(handle, inode, start, end, ret);
+}
+
+static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
+{
+ blk_opf_t write_flags = REQ_SYNC;
+ struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
+
+ /* Add REQ_FUA | REQ_PREFLUSH only its tail */
+ if (test_opt(sb, BARRIER) && is_tail)
+ write_flags |= REQ_FUA | REQ_PREFLUSH;
+ lock_buffer(bh);
+ set_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ bh->b_end_io = ext4_end_buffer_io_sync;
+ submit_bh(REQ_OP_WRITE | write_flags, bh);
+ EXT4_SB(sb)->s_fc_bh = NULL;
+}
+
+/* Ext4 commit path routines */
+
+/* memzero and update CRC */
+static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
+ u32 *crc)
+{
+ void *ret;
+
+ ret = memset(dst, 0, len);
+ if (crc)
+ *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
+ return ret;
+}
+
+/*
+ * Allocate len bytes on a fast commit buffer.
+ *
+ * During the commit time this function is used to manage fast commit
+ * block space. We don't split a fast commit log onto different
+ * blocks. So this function makes sure that if there's not enough space
+ * on the current block, the remaining space in the current block is
+ * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
+ * new block is from jbd2 and CRC is updated to reflect the padding
+ * we added.
+ */
+static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
+{
+ struct ext4_fc_tl *tl;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh;
+ int bsize = sbi->s_journal->j_blocksize;
+ int ret, off = sbi->s_fc_bytes % bsize;
+ int pad_len;
+
+ /*
+ * After allocating len, we should have space at least for a 0 byte
+ * padding.
+ */
+ if (len + EXT4_FC_TAG_BASE_LEN > bsize)
+ return NULL;
+
+ if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) {
+ /*
+ * Only allocate from current buffer if we have enough space for
+ * this request AND we have space to add a zero byte padding.
+ */
+ if (!sbi->s_fc_bh) {
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
+ }
+ sbi->s_fc_bytes += len;
+ return sbi->s_fc_bh->b_data + off;
+ }
+ /* Need to add PAD tag */
+ tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
+ tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
+ pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN;
+ tl->fc_len = cpu_to_le16(pad_len);
+ if (crc)
+ *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN);
+ if (pad_len > 0)
+ ext4_fc_memzero(sb, tl + 1, pad_len, crc);
+ ext4_fc_submit_bh(sb, false);
+
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
+ sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
+ return sbi->s_fc_bh->b_data;
+}
+
+/* memcpy to fc reserved space and update CRC */
+static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
+ int len, u32 *crc)
+{
+ if (crc)
+ *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
+ return memcpy(dst, src, len);
+}
+
+/*
+ * Complete a fast commit by writing tail tag.
+ *
+ * Writing tail tag marks the end of a fast commit. In order to guarantee
+ * atomicity, after writing tail tag, even if there's space remaining
+ * in the block, next commit shouldn't use it. That's why tail tag
+ * has the length as that of the remaining space on the block.
+ */
+static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_tl tl;
+ struct ext4_fc_tail tail;
+ int off, bsize = sbi->s_journal->j_blocksize;
+ u8 *dst;
+
+ /*
+ * ext4_fc_reserve_space takes care of allocating an extra block if
+ * there's no enough space on this block for accommodating this tail.
+ */
+ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
+ if (!dst)
+ return -ENOSPC;
+
+ off = sbi->s_fc_bytes % bsize;
+
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
+ tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
+ sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
+
+ ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc);
+ dst += EXT4_FC_TAG_BASE_LEN;
+ tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
+ ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
+ dst += sizeof(tail.fc_tid);
+ tail.fc_crc = cpu_to_le32(crc);
+ ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
+
+ ext4_fc_submit_bh(sb, true);
+
+ return 0;
+}
+
+/*
+ * Adds tag, length, value and updates CRC. Returns true if tlv was added.
+ * Returns false if there's not enough space.
+ */
+static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
+ u32 *crc)
+{
+ struct ext4_fc_tl tl;
+ u8 *dst;
+
+ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
+ if (!dst)
+ return false;
+
+ tl.fc_tag = cpu_to_le16(tag);
+ tl.fc_len = cpu_to_le16(len);
+
+ ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
+ ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc);
+
+ return true;
+}
+
+/* Same as above, but adds dentry tlv. */
+static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
+ struct ext4_fc_dentry_update *fc_dentry)
+{
+ struct ext4_fc_dentry_info fcd;
+ struct ext4_fc_tl tl;
+ int dlen = fc_dentry->fcd_name.len;
+ u8 *dst = ext4_fc_reserve_space(sb,
+ EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
+
+ if (!dst)
+ return false;
+
+ fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
+ fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
+ tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
+ tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
+ ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
+ dst += EXT4_FC_TAG_BASE_LEN;
+ ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
+ dst += sizeof(fcd);
+ ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
+
+ return true;
+}
+
+/*
+ * Writes inode in the fast commit space under TLV with tag @tag.
+ * Returns 0 on success, error on failure.
+ */
+static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
+ int ret;
+ struct ext4_iloc iloc;
+ struct ext4_fc_inode fc_inode;
+ struct ext4_fc_tl tl;
+ u8 *dst;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ inode_len = EXT4_INODE_SIZE(inode->i_sb);
+ else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
+ inode_len += ei->i_extra_isize;
+
+ fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
+ tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
+
+ ret = -ECANCELED;
+ dst = ext4_fc_reserve_space(inode->i_sb,
+ EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
+ if (!dst)
+ goto err;
+
+ if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc))
+ goto err;
+ dst += EXT4_FC_TAG_BASE_LEN;
+ if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
+ goto err;
+ dst += sizeof(fc_inode);
+ if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
+ inode_len, crc))
+ goto err;
+ ret = 0;
+err:
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * Writes updated data ranges for the inode in question. Updates CRC.
+ * Returns 0 on success, error otherwise.
+ */
+static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
+{
+ ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_map_blocks map;
+ struct ext4_fc_add_range fc_ext;
+ struct ext4_fc_del_range lrange;
+ struct ext4_extent *ex;
+ int ret;
+
+ mutex_lock(&ei->i_fc_lock);
+ if (ei->i_fc_lblk_len == 0) {
+ mutex_unlock(&ei->i_fc_lock);
+ return 0;
+ }
+ old_blk_size = ei->i_fc_lblk_start;
+ new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
+ ei->i_fc_lblk_len = 0;
+ mutex_unlock(&ei->i_fc_lock);
+
+ cur_lblk_off = old_blk_size;
+ ext4_debug("will try writing %d to %d for inode %ld\n",
+ cur_lblk_off, new_blk_size, inode->i_ino);
+
+ while (cur_lblk_off <= new_blk_size) {
+ map.m_lblk = cur_lblk_off;
+ map.m_len = new_blk_size - cur_lblk_off + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return -ECANCELED;
+
+ if (map.m_len == 0) {
+ cur_lblk_off++;
+ continue;
+ }
+
+ if (ret == 0) {
+ lrange.fc_ino = cpu_to_le32(inode->i_ino);
+ lrange.fc_lblk = cpu_to_le32(map.m_lblk);
+ lrange.fc_len = cpu_to_le32(map.m_len);
+ if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
+ sizeof(lrange), (u8 *)&lrange, crc))
+ return -ENOSPC;
+ } else {
+ unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
+ EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
+
+ /* Limit the number of blocks in one extent */
+ map.m_len = min(max, map.m_len);
+
+ fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
+ ex = (struct ext4_extent *)&fc_ext.fc_ex;
+ ex->ee_block = cpu_to_le32(map.m_lblk);
+ ex->ee_len = cpu_to_le16(map.m_len);
+ ext4_ext_store_pblock(ex, map.m_pblk);
+ if (map.m_flags & EXT4_MAP_UNWRITTEN)
+ ext4_ext_mark_unwritten(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+ if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
+ sizeof(fc_ext), (u8 *)&fc_ext, crc))
+ return -ENOSPC;
+ }
+
+ cur_lblk_off += map.m_len;
+ }
+
+ return 0;
+}
+
+
+/* Submit data for all the fast commit inodes */
+static int ext4_fc_submit_inode_data_all(journal_t *journal)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *ei;
+ int ret = 0;
+
+ spin_lock(&sbi->s_fc_lock);
+ list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
+ while (atomic_read(&ei->i_fc_updates)) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&ei->i_fc_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&ei->i_fc_updates)) {
+ spin_unlock(&sbi->s_fc_lock);
+ schedule();
+ spin_lock(&sbi->s_fc_lock);
+ }
+ finish_wait(&ei->i_fc_wait, &wait);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+ ret = jbd2_submit_inode_data(ei->jinode);
+ if (ret)
+ return ret;
+ spin_lock(&sbi->s_fc_lock);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+
+ return ret;
+}
+
+/* Wait for completion of data for all the fast commit inodes */
+static int ext4_fc_wait_inode_data_all(journal_t *journal)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *pos, *n;
+ int ret = 0;
+
+ spin_lock(&sbi->s_fc_lock);
+ list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ if (!ext4_test_inode_state(&pos->vfs_inode,
+ EXT4_STATE_FC_COMMITTING))
+ continue;
+ spin_unlock(&sbi->s_fc_lock);
+
+ ret = jbd2_wait_inode_data(journal, pos->jinode);
+ if (ret)
+ return ret;
+ spin_lock(&sbi->s_fc_lock);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+
+ return 0;
+}
+
+/* Commit all the directory entry updates */
+static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
+__acquires(&sbi->s_fc_lock)
+__releases(&sbi->s_fc_lock)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
+ struct inode *inode;
+ struct ext4_inode_info *ei;
+ int ret;
+
+ if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
+ return 0;
+ list_for_each_entry_safe(fc_dentry, fc_dentry_n,
+ &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+ if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
+ spin_unlock(&sbi->s_fc_lock);
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
+ ret = -ENOSPC;
+ goto lock_and_exit;
+ }
+ spin_lock(&sbi->s_fc_lock);
+ continue;
+ }
+ /*
+ * With fcd_dilist we need not loop in sbi->s_fc_q to get the
+ * corresponding inode pointer
+ */
+ WARN_ON(list_empty(&fc_dentry->fcd_dilist));
+ ei = list_first_entry(&fc_dentry->fcd_dilist,
+ struct ext4_inode_info, i_fc_dilist);
+ inode = &ei->vfs_inode;
+ WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
+
+ spin_unlock(&sbi->s_fc_lock);
+
+ /*
+ * We first write the inode and then the create dirent. This
+ * allows the recovery code to create an unnamed inode first
+ * and then link it to a directory entry. This allows us
+ * to use namei.c routines almost as is and simplifies
+ * the recovery code.
+ */
+ ret = ext4_fc_write_inode(inode, crc);
+ if (ret)
+ goto lock_and_exit;
+
+ ret = ext4_fc_write_inode_data(inode, crc);
+ if (ret)
+ goto lock_and_exit;
+
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
+ ret = -ENOSPC;
+ goto lock_and_exit;
+ }
+
+ spin_lock(&sbi->s_fc_lock);
+ }
+ return 0;
+lock_and_exit:
+ spin_lock(&sbi->s_fc_lock);
+ return ret;
+}
+
+static int ext4_fc_perform_commit(journal_t *journal)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *iter;
+ struct ext4_fc_head head;
+ struct inode *inode;
+ struct blk_plug plug;
+ int ret = 0;
+ u32 crc = 0;
+
+ ret = ext4_fc_submit_inode_data_all(journal);
+ if (ret)
+ return ret;
+
+ ret = ext4_fc_wait_inode_data_all(journal);
+ if (ret)
+ return ret;
+
+ /*
+ * If file system device is different from journal device, issue a cache
+ * flush before we start writing fast commit blocks.
+ */
+ if (journal->j_fs_dev != journal->j_dev)
+ blkdev_issue_flush(journal->j_fs_dev);
+
+ blk_start_plug(&plug);
+ if (sbi->s_fc_bytes == 0) {
+ /*
+ * Add a head tag only if this is the first fast commit
+ * in this TID.
+ */
+ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
+ head.fc_tid = cpu_to_le32(
+ sbi->s_journal->j_running_transaction->t_tid);
+ if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
+ (u8 *)&head, &crc)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ }
+
+ spin_lock(&sbi->s_fc_lock);
+ ret = ext4_fc_commit_dentry_updates(journal, &crc);
+ if (ret) {
+ spin_unlock(&sbi->s_fc_lock);
+ goto out;
+ }
+
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ inode = &iter->vfs_inode;
+ if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ continue;
+
+ spin_unlock(&sbi->s_fc_lock);
+ ret = ext4_fc_write_inode_data(inode, &crc);
+ if (ret)
+ goto out;
+ ret = ext4_fc_write_inode(inode, &crc);
+ if (ret)
+ goto out;
+ spin_lock(&sbi->s_fc_lock);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+
+ ret = ext4_fc_write_tail(sb, crc);
+
+out:
+ blk_finish_plug(&plug);
+ return ret;
+}
+
+static void ext4_fc_update_stats(struct super_block *sb, int status,
+ u64 commit_time, int nblks, tid_t commit_tid)
+{
+ struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
+
+ ext4_debug("Fast commit ended with status = %d for tid %u",
+ status, commit_tid);
+ if (status == EXT4_FC_STATUS_OK) {
+ stats->fc_num_commits++;
+ stats->fc_numblks += nblks;
+ if (likely(stats->s_fc_avg_commit_time))
+ stats->s_fc_avg_commit_time =
+ (commit_time +
+ stats->s_fc_avg_commit_time * 3) / 4;
+ else
+ stats->s_fc_avg_commit_time = commit_time;
+ } else if (status == EXT4_FC_STATUS_FAILED ||
+ status == EXT4_FC_STATUS_INELIGIBLE) {
+ if (status == EXT4_FC_STATUS_FAILED)
+ stats->fc_failed_commits++;
+ stats->fc_ineligible_commits++;
+ } else {
+ stats->fc_skipped_commits++;
+ }
+ trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
+}
+
+/*
+ * The main commit entry point. Performs a fast commit for transaction
+ * commit_tid if needed. If it's not possible to perform a fast commit
+ * due to various reasons, we fall back to full commit. Returns 0
+ * on success, error otherwise.
+ */
+int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int nblks = 0, ret, bsize = journal->j_blocksize;
+ int subtid = atomic_read(&sbi->s_fc_subtid);
+ int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
+ ktime_t start_time, commit_time;
+
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
+ return jbd2_complete_transaction(journal, commit_tid);
+
+ trace_ext4_fc_commit_start(sb, commit_tid);
+
+ start_time = ktime_get();
+
+restart_fc:
+ ret = jbd2_fc_begin_commit(journal, commit_tid);
+ if (ret == -EALREADY) {
+ /* There was an ongoing commit, check if we need to restart */
+ if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
+ commit_tid > journal->j_commit_sequence)
+ goto restart_fc;
+ ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
+ commit_tid);
+ return 0;
+ } else if (ret) {
+ /*
+ * Commit couldn't start. Just update stats and perform a
+ * full commit.
+ */
+ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
+ commit_tid);
+ return jbd2_complete_transaction(journal, commit_tid);
+ }
+
+ /*
+ * After establishing journal barrier via jbd2_fc_begin_commit(), check
+ * if we are fast commit ineligible.
+ */
+ if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
+ status = EXT4_FC_STATUS_INELIGIBLE;
+ goto fallback;
+ }
+
+ fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
+ ret = ext4_fc_perform_commit(journal);
+ if (ret < 0) {
+ status = EXT4_FC_STATUS_FAILED;
+ goto fallback;
+ }
+ nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
+ ret = jbd2_fc_wait_bufs(journal, nblks);
+ if (ret < 0) {
+ status = EXT4_FC_STATUS_FAILED;
+ goto fallback;
+ }
+ atomic_inc(&sbi->s_fc_subtid);
+ ret = jbd2_fc_end_commit(journal);
+ /*
+ * weight the commit time higher than the average time so we
+ * don't react too strongly to vast changes in the commit time
+ */
+ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+ ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
+ return ret;
+
+fallback:
+ ret = jbd2_fc_end_commit_fallback(journal);
+ ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
+ return ret;
+}
+
+/*
+ * Fast commit cleanup routine. This is called after every fast commit and
+ * full commit. full is true if we are called after a full commit.
+ */
+static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *iter, *iter_n;
+ struct ext4_fc_dentry_update *fc_dentry;
+
+ if (full && sbi->s_fc_bh)
+ sbi->s_fc_bh = NULL;
+
+ trace_ext4_fc_cleanup(journal, full, tid);
+ jbd2_fc_release_bufs(journal);
+
+ spin_lock(&sbi->s_fc_lock);
+ list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
+ i_fc_list) {
+ list_del_init(&iter->i_fc_list);
+ ext4_clear_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ if (iter->i_sync_tid <= tid)
+ ext4_fc_reset_inode(&iter->vfs_inode);
+ /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+ smp_mb();
+#if (BITS_PER_LONG < 64)
+ wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
+#else
+ wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
+#endif
+ }
+
+ while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
+ fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
+ struct ext4_fc_dentry_update,
+ fcd_list);
+ list_del_init(&fc_dentry->fcd_list);
+ list_del_init(&fc_dentry->fcd_dilist);
+ spin_unlock(&sbi->s_fc_lock);
+
+ if (fc_dentry->fcd_name.name &&
+ fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
+ kfree(fc_dentry->fcd_name.name);
+ kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
+ spin_lock(&sbi->s_fc_lock);
+ }
+
+ list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
+ &sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
+ &sbi->s_fc_q[FC_Q_MAIN]);
+
+ if (tid >= sbi->s_fc_ineligible_tid) {
+ sbi->s_fc_ineligible_tid = 0;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ }
+
+ if (full)
+ sbi->s_fc_bytes = 0;
+ spin_unlock(&sbi->s_fc_lock);
+ trace_ext4_fc_stats(sb);
+}
+
+/* Ext4 Replay Path Routines */
+
+/* Helper struct for dentry replay routines */
+struct dentry_info_args {
+ int parent_ino, dname_len, ino, inode_len;
+ char *dname;
+};
+
+static inline void tl_to_darg(struct dentry_info_args *darg,
+ struct ext4_fc_tl *tl, u8 *val)
+{
+ struct ext4_fc_dentry_info fcd;
+
+ memcpy(&fcd, val, sizeof(fcd));
+
+ darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
+ darg->ino = le32_to_cpu(fcd.fc_ino);
+ darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
+ darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
+}
+
+static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val)
+{
+ memcpy(tl, val, EXT4_FC_TAG_BASE_LEN);
+ tl->fc_len = le16_to_cpu(tl->fc_len);
+ tl->fc_tag = le16_to_cpu(tl->fc_tag);
+}
+
+/* Unlink replay function */
+static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
+{
+ struct inode *inode, *old_parent;
+ struct qstr entry;
+ struct dentry_info_args darg;
+ int ret = 0;
+
+ tl_to_darg(&darg, tl, val);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ entry.name = darg.dname;
+ entry.len = darg.dname_len;
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode %d not found", darg.ino);
+ return 0;
+ }
+
+ old_parent = ext4_iget(sb, darg.parent_ino,
+ EXT4_IGET_NORMAL);
+ if (IS_ERR(old_parent)) {
+ ext4_debug("Dir with inode %d not found", darg.parent_ino);
+ iput(inode);
+ return 0;
+ }
+
+ ret = __ext4_unlink(NULL, old_parent, &entry, inode);
+ /* -ENOENT ok coz it might not exist anymore. */
+ if (ret == -ENOENT)
+ ret = 0;
+ iput(old_parent);
+ iput(inode);
+ return ret;
+}
+
+static int ext4_fc_replay_link_internal(struct super_block *sb,
+ struct dentry_info_args *darg,
+ struct inode *inode)
+{
+ struct inode *dir = NULL;
+ struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
+ struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
+ int ret = 0;
+
+ dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(dir)) {
+ ext4_debug("Dir with inode %d not found.", darg->parent_ino);
+ dir = NULL;
+ goto out;
+ }
+
+ dentry_dir = d_obtain_alias(dir);
+ if (IS_ERR(dentry_dir)) {
+ ext4_debug("Failed to obtain dentry");
+ dentry_dir = NULL;
+ goto out;
+ }
+
+ dentry_inode = d_alloc(dentry_dir, &qstr_dname);
+ if (!dentry_inode) {
+ ext4_debug("Inode dentry not created.");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = __ext4_link(dir, inode, dentry_inode);
+ /*
+ * It's possible that link already existed since data blocks
+ * for the dir in question got persisted before we crashed OR
+ * we replayed this tag and crashed before the entire replay
+ * could complete.
+ */
+ if (ret && ret != -EEXIST) {
+ ext4_debug("Failed to link\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (dentry_dir) {
+ d_drop(dentry_dir);
+ dput(dentry_dir);
+ } else if (dir) {
+ iput(dir);
+ }
+ if (dentry_inode) {
+ d_drop(dentry_inode);
+ dput(dentry_inode);
+ }
+
+ return ret;
+}
+
+/* Link replay function */
+static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
+{
+ struct inode *inode;
+ struct dentry_info_args darg;
+ int ret = 0;
+
+ tl_to_darg(&darg, tl, val);
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode not found.");
+ return 0;
+ }
+
+ ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Record all the modified inodes during replay. We use this later to setup
+ * block bitmaps correctly.
+ */
+static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
+{
+ struct ext4_fc_replay_state *state;
+ int i;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_modified_inodes_used; i++)
+ if (state->fc_modified_inodes[i] == ino)
+ return 0;
+ if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
+ int *fc_modified_inodes;
+
+ fc_modified_inodes = krealloc(state->fc_modified_inodes,
+ sizeof(int) * (state->fc_modified_inodes_size +
+ EXT4_FC_REPLAY_REALLOC_INCREMENT),
+ GFP_KERNEL);
+ if (!fc_modified_inodes)
+ return -ENOMEM;
+ state->fc_modified_inodes = fc_modified_inodes;
+ state->fc_modified_inodes_size +=
+ EXT4_FC_REPLAY_REALLOC_INCREMENT;
+ }
+ state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
+ return 0;
+}
+
+/*
+ * Inode replay function
+ */
+static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
+{
+ struct ext4_fc_inode fc_inode;
+ struct ext4_inode *raw_inode;
+ struct ext4_inode *raw_fc_inode;
+ struct inode *inode = NULL;
+ struct ext4_iloc iloc;
+ int inode_len, ino, ret, tag = tl->fc_tag;
+ struct ext4_extent_header *eh;
+ size_t off_gen = offsetof(struct ext4_inode, i_generation);
+
+ memcpy(&fc_inode, val, sizeof(fc_inode));
+
+ ino = le32_to_cpu(fc_inode.fc_ino);
+ trace_ext4_fc_replay(sb, tag, ino, 0, 0);
+
+ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+ if (!IS_ERR(inode)) {
+ ext4_ext_clear_bb(inode);
+ iput(inode);
+ }
+ inode = NULL;
+
+ ret = ext4_fc_record_modified_inode(sb, ino);
+ if (ret)
+ goto out;
+
+ raw_fc_inode = (struct ext4_inode *)
+ (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
+ ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
+ if (ret)
+ goto out;
+
+ inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
+ raw_inode = ext4_raw_inode(&iloc);
+
+ memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
+ memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
+ inode_len - off_gen);
+ if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
+ eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
+ if (eh->eh_magic != EXT4_EXT_MAGIC) {
+ memset(eh, 0, sizeof(*eh));
+ eh->eh_magic = EXT4_EXT_MAGIC;
+ eh->eh_max = cpu_to_le16(
+ (sizeof(raw_inode->i_block) -
+ sizeof(struct ext4_extent_header))
+ / sizeof(struct ext4_extent));
+ }
+ } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
+ memcpy(raw_inode->i_block, raw_fc_inode->i_block,
+ sizeof(raw_inode->i_block));
+ }
+
+ /* Immediately update the inode on disk. */
+ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+ if (ret)
+ goto out;
+ ret = sync_dirty_buffer(iloc.bh);
+ if (ret)
+ goto out;
+ ret = ext4_mark_inode_used(sb, ino);
+ if (ret)
+ goto out;
+
+ /* Given that we just wrote the inode on disk, this SHOULD succeed. */
+ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode not found.");
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Our allocator could have made different decisions than before
+ * crashing. This should be fixed but until then, we calculate
+ * the number of blocks the inode.
+ */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ ext4_ext_replay_set_iblocks(inode);
+
+ inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
+ ext4_reset_inode_seed(inode);
+
+ ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
+ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+ sync_dirty_buffer(iloc.bh);
+ brelse(iloc.bh);
+out:
+ iput(inode);
+ if (!ret)
+ blkdev_issue_flush(sb->s_bdev);
+
+ return 0;
+}
+
+/*
+ * Dentry create replay function.
+ *
+ * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
+ * inode for which we are trying to create a dentry here, should already have
+ * been replayed before we start here.
+ */
+static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
+{
+ int ret = 0;
+ struct inode *inode = NULL;
+ struct inode *dir = NULL;
+ struct dentry_info_args darg;
+
+ tl_to_darg(&darg, tl, val);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ /* This takes care of update group descriptor and other metadata */
+ ret = ext4_mark_inode_used(sb, darg.ino);
+ if (ret)
+ goto out;
+
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("inode %d not found.", darg.ino);
+ inode = NULL;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (S_ISDIR(inode->i_mode)) {
+ /*
+ * If we are creating a directory, we need to make sure that the
+ * dot and dot dot dirents are setup properly.
+ */
+ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(dir)) {
+ ext4_debug("Dir %d not found.", darg.ino);
+ goto out;
+ }
+ ret = ext4_init_new_dir(NULL, dir, inode);
+ iput(dir);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ }
+ ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+ if (ret)
+ goto out;
+ set_nlink(inode, 1);
+ ext4_mark_inode_dirty(NULL, inode);
+out:
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Record physical disk regions which are in use as per fast commit area,
+ * and used by inodes during replay phase. Our simple replay phase
+ * allocator excludes these regions from allocation.
+ */
+int ext4_fc_record_regions(struct super_block *sb, int ino,
+ ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
+{
+ struct ext4_fc_replay_state *state;
+ struct ext4_fc_alloc_region *region;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ /*
+ * during replay phase, the fc_regions_valid may not same as
+ * fc_regions_used, update it when do new additions.
+ */
+ if (replay && state->fc_regions_used != state->fc_regions_valid)
+ state->fc_regions_used = state->fc_regions_valid;
+ if (state->fc_regions_used == state->fc_regions_size) {
+ struct ext4_fc_alloc_region *fc_regions;
+
+ fc_regions = krealloc(state->fc_regions,
+ sizeof(struct ext4_fc_alloc_region) *
+ (state->fc_regions_size +
+ EXT4_FC_REPLAY_REALLOC_INCREMENT),
+ GFP_KERNEL);
+ if (!fc_regions)
+ return -ENOMEM;
+ state->fc_regions_size +=
+ EXT4_FC_REPLAY_REALLOC_INCREMENT;
+ state->fc_regions = fc_regions;
+ }
+ region = &state->fc_regions[state->fc_regions_used++];
+ region->ino = ino;
+ region->lblk = lblk;
+ region->pblk = pblk;
+ region->len = len;
+
+ if (replay)
+ state->fc_regions_valid++;
+
+ return 0;
+}
+
+/* Replay add range tag */
+static int ext4_fc_replay_add_range(struct super_block *sb,
+ struct ext4_fc_tl *tl, u8 *val)
+{
+ struct ext4_fc_add_range fc_add_ex;
+ struct ext4_extent newex, *ex;
+ struct inode *inode;
+ ext4_lblk_t start, cur;
+ int remaining, len;
+ ext4_fsblk_t start_pblk;
+ struct ext4_map_blocks map;
+ struct ext4_ext_path *path = NULL;
+ int ret;
+
+ memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
+ ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
+ le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
+ ext4_ext_get_actual_len(ex));
+
+ inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode not found.");
+ return 0;
+ }
+
+ ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+ if (ret)
+ goto out;
+
+ start = le32_to_cpu(ex->ee_block);
+ start_pblk = ext4_ext_pblock(ex);
+ len = ext4_ext_get_actual_len(ex);
+
+ cur = start;
+ remaining = len;
+ ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
+ start, start_pblk, len, ext4_ext_is_unwritten(ex),
+ inode->i_ino);
+
+ while (remaining > 0) {
+ map.m_lblk = cur;
+ map.m_len = remaining;
+ map.m_pblk = 0;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ /* Range is not mapped */
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ goto out;
+ memset(&newex, 0, sizeof(newex));
+ newex.ee_block = cpu_to_le32(cur);
+ ext4_ext_store_pblock(
+ &newex, start_pblk + cur - start);
+ newex.ee_len = cpu_to_le16(map.m_len);
+ if (ext4_ext_is_unwritten(ex))
+ ext4_ext_mark_unwritten(&newex);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_insert_extent(
+ NULL, inode, &path, &newex, 0);
+ up_write((&EXT4_I(inode)->i_data_sem));
+ ext4_free_ext_path(path);
+ if (ret)
+ goto out;
+ goto next;
+ }
+
+ if (start_pblk + cur - start != map.m_pblk) {
+ /*
+ * Logical to physical mapping changed. This can happen
+ * if this range was removed and then reallocated to
+ * map to new physical blocks during a fast commit.
+ */
+ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+ ext4_ext_is_unwritten(ex),
+ start_pblk + cur - start);
+ if (ret)
+ goto out;
+ /*
+ * Mark the old blocks as free since they aren't used
+ * anymore. We maintain an array of all the modified
+ * inodes. In case these blocks are still used at either
+ * a different logical range in the same inode or in
+ * some different inode, we will mark them as allocated
+ * at the end of the FC replay using our array of
+ * modified inodes.
+ */
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ goto next;
+ }
+
+ /* Range is mapped and needs a state change */
+ ext4_debug("Converting from %ld to %d %lld",
+ map.m_flags & EXT4_MAP_UNWRITTEN,
+ ext4_ext_is_unwritten(ex), map.m_pblk);
+ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+ ext4_ext_is_unwritten(ex), map.m_pblk);
+ if (ret)
+ goto out;
+ /*
+ * We may have split the extent tree while toggling the state.
+ * Try to shrink the extent tree now.
+ */
+ ext4_ext_replay_shrink_inode(inode, start + len);
+next:
+ cur += map.m_len;
+ remaining -= map.m_len;
+ }
+ ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
+ sb->s_blocksize_bits);
+out:
+ iput(inode);
+ return 0;
+}
+
+/* Replay DEL_RANGE tag */
+static int
+ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
+{
+ struct inode *inode;
+ struct ext4_fc_del_range lrange;
+ struct ext4_map_blocks map;
+ ext4_lblk_t cur, remaining;
+ int ret;
+
+ memcpy(&lrange, val, sizeof(lrange));
+ cur = le32_to_cpu(lrange.fc_lblk);
+ remaining = le32_to_cpu(lrange.fc_len);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
+ le32_to_cpu(lrange.fc_ino), cur, remaining);
+
+ inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
+ return 0;
+ }
+
+ ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+ if (ret)
+ goto out;
+
+ ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
+ inode->i_ino, le32_to_cpu(lrange.fc_lblk),
+ le32_to_cpu(lrange.fc_len));
+ while (remaining > 0) {
+ map.m_lblk = cur;
+ map.m_len = remaining;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ remaining -= ret;
+ cur += ret;
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ } else {
+ remaining -= map.m_len;
+ cur += map.m_len;
+ }
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
+ le32_to_cpu(lrange.fc_lblk) +
+ le32_to_cpu(lrange.fc_len) - 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out;
+ ext4_ext_replay_shrink_inode(inode,
+ i_size_read(inode) >> sb->s_blocksize_bits);
+ ext4_mark_inode_dirty(NULL, inode);
+out:
+ iput(inode);
+ return 0;
+}
+
+static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
+{
+ struct ext4_fc_replay_state *state;
+ struct inode *inode;
+ struct ext4_ext_path *path = NULL;
+ struct ext4_map_blocks map;
+ int i, ret, j;
+ ext4_lblk_t cur, end;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_modified_inodes_used; i++) {
+ inode = ext4_iget(sb, state->fc_modified_inodes[i],
+ EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ ext4_debug("Inode %d not found.",
+ state->fc_modified_inodes[i]);
+ continue;
+ }
+ cur = 0;
+ end = EXT_MAX_BLOCKS;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
+ iput(inode);
+ continue;
+ }
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+
+ if (ret > 0) {
+ path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
+ if (!IS_ERR(path)) {
+ for (j = 0; j < path->p_depth; j++)
+ ext4_mb_mark_bb(inode->i_sb,
+ path[j].p_block, 1, 1);
+ ext4_free_ext_path(path);
+ }
+ cur += ret;
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
+ map.m_len, 1);
+ } else {
+ cur = cur + (map.m_len ? map.m_len : 1);
+ }
+ }
+ iput(inode);
+ }
+}
+
+/*
+ * Check if block is in excluded regions for block allocation. The simple
+ * allocator that runs during replay phase is calls this function to see
+ * if it is okay to use a block.
+ */
+bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
+{
+ int i;
+ struct ext4_fc_replay_state *state;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_regions_valid; i++) {
+ if (state->fc_regions[i].ino == 0 ||
+ state->fc_regions[i].len == 0)
+ continue;
+ if (in_range(blk, state->fc_regions[i].pblk,
+ state->fc_regions[i].len))
+ return true;
+ }
+ return false;
+}
+
+/* Cleanup function called after replay */
+void ext4_fc_replay_cleanup(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ sbi->s_mount_state &= ~EXT4_FC_REPLAY;
+ kfree(sbi->s_fc_replay_state.fc_regions);
+ kfree(sbi->s_fc_replay_state.fc_modified_inodes);
+}
+
+static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl,
+ u8 *val, u8 *end)
+{
+ if (val + tl->fc_len > end)
+ return false;
+
+ /* Here only check ADD_RANGE/TAIL/HEAD which will read data when do
+ * journal rescan before do CRC check. Other tags length check will
+ * rely on CRC check.
+ */
+ switch (tl->fc_tag) {
+ case EXT4_FC_TAG_ADD_RANGE:
+ return (sizeof(struct ext4_fc_add_range) == tl->fc_len);
+ case EXT4_FC_TAG_TAIL:
+ return (sizeof(struct ext4_fc_tail) <= tl->fc_len);
+ case EXT4_FC_TAG_HEAD:
+ return (sizeof(struct ext4_fc_head) == tl->fc_len);
+ case EXT4_FC_TAG_DEL_RANGE:
+ case EXT4_FC_TAG_LINK:
+ case EXT4_FC_TAG_UNLINK:
+ case EXT4_FC_TAG_CREAT:
+ case EXT4_FC_TAG_INODE:
+ case EXT4_FC_TAG_PAD:
+ default:
+ return true;
+ }
+}
+
+/*
+ * Recovery Scan phase handler
+ *
+ * This function is called during the scan phase and is responsible
+ * for doing following things:
+ * - Make sure the fast commit area has valid tags for replay
+ * - Count number of tags that need to be replayed by the replay handler
+ * - Verify CRC
+ * - Create a list of excluded blocks for allocation during replay phase
+ *
+ * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
+ * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
+ * to indicate that scan has finished and JBD2 can now start replay phase.
+ * It returns a negative error to indicate that there was an error. At the end
+ * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
+ * to indicate the number of tags that need to replayed during the replay phase.
+ */
+static int ext4_fc_replay_scan(journal_t *journal,
+ struct buffer_head *bh, int off,
+ tid_t expected_tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_replay_state *state;
+ int ret = JBD2_FC_REPLAY_CONTINUE;
+ struct ext4_fc_add_range ext;
+ struct ext4_fc_tl tl;
+ struct ext4_fc_tail tail;
+ __u8 *start, *end, *cur, *val;
+ struct ext4_fc_head head;
+ struct ext4_extent *ex;
+
+ state = &sbi->s_fc_replay_state;
+
+ start = (u8 *)bh->b_data;
+ end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+
+ if (state->fc_replay_expected_off == 0) {
+ state->fc_cur_tag = 0;
+ state->fc_replay_num_tags = 0;
+ state->fc_crc = 0;
+ state->fc_regions = NULL;
+ state->fc_regions_valid = state->fc_regions_used =
+ state->fc_regions_size = 0;
+ /* Check if we can stop early */
+ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
+ != EXT4_FC_TAG_HEAD)
+ return 0;
+ }
+
+ if (off != state->fc_replay_expected_off) {
+ ret = -EFSCORRUPTED;
+ goto out_err;
+ }
+
+ state->fc_replay_expected_off++;
+ for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
+ ext4_fc_get_tl(&tl, cur);
+ val = cur + EXT4_FC_TAG_BASE_LEN;
+ if (!ext4_fc_tag_len_isvalid(&tl, val, end)) {
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -ECANCELED;
+ goto out_err;
+ }
+ ext4_debug("Scan phase, tag:%s, blk %lld\n",
+ tag2str(tl.fc_tag), bh->b_blocknr);
+ switch (tl.fc_tag) {
+ case EXT4_FC_TAG_ADD_RANGE:
+ memcpy(&ext, val, sizeof(ext));
+ ex = (struct ext4_extent *)&ext.fc_ex;
+ ret = ext4_fc_record_regions(sb,
+ le32_to_cpu(ext.fc_ino),
+ le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
+ ext4_ext_get_actual_len(ex), 0);
+ if (ret < 0)
+ break;
+ ret = JBD2_FC_REPLAY_CONTINUE;
+ fallthrough;
+ case EXT4_FC_TAG_DEL_RANGE:
+ case EXT4_FC_TAG_LINK:
+ case EXT4_FC_TAG_UNLINK:
+ case EXT4_FC_TAG_CREAT:
+ case EXT4_FC_TAG_INODE:
+ case EXT4_FC_TAG_PAD:
+ state->fc_cur_tag++;
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ EXT4_FC_TAG_BASE_LEN + tl.fc_len);
+ break;
+ case EXT4_FC_TAG_TAIL:
+ state->fc_cur_tag++;
+ memcpy(&tail, val, sizeof(tail));
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ EXT4_FC_TAG_BASE_LEN +
+ offsetof(struct ext4_fc_tail,
+ fc_crc));
+ if (le32_to_cpu(tail.fc_tid) == expected_tid &&
+ le32_to_cpu(tail.fc_crc) == state->fc_crc) {
+ state->fc_replay_num_tags = state->fc_cur_tag;
+ state->fc_regions_valid =
+ state->fc_regions_used;
+ } else {
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -EFSBADCRC;
+ }
+ state->fc_crc = 0;
+ break;
+ case EXT4_FC_TAG_HEAD:
+ memcpy(&head, val, sizeof(head));
+ if (le32_to_cpu(head.fc_features) &
+ ~EXT4_FC_SUPPORTED_FEATURES) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ if (le32_to_cpu(head.fc_tid) != expected_tid) {
+ ret = JBD2_FC_REPLAY_STOP;
+ break;
+ }
+ state->fc_cur_tag++;
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ EXT4_FC_TAG_BASE_LEN + tl.fc_len);
+ break;
+ default:
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -ECANCELED;
+ }
+ if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
+ break;
+ }
+
+out_err:
+ trace_ext4_fc_replay_scan(sb, ret, off);
+ return ret;
+}
+
+/*
+ * Main recovery path entry point.
+ * The meaning of return codes is similar as above.
+ */
+static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
+ enum passtype pass, int off, tid_t expected_tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_tl tl;
+ __u8 *start, *end, *cur, *val;
+ int ret = JBD2_FC_REPLAY_CONTINUE;
+ struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
+ struct ext4_fc_tail tail;
+
+ if (pass == PASS_SCAN) {
+ state->fc_current_pass = PASS_SCAN;
+ return ext4_fc_replay_scan(journal, bh, off, expected_tid);
+ }
+
+ if (state->fc_current_pass != pass) {
+ state->fc_current_pass = pass;
+ sbi->s_mount_state |= EXT4_FC_REPLAY;
+ }
+ if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
+ ext4_debug("Replay stops\n");
+ ext4_fc_set_bitmaps_and_counters(sb);
+ return 0;
+ }
+
+#ifdef CONFIG_EXT4_DEBUG
+ if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
+ pr_warn("Dropping fc block %d because max_replay set\n", off);
+ return JBD2_FC_REPLAY_STOP;
+ }
+#endif
+
+ start = (u8 *)bh->b_data;
+ end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+
+ for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
+ ext4_fc_get_tl(&tl, cur);
+ val = cur + EXT4_FC_TAG_BASE_LEN;
+
+ if (state->fc_replay_num_tags == 0) {
+ ret = JBD2_FC_REPLAY_STOP;
+ ext4_fc_set_bitmaps_and_counters(sb);
+ break;
+ }
+
+ ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
+ state->fc_replay_num_tags--;
+ switch (tl.fc_tag) {
+ case EXT4_FC_TAG_LINK:
+ ret = ext4_fc_replay_link(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_UNLINK:
+ ret = ext4_fc_replay_unlink(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_ADD_RANGE:
+ ret = ext4_fc_replay_add_range(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_CREAT:
+ ret = ext4_fc_replay_create(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_DEL_RANGE:
+ ret = ext4_fc_replay_del_range(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_INODE:
+ ret = ext4_fc_replay_inode(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_PAD:
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
+ tl.fc_len, 0);
+ break;
+ case EXT4_FC_TAG_TAIL:
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
+ 0, tl.fc_len, 0);
+ memcpy(&tail, val, sizeof(tail));
+ WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
+ break;
+ case EXT4_FC_TAG_HEAD:
+ break;
+ default:
+ trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
+ ret = -ECANCELED;
+ break;
+ }
+ if (ret < 0)
+ break;
+ ret = JBD2_FC_REPLAY_CONTINUE;
+ }
+ return ret;
+}
+
+void ext4_fc_init(struct super_block *sb, journal_t *journal)
+{
+ /*
+ * We set replay callback even if fast commit disabled because we may
+ * could still have fast commit blocks that need to be replayed even if
+ * fast commit has now been turned off.
+ */
+ journal->j_fc_replay_callback = ext4_fc_replay;
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
+ return;
+ journal->j_fc_cleanup_callback = ext4_fc_cleanup;
+}
+
+static const char *fc_ineligible_reasons[] = {
+ "Extended attributes changed",
+ "Cross rename",
+ "Journal flag changed",
+ "Insufficient memory",
+ "Swap boot",
+ "Resize",
+ "Dir renamed",
+ "Falloc range op",
+ "Data journalling",
+ "FC Commit Failed"
+};
+
+int ext4_fc_info_show(struct seq_file *seq, void *v)
+{
+ struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
+ struct ext4_fc_stats *stats = &sbi->s_fc_stats;
+ int i;
+
+ if (v != SEQ_START_TOKEN)
+ return 0;
+
+ seq_printf(seq,
+ "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
+ stats->fc_num_commits, stats->fc_ineligible_commits,
+ stats->fc_numblks,
+ div_u64(stats->s_fc_avg_commit_time, 1000));
+ seq_puts(seq, "Ineligible reasons:\n");
+ for (i = 0; i < EXT4_FC_REASON_MAX; i++)
+ seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
+ stats->fc_ineligible_reason_count[i]);
+
+ return 0;
+}
+
+int __init ext4_fc_init_dentry_cache(void)
+{
+ ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
+ SLAB_RECLAIM_ACCOUNT);
+
+ if (ext4_fc_dentry_cachep == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void ext4_fc_destroy_dentry_cache(void)
+{
+ kmem_cache_destroy(ext4_fc_dentry_cachep);
+}
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
new file mode 100644
index 000000000000..a6154c3ed135
--- /dev/null
+++ b/fs/ext4/fast_commit.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __FAST_COMMIT_H__
+#define __FAST_COMMIT_H__
+
+/*
+ * Note this file is present in e2fsprogs/lib/ext2fs/fast_commit.h and
+ * linux/fs/ext4/fast_commit.h. These file should always be byte identical.
+ */
+
+/* Fast commit tags */
+#define EXT4_FC_TAG_ADD_RANGE 0x0001
+#define EXT4_FC_TAG_DEL_RANGE 0x0002
+#define EXT4_FC_TAG_CREAT 0x0003
+#define EXT4_FC_TAG_LINK 0x0004
+#define EXT4_FC_TAG_UNLINK 0x0005
+#define EXT4_FC_TAG_INODE 0x0006
+#define EXT4_FC_TAG_PAD 0x0007
+#define EXT4_FC_TAG_TAIL 0x0008
+#define EXT4_FC_TAG_HEAD 0x0009
+
+#define EXT4_FC_SUPPORTED_FEATURES 0x0
+
+/* On disk fast commit tlv value structures */
+
+/* Fast commit on disk tag length structure */
+struct ext4_fc_tl {
+ __le16 fc_tag;
+ __le16 fc_len;
+};
+
+/* Value structure for tag EXT4_FC_TAG_HEAD. */
+struct ext4_fc_head {
+ __le32 fc_features;
+ __le32 fc_tid;
+};
+
+/* Value structure for EXT4_FC_TAG_ADD_RANGE. */
+struct ext4_fc_add_range {
+ __le32 fc_ino;
+ __u8 fc_ex[12];
+};
+
+/* Value structure for tag EXT4_FC_TAG_DEL_RANGE. */
+struct ext4_fc_del_range {
+ __le32 fc_ino;
+ __le32 fc_lblk;
+ __le32 fc_len;
+};
+
+/*
+ * This is the value structure for tags EXT4_FC_TAG_CREAT, EXT4_FC_TAG_LINK
+ * and EXT4_FC_TAG_UNLINK.
+ */
+struct ext4_fc_dentry_info {
+ __le32 fc_parent_ino;
+ __le32 fc_ino;
+ __u8 fc_dname[];
+};
+
+/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
+struct ext4_fc_inode {
+ __le32 fc_ino;
+ __u8 fc_raw_inode[];
+};
+
+/* Value structure for tag EXT4_FC_TAG_TAIL. */
+struct ext4_fc_tail {
+ __le32 fc_tid;
+ __le32 fc_crc;
+};
+
+/* Tag base length */
+#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl))
+
+/*
+ * Fast commit status codes
+ */
+enum {
+ EXT4_FC_STATUS_OK = 0,
+ EXT4_FC_STATUS_INELIGIBLE,
+ EXT4_FC_STATUS_SKIPPED,
+ EXT4_FC_STATUS_FAILED,
+};
+
+/*
+ * Fast commit ineligiblity reasons:
+ */
+enum {
+ EXT4_FC_REASON_XATTR = 0,
+ EXT4_FC_REASON_CROSS_RENAME,
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE,
+ EXT4_FC_REASON_NOMEM,
+ EXT4_FC_REASON_SWAP_BOOT,
+ EXT4_FC_REASON_RESIZE,
+ EXT4_FC_REASON_RENAME_DIR,
+ EXT4_FC_REASON_FALLOC_RANGE,
+ EXT4_FC_REASON_INODE_JOURNAL_DATA,
+ EXT4_FC_REASON_MAX
+};
+
+#ifdef __KERNEL__
+/*
+ * In memory list of dentry updates that are performed on the file
+ * system used by fast commit code.
+ */
+struct ext4_fc_dentry_update {
+ int fcd_op; /* Type of update create / unlink / link */
+ int fcd_parent; /* Parent inode number */
+ int fcd_ino; /* Inode number */
+ struct qstr fcd_name; /* Dirent name */
+ unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */
+ struct list_head fcd_list;
+ struct list_head fcd_dilist;
+};
+
+struct ext4_fc_stats {
+ unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX];
+ unsigned long fc_num_commits;
+ unsigned long fc_ineligible_commits;
+ unsigned long fc_failed_commits;
+ unsigned long fc_skipped_commits;
+ unsigned long fc_numblks;
+ u64 s_fc_avg_commit_time;
+};
+
+#define EXT4_FC_REPLAY_REALLOC_INCREMENT 4
+
+/*
+ * Physical block regions added to different inodes due to fast commit
+ * recovery. These are set during the SCAN phase. During the replay phase,
+ * our allocator excludes these from its allocation. This ensures that
+ * we don't accidentally allocating a block that is going to be used by
+ * another inode.
+ */
+struct ext4_fc_alloc_region {
+ ext4_lblk_t lblk;
+ ext4_fsblk_t pblk;
+ int ino, len;
+};
+
+/*
+ * Fast commit replay state.
+ */
+struct ext4_fc_replay_state {
+ int fc_replay_num_tags;
+ int fc_replay_expected_off;
+ int fc_current_pass;
+ int fc_cur_tag;
+ int fc_crc;
+ struct ext4_fc_alloc_region *fc_regions;
+ int fc_regions_size, fc_regions_used, fc_regions_valid;
+ int *fc_modified_inodes;
+ int fc_modified_inodes_used, fc_modified_inodes_size;
+};
+
+#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
+#endif
+
+static inline const char *tag2str(__u16 tag)
+{
+ switch (tag) {
+ case EXT4_FC_TAG_LINK:
+ return "ADD_ENTRY";
+ case EXT4_FC_TAG_UNLINK:
+ return "DEL_ENTRY";
+ case EXT4_FC_TAG_ADD_RANGE:
+ return "ADD_RANGE";
+ case EXT4_FC_TAG_CREAT:
+ return "CREAT_DENTRY";
+ case EXT4_FC_TAG_DEL_RANGE:
+ return "DEL_RANGE";
+ case EXT4_FC_TAG_INODE:
+ return "INODE";
+ case EXT4_FC_TAG_PAD:
+ return "PAD";
+ case EXT4_FC_TAG_TAIL:
+ return "TAIL";
+ case EXT4_FC_TAG_HEAD:
+ return "HEAD";
+ default:
+ return "ERROR";
+ }
+}
+
+#endif /* __FAST_COMMIT_H__ */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5f225881176b..a7a597c727e6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,17 +36,34 @@
#include "acl.h"
#include "truncate.h"
-static bool ext4_dio_supported(struct inode *inode)
+/*
+ * Returns %true if the given DIO request should be attempted with DIO, or
+ * %false if it should fall back to buffered I/O.
+ *
+ * DIO isn't well specified; when it's unsupported (either due to the request
+ * being misaligned, or due to the file not supporting DIO at all), filesystems
+ * either fall back to buffered I/O or return EINVAL. For files that don't use
+ * any special features like encryption or verity, ext4 has traditionally
+ * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too.
+ * In this case, we should attempt the DIO, *not* fall back to buffered I/O.
+ *
+ * In contrast, in cases where DIO is unsupported due to ext4 features, ext4
+ * traditionally falls back to buffered I/O.
+ *
+ * This function implements the traditional ext4 behavior in all these cases.
+ */
+static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
{
- if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
- return false;
- if (fsverity_active(inode))
- return false;
- if (ext4_should_journal_data(inode))
- return false;
- if (ext4_has_inline_data(inode))
+ struct inode *inode = file_inode(iocb->ki_filp);
+ u32 dio_align = ext4_dio_alignment(inode);
+
+ if (dio_align == 0)
return false;
- return true;
+
+ if (dio_align == 1)
+ return true;
+
+ return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
}
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -61,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
}
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_should_use_dio(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
@@ -74,8 +91,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@@ -145,10 +161,9 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1) &&
- !EXT4_I(inode)->i_reserved_data_blocks)
- {
+ !EXT4_I(inode)->i_reserved_data_blocks) {
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
@@ -267,7 +282,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
goto out;
current->backing_dev_info = inode_to_bdi(inode);
- ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+ ret = generic_perform_write(iocb, from);
current->backing_dev_info = NULL;
out:
@@ -287,6 +302,7 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
bool truncate = false;
u8 blkbits = inode->i_blkbits;
ext4_lblk_t written_blk, end_blk;
+ int ret;
/*
* Note that EXT4_I(inode)->i_disksize can get extended up to
@@ -327,8 +343,14 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
goto truncate;
}
- if (ext4_update_inode_size(inode, offset + written))
- ext4_mark_inode_dirty(handle, inode);
+ if (ext4_update_inode_size(inode, offset + written)) {
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret)) {
+ written = ret;
+ ext4_journal_stop(handle);
+ goto truncate;
+ }
+ }
/*
* We may need to truncate allocated but not written blocks beyond EOF.
@@ -364,15 +386,32 @@ truncate:
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
int error, unsigned int flags)
{
- loff_t offset = iocb->ki_pos;
+ loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
if (error)
return error;
- if (size && flags & IOMAP_DIO_UNWRITTEN)
- return ext4_convert_unwritten_extents(NULL, inode,
- offset, size);
+ if (size && flags & IOMAP_DIO_UNWRITTEN) {
+ error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
+ if (error < 0)
+ return error;
+ }
+ /*
+ * If we are extending the file, we have to update i_size here before
+ * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
+ * buffered reads could zero out too much from page cache pages. Update
+ * of on-disk size will happen later in ext4_dio_write_iter() where
+ * we have enough information to also perform orphan list handling etc.
+ * Note that we perform all extending writes synchronously under
+ * i_rwsem held exclusively so i_size update is safe here in that case.
+ * If the write was not extending, we cannot see pos > i_size here
+ * because operations reducing i_size like truncate wait for all
+ * outstanding DIO before updating i_size.
+ */
+ pos += size;
+ if (pos > i_size_read(inode))
+ i_size_write(inode, pos);
return 0;
}
@@ -421,6 +460,10 @@ restart:
*/
if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
!ext4_overwrite_io(inode, offset, count))) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
inode_unlock_shared(inode);
*ilock_shared = false;
inode_lock(inode);
@@ -483,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_should_use_dio(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
@@ -495,6 +538,18 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret <= 0)
return ret;
+ /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
+ if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ /*
+ * Make sure inline data cannot be created anymore since we are going
+ * to allocate blocks for DIO. We know the inode does not have any
+ * inline data now because ext4_dio_supported() checked for that.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
offset = iocb->ki_pos;
count = ret;
@@ -530,7 +585,10 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ilock_shared)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
- is_sync_kiocb(iocb) || unaligned_io || extend);
+ (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
+ NULL, 0);
+ if (ret == -ENOTBLK)
+ ret = 0;
if (extend)
ret = ext4_handle_inode_extension(inode, offset, ret, count);
@@ -638,8 +696,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
#endif
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_write_iter(iocb, from);
-
- return ext4_buffered_write_iter(iocb, from);
+ else
+ return ext4_buffered_write_iter(iocb, from);
}
#ifdef CONFIG_FS_DAX
@@ -666,22 +724,23 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
*/
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
pfn_t pfn;
if (write) {
sb_start_pagefault(sb);
file_update_time(vmf->vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
retry:
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
if (IS_ERR(handle)) {
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(sb);
return VM_FAULT_SIGBUS;
}
} else {
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
}
result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
if (write) {
@@ -693,10 +752,10 @@ retry:
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
result = dax_finish_sync_fault(vmf, pe_size, pfn);
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(sb);
} else {
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
}
return result;
@@ -718,7 +777,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
- .fault = ext4_filemap_fault,
+ .fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
@@ -758,13 +817,13 @@ static int ext4_sample_last_mounted(struct super_block *sb,
handle_t *handle;
int err;
- if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
+ if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
return 0;
if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
return 0;
- sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+ ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
/*
* Sample where the filesystem has been mounted and
* store it in the superblock for sysadmin convenience
@@ -784,12 +843,16 @@ static int ext4_sample_last_mounted(struct super_block *sb,
if (IS_ERR(handle))
goto out;
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
if (err)
goto out_journal;
- strlcpy(sbi->s_es->s_last_mounted, cp,
+ lock_buffer(sbi->s_sbh);
+ strncpy(sbi->s_es->s_last_mounted, cp,
sizeof(sbi->s_es->s_last_mounted));
- ext4_handle_dirty_super(handle, sb);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
+ ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
out_journal:
ext4_journal_stop(handle);
out:
@@ -797,7 +860,7 @@ out:
return err;
}
-static int ext4_file_open(struct inode * inode, struct file * filp)
+static int ext4_file_open(struct inode *inode, struct file *filp)
{
int ret;
@@ -826,7 +889,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
return ret;
}
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return dquot_file_open(inode, filp);
}
@@ -872,6 +935,7 @@ const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
@@ -894,5 +958,7 @@ const struct inode_operations ext4_file_inode_operations = {
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
+ .fileattr_get = ext4_fileattr_get,
+ .fileattr_set = ext4_fileattr_set,
};
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index dbccf46f1770..4493ef0c715e 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -108,6 +108,9 @@ static int ext4_getfsmap_helper(struct super_block *sb,
/* Are we just counting mappings? */
if (info->gfi_head->fmh_count == 0) {
+ if (info->gfi_head->fmh_entries == UINT_MAX)
+ return EXT4_QUERY_RANGE_ABORT;
+
if (rec_fsblk > info->gfi_next_fsblk)
info->gfi_head->fmh_entries++;
@@ -277,7 +280,7 @@ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys,
/* Fabricate an rmap entry for the external log device. */
irec.fmr_physical = journal->j_blk_offset;
- irec.fmr_length = journal->j_maxlen;
+ irec.fmr_length = journal->j_total_len;
irec.fmr_owner = EXT4_FMR_OWN_LOG;
irec.fmr_flags = 0;
@@ -351,8 +354,8 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
/* Compare two fsmap items. */
static int ext4_getfsmap_compare(void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct ext4_fsmap *fa;
struct ext4_fsmap *fb;
@@ -571,8 +574,8 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
return true;
- if (EXT4_SB(sb)->journal_bdev &&
- fm->fmr_device == new_encode_dev(EXT4_SB(sb)->journal_bdev->bd_dev))
+ if (EXT4_SB(sb)->s_journal_bdev &&
+ fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev))
return true;
return false;
}
@@ -642,9 +645,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
memset(handlers, 0, sizeof(handlers));
handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
handlers[0].gfd_fn = ext4_getfsmap_datadev;
- if (EXT4_SB(sb)->journal_bdev) {
+ if (EXT4_SB(sb)->s_journal_bdev) {
handlers[1].gfd_dev = new_encode_dev(
- EXT4_SB(sb)->journal_bdev->bd_dev);
+ EXT4_SB(sb)->s_journal_bdev->bd_dev);
handlers[1].gfd_fn = ext4_getfsmap_logdev;
}
diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h
index 68c8001fee85..ac642be2302e 100644
--- a/fs/ext4/fsmap.h
+++ b/fs/ext4/fsmap.h
@@ -50,7 +50,7 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
#define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */
#define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */
#define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */
-#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* inode bitmap */
-#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* block bitmap */
+#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* block bitmap */
+#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* inode bitmap */
#endif /* __EXT4_FSMAP_H__ */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e10206e7f4bb..027a7d7037a0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,30 +44,28 @@
*/
static int ext4_sync_parent(struct inode *inode)
{
- struct dentry *dentry = NULL;
- struct inode *next;
+ struct dentry *dentry, *next;
int ret = 0;
if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
return 0;
- inode = igrab(inode);
+ dentry = d_find_any_alias(inode);
+ if (!dentry)
+ return 0;
while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
- dentry = d_find_any_alias(inode);
- if (!dentry)
- break;
- next = igrab(d_inode(dentry->d_parent));
+
+ next = dget_parent(dentry);
dput(dentry);
- if (!next)
- break;
- iput(inode);
- inode = next;
+ dentry = next;
+ inode = dentry->d_inode;
+
/*
* The directory inode may have gone through rmdir by now. But
* the inode itself and its blocks are still allocated (we hold
- * a reference to the inode so it didn't go through
- * ext4_evict_inode()) and so we are safe to flush metadata
- * blocks and the inode.
+ * a reference to the inode via its dentry), so it didn't go
+ * through ext4_evict_inode()) and so we are safe to flush
+ * metadata blocks and the inode.
*/
ret = sync_mapping_buffers(inode->i_mapping);
if (ret)
@@ -76,7 +74,7 @@ static int ext4_sync_parent(struct inode *inode)
if (ret)
break;
}
- iput(inode);
+ dput(dentry);
return ret;
}
@@ -114,7 +112,7 @@ static int ext4_fsync_journal(struct inode *inode, bool datasync,
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
*needs_barrier = true;
- return jbd2_complete_transaction(journal, commit_tid);
+ return ext4_fc_commit(journal, commit_tid);
}
/*
@@ -138,21 +136,21 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (unlikely(ext4_forced_shutdown(sbi)))
return -EIO;
- J_ASSERT(ext4_journal_current_handle() == NULL);
+ ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file_enter(file, datasync);
if (sb_rdonly(inode->i_sb)) {
/* Make sure that we read updated s_mount_flags value */
smp_rmb();
- if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))
ret = -EROFS;
goto out;
}
ret = file_write_and_wait_range(file, start, end);
if (ret)
- return ret;
+ goto out;
/*
* data=writeback,ordered:
@@ -176,7 +174,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
if (needs_barrier) {
- err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ err = blkdev_issue_flush(inode->i_sb->s_bdev);
if (!ret)
ret = err;
}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 3e133793a5a3..147b5241dd94 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -197,7 +197,7 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
* represented, and whether or not the returned hash is 32 bits or 64
* bits. 32 bit hashes will return 0 for the minor hash.
*/
-static int __ext4fs_dirhash(const char *name, int len,
+static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo)
{
__u32 hash;
@@ -233,7 +233,7 @@ static int __ext4fs_dirhash(const char *name, int len,
break;
case DX_HASH_HALF_MD4_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
- /* fall through */
+ fallthrough;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
@@ -247,7 +247,7 @@ static int __ext4fs_dirhash(const char *name, int len,
break;
case DX_HASH_TEA_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
- /* fall through */
+ fallthrough;
case DX_HASH_TEA:
p = name;
while (len > 0) {
@@ -259,6 +259,22 @@ static int __ext4fs_dirhash(const char *name, int len,
hash = buf[0];
minor_hash = buf[1];
break;
+ case DX_HASH_SIPHASH:
+ {
+ struct qstr qname = QSTR_INIT(name, len);
+ __u64 combined_hash;
+
+ if (fscrypt_has_encryption_key(dir)) {
+ combined_hash = fscrypt_fname_siphash(dir, &qname);
+ } else {
+ ext4_warning_inode(dir, "Siphash requires key");
+ return -1;
+ }
+
+ hash = (__u32)(combined_hash >> 32);
+ minor_hash = (__u32)combined_hash;
+ break;
+ }
default:
hinfo->hash = 0;
return -1;
@@ -274,13 +290,14 @@ static int __ext4fs_dirhash(const char *name, int len,
int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo)
{
-#ifdef CONFIG_UNICODE
- const struct unicode_map *um = EXT4_SB(dir->i_sb)->s_encoding;
+#if IS_ENABLED(CONFIG_UNICODE)
+ const struct unicode_map *um = dir->i_sb->s_encoding;
int r, dlen;
unsigned char *buff;
struct qstr qstr = {.name = name, .len = len };
- if (len && IS_CASEFOLDED(dir) && um) {
+ if (len && IS_CASEFOLDED(dir) && um &&
+ (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) {
buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
if (!buff)
return -ENOMEM;
@@ -291,12 +308,12 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
goto opaque_seq;
}
- r = __ext4fs_dirhash(buff, dlen, hinfo);
+ r = __ext4fs_dirhash(dir, buff, dlen, hinfo);
kfree(buff);
return r;
}
opaque_seq:
#endif
- return __ext4fs_dirhash(name, len, hinfo);
+ return __ext4fs_dirhash(dir, name, len, hinfo);
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f95ee99091e4..e9bc46684106 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -82,7 +82,12 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
- struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_group_info *grp;
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
+ grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return 0;
@@ -113,7 +118,7 @@ verified:
* Read the inode allocation bitmap for a given block_group, reading
* into the specified slot in the superblock's bitmap cache.
*
- * Return buffer_head of bitmap on success or NULL.
+ * Return buffer_head of bitmap on success, or an ERR_PTR on error.
*/
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
@@ -189,17 +194,13 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
* submit the buffer_head for reading
*/
trace_ext4_load_inode_bitmap(sb, block_group);
- bh->b_end_io = ext4_end_bitmap_read;
- get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
- wait_on_buffer(bh);
+ ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read);
ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
if (!buffer_uptodate(bh)) {
put_bh(bh);
- ext4_set_errno(sb, EIO);
- ext4_error(sb, "Cannot read inode bitmap - "
- "block_group = %u, inode_bitmap = %llu",
- block_group, bitmap_blk);
+ ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
+ "block_group = %u, inode_bitmap = %llu",
+ block_group, bitmap_blk);
ext4_mark_group_bitmap_corrupted(sb, block_group,
EXT4_GROUP_INFO_IBITMAP_CORRUPT);
return ERR_PTR(-EIO);
@@ -285,19 +286,22 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
/* Don't bother if the inode bitmap is corrupt. */
- grp = ext4_get_group_info(sb, block_group);
if (IS_ERR(bitmap_bh)) {
fatal = PTR_ERR(bitmap_bh);
bitmap_bh = NULL;
goto error_return;
}
- if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
- fatal = -EFSCORRUPTED;
- goto error_return;
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
+ fatal = -EFSCORRUPTED;
+ goto error_return;
+ }
}
BUFFER_TRACE(bitmap_bh, "get_write_access");
- fatal = ext4_journal_get_write_access(handle, bitmap_bh);
+ fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
if (fatal)
goto error_return;
@@ -305,7 +309,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
gdp = ext4_get_group_desc(sb, block_group, &bh2);
if (gdp) {
BUFFER_TRACE(bh2, "get_write_access");
- fatal = ext4_journal_get_write_access(handle, bh2);
+ fatal = ext4_journal_get_write_access(handle, sb, bh2,
+ EXT4_JTR_NONE);
}
ext4_lock_group(sb, block_group);
cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
@@ -319,14 +324,16 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (is_directory) {
count = ext4_used_dirs_count(sb, gdp) - 1;
ext4_used_dirs_set(sb, gdp, count);
- percpu_counter_dec(&sbi->s_dirs_counter);
+ if (percpu_counter_initialized(&sbi->s_dirs_counter))
+ percpu_counter_dec(&sbi->s_dirs_counter);
}
ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
EXT4_INODES_PER_GROUP(sb) / 8);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- percpu_counter_inc(&sbi->s_freeinodes_counter);
+ if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
+ percpu_counter_inc(&sbi->s_freeinodes_counter);
if (sbi->s_log_groups_per_flex) {
struct flex_groups *fg;
@@ -397,7 +404,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
*
* We always try to spread first-level directories.
*
- * If there are blockgroups with both free inodes and free blocks counts
+ * If there are blockgroups with both free inodes and free clusters counts
* not worse than average we return one with smallest directory count.
* Otherwise we simply return a random group.
*
@@ -406,7 +413,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
* It's OK to put directory into a group unless
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
- * it has too few free blocks left (min_blocks) or
+ * it has too few free clusters left (min_clusters) or
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
@@ -422,7 +429,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4_group_t real_ngroups = ext4_get_groups_count(sb);
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei, grp_free;
- ext4_fsblk_t freeb, avefreec;
+ ext4_fsblk_t freec, avefreec;
unsigned int ndirs;
int max_dirs, min_inodes;
ext4_grpblk_t min_clusters;
@@ -441,9 +448,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
- freeb = EXT4_C2B(sbi,
- percpu_counter_read_positive(&sbi->s_freeclusters_counter));
- avefreec = freeb;
+ freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+ avefreec = freec;
do_div(avefreec, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
@@ -457,10 +463,9 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
hinfo.hash_version = DX_HASH_HALF_MD4;
hinfo.seed = sbi->s_hash_seed;
ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
- grp = hinfo.hash;
+ parent_group = hinfo.hash % ngroups;
} else
- grp = prandom_u32();
- parent_group = (unsigned)grp % ngroups;
+ parent_group = prandom_u32_max(ngroups);
for (i = 0; i < ngroups; i++) {
g = (parent_group + i) % ngroups;
get_orlov_stats(sb, g, flex_size, &stats);
@@ -504,7 +509,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
goto fallback;
}
- max_dirs = ndirs / ngroups + inodes_per_group / 16;
+ max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16;
min_inodes = avefreei - inodes_per_group*flex_size / 4;
if (min_inodes < 1)
min_inodes = 1;
@@ -663,7 +668,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
* block has been written back to disk. (Yes, these values are
* somewhat arbitrary...)
*/
-#define RECENTCY_MIN 5
+#define RECENTCY_MIN 60
#define RECENTCY_DIRTY 300
static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
@@ -712,22 +717,198 @@ out:
static int find_inode_bit(struct super_block *sb, ext4_group_t group,
struct buffer_head *bitmap, unsigned long *ino)
{
+ bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL;
+ unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb);
+
next:
*ino = ext4_find_next_zero_bit((unsigned long *)
bitmap->b_data,
EXT4_INODES_PER_GROUP(sb), *ino);
if (*ino >= EXT4_INODES_PER_GROUP(sb))
- return 0;
+ goto not_found;
- if ((EXT4_SB(sb)->s_journal == NULL) &&
- recently_deleted(sb, group, *ino)) {
+ if (check_recently_deleted && recently_deleted(sb, group, *ino)) {
+ recently_deleted_ino = *ino;
*ino = *ino + 1;
if (*ino < EXT4_INODES_PER_GROUP(sb))
goto next;
+ goto not_found;
+ }
+ return 1;
+not_found:
+ if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb))
return 0;
+ /*
+ * Not reusing recently deleted inodes is mostly a preference. We don't
+ * want to report ENOSPC or skew allocation patterns because of that.
+ * So return even recently deleted inode if we could find better in the
+ * given range.
+ */
+ *ino = recently_deleted_ino;
+ return 1;
+}
+
+int ext4_mark_inode_used(struct super_block *sb, int ino)
+{
+ unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
+ struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
+ struct ext4_group_desc *gdp;
+ ext4_group_t group;
+ int bit;
+ int err = -EFSCORRUPTED;
+
+ if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
+ goto out;
+
+ group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+ inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+ if (IS_ERR(inode_bitmap_bh))
+ return PTR_ERR(inode_bitmap_bh);
+
+ if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
+ err = 0;
+ goto out;
}
- return 1;
+ gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+ if (!gdp || !group_desc_bh) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ext4_set_bit(bit, inode_bitmap_bh->b_data);
+
+ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+ err = sync_dirty_buffer(inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+
+ /* We may have to initialize the block bitmap if it isn't already */
+ if (ext4_has_group_desc_csum(sb) &&
+ gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ struct buffer_head *block_bitmap_bh;
+
+ block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(block_bitmap_bh)) {
+ err = PTR_ERR(block_bitmap_bh);
+ goto out;
+ }
+
+ BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+ err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
+ sync_dirty_buffer(block_bitmap_bh);
+
+ /* recheck and clear flag under lock if we still need to */
+ ext4_lock_group(sb, group);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ ext4_block_bitmap_csum_set(sb, group, gdp,
+ block_bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ }
+ ext4_unlock_group(sb, group);
+ brelse(block_bitmap_bh);
+
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+ }
+
+ /* Update the relevant bg descriptor fields */
+ if (ext4_has_group_desc_csum(sb)) {
+ int free;
+
+ ext4_lock_group(sb, group); /* while we modify the bg desc */
+ free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+ free = 0;
+ }
+
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ */
+ if (bit >= free)
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - bit - 1));
+ } else {
+ ext4_lock_group(sb, group);
+ }
+
+ ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+ if (ext4_has_group_desc_csum(sb)) {
+ ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ }
+
+ ext4_unlock_group(sb, group);
+ err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
+ sync_dirty_buffer(group_desc_bh);
+out:
+ return err;
+}
+
+static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
+ bool encrypt)
+{
+ struct super_block *sb = dir->i_sb;
+ int nblocks = 0;
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+ if (p) {
+ int acl_size = p->a_count * sizeof(ext4_acl_entry);
+
+ nblocks += (S_ISDIR(mode) ? 2 : 1) *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, acl_size,
+ true /* is_create */);
+ posix_acl_release(p);
+ }
+#endif
+
+#ifdef CONFIG_SECURITY
+ {
+ int num_security_xattrs = 1;
+
+#ifdef CONFIG_INTEGRITY
+ num_security_xattrs++;
+#endif
+ /*
+ * We assume that security xattrs are never more than 1k.
+ * In practice they are under 128 bytes.
+ */
+ nblocks += num_security_xattrs *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, 1024,
+ true /* is_create */);
+ }
+#endif
+ if (encrypt)
+ nblocks += __ext4_xattr_set_credits(sb,
+ NULL /* inode */,
+ NULL /* block_bh */,
+ FSCRYPT_SET_CONTEXT_MAX_SIZE,
+ true /* is_create */);
+ return nblocks;
}
/*
@@ -740,7 +921,8 @@ next:
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+struct inode *__ext4_new_inode(struct user_namespace *mnt_userns,
+ handle_t *handle, struct inode *dir,
umode_t mode, const struct qstr *qstr,
__u32 goal, uid_t *owner, __u32 i_flags,
int handle_type, unsigned int line_no,
@@ -759,8 +941,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
struct inode *ret;
ext4_group_t i;
ext4_group_t flex_group;
- struct ext4_group_info *grp;
- int encrypt = 0;
+ struct ext4_group_info *grp = NULL;
+ bool encrypt = false;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -772,59 +954,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (unlikely(ext4_forced_shutdown(sbi)))
return ERR_PTR(-EIO);
- if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
- !(i_flags & EXT4_EA_INODE_FL)) {
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return ERR_PTR(err);
- if (!fscrypt_has_encryption_key(dir))
- return ERR_PTR(-ENOKEY);
- encrypt = 1;
- }
-
- if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
- struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
-
- if (IS_ERR(p))
- return ERR_CAST(p);
- if (p) {
- int acl_size = p->a_count * sizeof(ext4_acl_entry);
-
- nblocks += (S_ISDIR(mode) ? 2 : 1) *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, acl_size,
- true /* is_create */);
- posix_acl_release(p);
- }
-#endif
-
-#ifdef CONFIG_SECURITY
- {
- int num_security_xattrs = 1;
-
-#ifdef CONFIG_INTEGRITY
- num_security_xattrs++;
-#endif
- /*
- * We assume that security xattrs are never
- * more than 1k. In practice they are under
- * 128 bytes.
- */
- nblocks += num_security_xattrs *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, 1024,
- true /* is_create */);
- }
-#endif
- if (encrypt)
- nblocks += __ext4_xattr_set_credits(sb,
- NULL /* inode */, NULL /* block_bh */,
- FSCRYPT_SET_CONTEXT_MAX_SIZE,
- true /* is_create */);
- }
-
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
inode = new_inode(sb);
@@ -843,10 +972,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
- inode->i_uid = current_fsuid();
+ inode_fsuid_set(inode, mnt_userns);
inode->i_gid = dir->i_gid;
} else
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
if (ext4_has_feature_project(sb) &&
ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
@@ -854,10 +983,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
else
ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
+ if (!(i_flags & EXT4_EA_INODE_FL)) {
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto out;
+ }
+
err = dquot_initialize(inode);
if (err)
goto out;
+ if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
+ ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
+ if (ret2 < 0) {
+ err = ret2;
+ goto out;
+ }
+ nblocks += ret2;
+ }
+
if (!goal)
goal = sbi->s_inode_goal;
@@ -897,15 +1041,21 @@ got_group:
if (ext4_free_inodes_count(sb, gdp) == 0)
goto next_group;
- grp = ext4_get_group_info(sb, group);
- /* Skip groups with already-known suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
- goto next_group;
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, group);
+ /*
+ * Skip groups with already-known suspicious inode
+ * tables
+ */
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ goto next_group;
+ }
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
/* Skip groups with suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
+ if (((!(sbi->s_mount_state & EXT4_FC_REPLAY))
+ && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
IS_ERR(inode_bitmap_bh)) {
inode_bitmap_bh = NULL;
goto next_group;
@@ -924,7 +1074,7 @@ repeat_in_this_group:
goto next_group;
}
- if (!handle) {
+ if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
handle_type, nblocks, 0,
@@ -936,7 +1086,8 @@ repeat_in_this_group:
}
}
BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh,
+ EXT4_JTR_NONE);
if (err) {
ext4_std_error(sb, err);
goto out;
@@ -978,7 +1129,8 @@ got:
}
BUFFER_TRACE(group_desc_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, group_desc_bh);
+ err = ext4_journal_get_write_access(handle, sb, group_desc_bh,
+ EXT4_JTR_NONE);
if (err) {
ext4_std_error(sb, err);
goto out;
@@ -995,7 +1147,8 @@ got:
goto out;
}
BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
- err = ext4_journal_get_write_access(handle, block_bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh,
+ EXT4_JTR_NONE);
if (err) {
brelse(block_bitmap_bh);
ext4_std_error(sb, err);
@@ -1028,9 +1181,15 @@ got:
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
int free;
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
-
- down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+ struct ext4_group_info *grp = NULL;
+
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, group);
+ down_read(&grp->alloc_sem); /*
+ * protect vs itable
+ * lazyinit
+ */
+ }
ext4_lock_group(sb, group); /* while we modify the bg desc */
free = EXT4_INODES_PER_GROUP(sb) -
ext4_itable_unused_count(sb, gdp);
@@ -1046,7 +1205,8 @@ got:
if (ino > free)
ext4_itable_unused_set(sb, gdp,
(EXT4_INODES_PER_GROUP(sb) - ino));
- up_read(&grp->alloc_sem);
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
+ up_read(&grp->alloc_sem);
} else {
ext4_lock_group(sb, group);
}
@@ -1104,7 +1264,7 @@ got:
ei->i_block_group = group;
ei->i_last_alloc_group = ~0;
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
if (IS_DIRSYNC(inode))
ext4_handle_sync(handle);
if (insert_inode_locked(inode) < 0) {
@@ -1119,7 +1279,7 @@ got:
EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto out;
}
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
/* Precompute checksum seed for inode metadata */
if (ext4_has_metadata_csum(sb)) {
@@ -1137,7 +1297,8 @@ got:
ei->i_extra_isize = sbi->s_want_extra_isize;
ei->i_inline_off = 0;
- if (ext4_has_feature_inline_data(sb))
+ if (ext4_has_feature_inline_data(sb) &&
+ (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
err = dquot_alloc_inode(inode);
@@ -1150,7 +1311,7 @@ got:
* prevent its deduplication.
*/
if (encrypt) {
- err = fscrypt_inherit_context(dir, inode, handle, true);
+ err = fscrypt_set_context(inode, handle);
if (err)
goto fail_free_drop;
}
@@ -1231,9 +1392,10 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- ext4_set_errno(sb, -err);
- ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
- ino, err);
+ ext4_error_err(sb, -err,
+ "couldn't read orphan inode %lu (err %d)",
+ ino, err);
+ brelse(bitmap_bh);
return inode;
}
@@ -1357,6 +1519,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
handle_t *handle;
ext4_fsblk_t blk;
int num, ret = 0, used_blks = 0;
+ unsigned long used_inos = 0;
/* This should not happen, but just to be sure check this */
if (sb_rdonly(sb)) {
@@ -1387,30 +1550,45 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
* used inodes so we need to skip blocks with used inodes in
* inode table.
*/
- if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
- used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp)),
- sbi->s_inodes_per_block);
-
- if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) ||
- ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp)) <
- EXT4_FIRST_INO(sb)))) {
- ext4_error(sb, "Something is wrong with group %u: "
- "used itable blocks: %d; "
- "itable unused count: %u",
- group, used_blks,
- ext4_itable_unused_count(sb, gdp));
- ret = 1;
- goto err_out;
+ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
+ used_inos = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block);
+
+ /* Bogus inode unused count? */
+ if (used_blks < 0 || used_blks > sbi->s_itb_per_group) {
+ ext4_error(sb, "Something is wrong with group %u: "
+ "used itable blocks: %d; "
+ "itable unused count: %u",
+ group, used_blks,
+ ext4_itable_unused_count(sb, gdp));
+ ret = 1;
+ goto err_out;
+ }
+
+ used_inos += group * EXT4_INODES_PER_GROUP(sb);
+ /*
+ * Are there some uninitialized inodes in the inode table
+ * before the first normal inode?
+ */
+ if ((used_blks != sbi->s_itb_per_group) &&
+ (used_inos < EXT4_FIRST_INO(sb))) {
+ ext4_error(sb, "Something is wrong with group %u: "
+ "itable unused count: %u; "
+ "itables initialized count: %ld",
+ group, ext4_itable_unused_count(sb, gdp),
+ used_inos);
+ ret = 1;
+ goto err_out;
+ }
}
blk = ext4_inode_table(sb, gdp) + used_blks;
num = sbi->s_itb_per_group - used_blks;
BUFFER_TRACE(group_desc_bh, "get_write_access");
- ret = ext4_journal_get_write_access(handle,
- group_desc_bh);
+ ret = ext4_journal_get_write_access(handle, sb, group_desc_bh,
+ EXT4_JTR_NONE);
if (ret)
goto err_out;
@@ -1428,7 +1606,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
if (ret < 0)
goto err_out;
if (barrier)
- blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
+ blkdev_issue_flush(sb->s_bdev);
skip_zeroout:
ext4_lock_group(sb, group);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 569fc68e8975..860fc5119009 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -163,7 +163,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
}
if (!bh_uptodate_or_lock(bh)) {
- if (bh_submit_read(bh) < 0) {
+ if (ext4_read_bh(bh, 0, NULL) < 0) {
put_bh(bh);
goto failure;
}
@@ -354,7 +354,8 @@ static int ext4_alloc_branch(handle_t *handle,
}
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, ar->inode->i_sb,
+ bh, EXT4_JTR_NONE);
if (err) {
unlock_buffer(bh);
goto failed;
@@ -429,7 +430,8 @@ static int ext4_splice_branch(handle_t *handle,
*/
if (where->bh) {
BUFFER_TRACE(where->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, where->bh);
+ err = ext4_journal_get_write_access(handle, ar->inode->i_sb,
+ where->bh, EXT4_JTR_NONE);
if (err)
goto err_out;
}
@@ -458,7 +460,7 @@ static int ext4_splice_branch(handle_t *handle,
* the new i_size. But that is not done here - it is done in
* generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
*/
- jbd_debug(5, "splicing indirect only\n");
+ ext4_debug("splicing indirect only\n");
BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh);
if (err)
@@ -467,8 +469,10 @@ static int ext4_splice_branch(handle_t *handle,
/*
* OK, we spliced it into the inode itself on a direct block.
*/
- ext4_mark_inode_dirty(handle, ar->inode);
- jbd_debug(5, "splicing direct\n");
+ err = ext4_mark_inode_dirty(handle, ar->inode);
+ if (unlikely(err))
+ goto err_out;
+ ext4_debug("splicing direct\n");
}
return err;
@@ -532,8 +536,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t first_block = 0;
trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
- J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
- J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
+ ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
+ ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
depth = ext4_block_to_path(inode, map->m_lblk, offsets,
&blocks_to_boundary);
@@ -591,7 +595,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
if (ext4_has_feature_bigalloc(inode->i_sb)) {
EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
"non-extent mapped inodes with bigalloc");
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto out;
}
/* Set up for the direct block allocation */
@@ -691,10 +696,10 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
* Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
* moment, get_block can be called only for blocks inside i_size since
* page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
+ * i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
@@ -702,7 +707,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
/*
* Truncate transactions can be complex and absolutely huge. So we need to
- * be able to restart the transaction at a conventient checkpoint to make
+ * be able to restart the transaction at a convenient checkpoint to make
* sure we don't overflow the journal.
*
* Try to extend this transaction for the purposes of truncation. If
@@ -725,7 +730,8 @@ static int ext4_ind_truncate_ensure_credits(handle_t *handle,
return ret;
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
- ret = ext4_journal_get_write_access(handle, bh);
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (unlikely(ret))
return ret;
}
@@ -856,8 +862,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
else if (ext4_should_journal_data(inode))
flags |= EXT4_FREE_BLOCKS_FORGET;
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
- count)) {
+ if (!ext4_inode_block_valid(inode, block_to_free, count)) {
EXT4_ERROR_INODE(inode, "attempt to clear invalid "
"blocks %llu len %lu",
(unsigned long long) block_to_free, count);
@@ -914,7 +919,8 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if (this_bh) { /* For indirect block */
BUFFER_TRACE(this_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, this_bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ this_bh, EXT4_JTR_NONE);
/* Important: if we can't update the indirect pointers
* to the blocks, we can't free them. */
if (err)
@@ -1002,8 +1008,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
if (!nr)
continue; /* A hole */
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- nr, 1)) {
+ if (!ext4_inode_block_valid(inode, nr, 1)) {
EXT4_ERROR_INODE(inode,
"invalid indirect mapped "
"block %lu (level %d)",
@@ -1012,14 +1017,14 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
/* Go read the buffer for the next level down */
- bh = sb_bread(inode->i_sb, nr);
+ bh = ext4_sb_bread(inode->i_sb, nr, 0);
/*
* A read failure? Report error and clear slot
* (should be rare).
*/
- if (!bh) {
- EXT4_ERROR_INODE_BLOCK(inode, nr,
+ if (IS_ERR(bh)) {
+ ext4_error_inode_block(inode, nr, -PTR_ERR(bh),
"Read failure");
continue;
}
@@ -1033,7 +1038,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
brelse(bh);
/*
- * Everything below this this pointer has been
+ * Everything below this pointer has been
* released. Now let this top-of-subtree go.
*
* We want the freeing of this indirect block to be
@@ -1078,7 +1083,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
*/
BUFFER_TRACE(parent_bh, "get_write_access");
if (!ext4_journal_get_write_access(handle,
- parent_bh)){
+ inode->i_sb, parent_bh,
+ EXT4_JTR_NONE)) {
*p = 0;
BUFFER_TRACE(parent_bh,
"call ext4_handle_dirty_metadata");
@@ -1180,21 +1186,21 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_IND_BLOCK:
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_DIND_BLOCK:
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_TIND_BLOCK:
;
}
@@ -1434,7 +1440,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_IND_BLOCK:
if (++n >= n2)
break;
@@ -1443,7 +1449,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_DIND_BLOCK:
if (++n >= n2)
break;
@@ -1452,7 +1458,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_TIND_BLOCK:
;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index fad82d08fca5..a4fbe825694b 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -6,7 +6,9 @@
#include <linux/iomap.h>
#include <linux/fiemap.h>
+#include <linux/namei.h>
#include <linux/iversion.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -34,6 +36,9 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
struct ext4_inode *raw_inode;
int free, min_offs;
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
+ return 0;
+
min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
EXT4_GOOD_OLD_INODE_SIZE -
EXT4_I(inode)->i_extra_isize -
@@ -98,10 +103,9 @@ int ext4_get_max_inline_size(struct inode *inode)
error = ext4_get_inode_loc(inode, &iloc);
if (error) {
- ext4_set_errno(inode->i_sb, -error);
- ext4_error_inode(inode, __func__, __LINE__, 0,
- "can't get inode location %lu",
- inode->i_ino);
+ ext4_error_inode_err(inode, __func__, __LINE__, 0, -error,
+ "can't get inode location %lu",
+ inode->i_ino);
return 0;
}
@@ -205,7 +209,7 @@ out:
/*
* write the buffer to the inline inode.
* If 'create' is set, we don't need to do the extra copy in the xattr
- * value since it is already handled by ext4_xattr_ibody_inline_set.
+ * value since it is already handled by ext4_xattr_ibody_set.
* That saves us one memcpy.
*/
static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
@@ -265,7 +269,8 @@ static int ext4_create_inline_data(handle_t *handle,
return error;
BUFFER_TRACE(is.iloc.bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
@@ -277,7 +282,7 @@ static int ext4_create_inline_data(handle_t *handle,
len = 0;
}
- /* Insert the the xttr entry. */
+ /* Insert the xttr entry. */
i.value = value;
i.value_len = len;
@@ -287,7 +292,7 @@ static int ext4_create_inline_data(handle_t *handle,
BUG_ON(!is.s.not_found);
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error) {
if (error == -ENOSPC)
ext4_clear_inode_state(inode,
@@ -351,15 +356,16 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
goto out;
BUFFER_TRACE(is.iloc.bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
- /* Update the xttr entry. */
+ /* Update the xattr entry. */
i.value = value;
i.value_len = len;
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error)
goto out;
@@ -428,11 +434,12 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
goto out;
BUFFER_TRACE(is.iloc.bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error)
goto out;
@@ -524,13 +531,13 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
}
static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
- struct inode *inode,
- unsigned flags)
+ struct inode *inode)
{
int ret, needed_blocks, no_expand;
handle_t *handle = NULL;
int retries = 0, sem_held = 0;
struct page *page = NULL;
+ unsigned int flags;
unsigned from, to;
struct ext4_iloc iloc;
@@ -559,9 +566,9 @@ retry:
/* We cannot recurse into the filesystem as the transaction is already
* started */
- flags |= AOP_FLAG_NOFS;
-
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ flags = memalloc_nofs_save();
+ page = grab_cache_page_write_begin(mapping, 0);
+ memalloc_nofs_restore(flags);
if (!page) {
ret = -ENOMEM;
goto out;
@@ -594,7 +601,7 @@ retry:
ret = __block_write_begin(page, from, to, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
- ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ ret = ext4_walk_page_buffers(handle, inode, page_buffers(page),
from, to, NULL,
do_journal_get_write_access);
}
@@ -646,11 +653,11 @@ out:
int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep)
{
int ret;
handle_t *handle;
+ unsigned int flags;
struct page *page;
struct ext4_iloc iloc;
@@ -683,13 +690,14 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
goto convert;
}
- ret = ext4_journal_get_write_access(handle, iloc.bh);
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
+ EXT4_JTR_NONE);
if (ret)
goto out;
- flags |= AOP_FLAG_NOFS;
-
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ flags = memalloc_nofs_save();
+ page = grab_cache_page_write_begin(mapping, 0);
+ memalloc_nofs_restore(flags);
if (!page) {
ret = -ENOMEM;
goto out;
@@ -723,46 +731,89 @@ out:
brelse(iloc.bh);
return ret;
convert:
- return ext4_convert_inline_data_to_extent(mapping,
- inode, flags);
+ return ext4_convert_inline_data_to_extent(mapping, inode);
}
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct page *page)
{
- int ret, no_expand;
+ handle_t *handle = ext4_journal_current_handle();
+ int no_expand;
void *kaddr;
struct ext4_iloc iloc;
+ int ret = 0, ret2;
- if (unlikely(copied < len)) {
- if (!PageUptodate(page)) {
- copied = 0;
+ if (unlikely(copied < len) && !PageUptodate(page))
+ copied = 0;
+
+ if (likely(copied)) {
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret) {
+ unlock_page(page);
+ put_page(page);
+ ext4_std_error(inode->i_sb, ret);
goto out;
}
- }
+ ext4_write_lock_xattr(inode, &no_expand);
+ BUG_ON(!ext4_has_inline_data(inode));
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret) {
- ext4_std_error(inode->i_sb, ret);
- copied = 0;
- goto out;
- }
+ /*
+ * ei->i_inline_off may have changed since
+ * ext4_write_begin() called
+ * ext4_try_to_write_inline_data()
+ */
+ (void) ext4_find_inline_data_nolock(inode);
- ext4_write_lock_xattr(inode, &no_expand);
- BUG_ON(!ext4_has_inline_data(inode));
+ kaddr = kmap_atomic(page);
+ ext4_write_inline_data(inode, &iloc, kaddr, pos, copied);
+ kunmap_atomic(kaddr);
+ SetPageUptodate(page);
+ /* clear page dirty so that writepages wouldn't work for us. */
+ ClearPageDirty(page);
- kaddr = kmap_atomic(page);
- ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
- kunmap_atomic(kaddr);
- SetPageUptodate(page);
- /* clear page dirty so that writepages wouldn't work for us. */
- ClearPageDirty(page);
+ ext4_write_unlock_xattr(inode, &no_expand);
+ brelse(iloc.bh);
- ext4_write_unlock_xattr(inode, &no_expand);
- brelse(iloc.bh);
- mark_inode_dirty(inode);
+ /*
+ * It's important to update i_size while still holding page
+ * lock: page writeout could otherwise come in and zero
+ * beyond i_size.
+ */
+ ext4_update_inode_size(inode, pos + copied);
+ }
+ unlock_page(page);
+ put_page(page);
+
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (likely(copied))
+ mark_inode_dirty(inode);
out:
- return copied;
+ /*
+ * If we didn't copy as much data as expected, we need to trim back
+ * size of xattr containing inline data.
+ */
+ if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ ext4_orphan_add(handle, inode);
+
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+ if (pos + len > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+ return ret ? ret : copied;
}
struct buffer_head *
@@ -796,17 +847,16 @@ ext4_journalled_write_inline_data(struct inode *inode,
* clear the inode state safely.
* 2. The inode has inline data, then we need to read the data, make it
* update and dirty so that ext4_da_writepages can handle it. We don't
- * need to start the journal since the file's metatdata isn't changed now.
+ * need to start the journal since the file's metadata isn't changed now.
*/
static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
struct inode *inode,
- unsigned flags,
void **fsdata)
{
int ret = 0, inline_size;
struct page *page;
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ page = grab_cache_page_write_begin(mapping, 0);
if (!page)
return -ENOMEM;
@@ -859,15 +909,15 @@ out:
int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep,
void **fsdata)
{
- int ret, inline_size;
+ int ret;
handle_t *handle;
struct page *page;
struct ext4_iloc iloc;
int retries = 0;
+ unsigned int flags;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -880,26 +930,14 @@ retry_journal:
goto out;
}
- inline_size = ext4_get_max_inline_size(inode);
-
- ret = -ENOSPC;
- if (inline_size >= pos + len) {
- ret = ext4_prepare_inline_data(handle, inode, pos + len);
- if (ret && ret != -ENOSPC)
- goto out_journal;
- }
-
- /*
- * We cannot recurse into the filesystem as the transaction
- * is already started.
- */
- flags |= AOP_FLAG_NOFS;
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out_journal;
if (ret == -ENOSPC) {
ext4_journal_stop(handle);
ret = ext4_da_convert_inline_data_to_extent(mapping,
inode,
- flags,
fsdata);
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -907,7 +945,13 @@ retry_journal:
goto out;
}
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ /*
+ * We cannot recurse into the filesystem as the transaction
+ * is already started.
+ */
+ flags = memalloc_nofs_save();
+ page = grab_cache_page_write_begin(mapping, 0);
+ memalloc_nofs_restore(flags);
if (!page) {
ret = -ENOMEM;
goto out_journal;
@@ -924,7 +968,8 @@ retry_journal:
if (ret < 0)
goto out_release_page;
}
- ret = ext4_journal_get_write_access(handle, iloc.bh);
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
+ EXT4_JTR_NONE);
if (ret)
goto out_release_page;
@@ -943,43 +988,6 @@ out:
return ret;
}
-int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
- unsigned len, unsigned copied,
- struct page *page)
-{
- int ret;
-
- ret = ext4_write_inline_data_end(inode, pos, len, copied, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- return ret;
- }
- copied = ret;
-
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_mutex.
- *
- * But it's important to update i_size while still holding page lock:
- * page writeout could otherwise come in and zero beyond i_size.
- */
- if (pos+copied > inode->i_size)
- i_size_write(inode, pos+copied);
- unlock_page(page);
- put_page(page);
-
- /*
- * Don't mark the inode dirty under page lock. First, it unnecessarily
- * makes the holding time of page lock longer. Second, it forces lock
- * ordering of page lock and transaction start for journaling
- * filesystems.
- */
- mark_inode_dirty(inode);
-
- return copied;
-}
-
#ifdef INLINE_DIR_DEBUG
void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
void *inline_start, int inline_size)
@@ -1029,10 +1037,11 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
return err;
BUFFER_TRACE(iloc->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, iloc->bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh,
+ EXT4_JTR_NONE);
if (err)
return err;
- ext4_insert_dentry(inode, de, inline_size, fname);
+ ext4_insert_dentry(dir, inode, de, inline_size, fname);
ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
@@ -1075,14 +1084,14 @@ static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
void *limit;
int de_len;
- de = (struct ext4_dir_entry_2 *)de_buf;
+ de = de_buf;
if (old_size) {
limit = de_buf + old_size;
do {
prev_de = de;
de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
de_buf += de_len;
- de = (struct ext4_dir_entry_2 *)de_buf;
+ de = de_buf;
} while (de_buf < limit);
prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
@@ -1101,7 +1110,7 @@ static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
int new_size = get_max_inline_xattr_value_size(dir, iloc);
- if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
+ if (new_size - old_size <= ext4_dir_rec_len(1, NULL))
return -ENOSPC;
ret = ext4_update_inline_data(handle, dir,
@@ -1120,7 +1129,15 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc,
void *buf, int inline_size)
{
- ext4_create_inline_data(handle, inode, inline_size);
+ int ret;
+
+ ret = ext4_create_inline_data(handle, inode, inline_size);
+ if (ret) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)",
+ inode->i_ino, ret);
+ return;
+ }
ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
@@ -1139,7 +1156,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
* First create "." and ".." and then copy the dir information
* back to the block.
*/
- de = (struct ext4_dir_entry_2 *)target;
+ de = target;
de = ext4_init_dot_dotdot(inode, de,
inode->i_sb->s_blocksize, csum_size,
le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
@@ -1224,7 +1241,8 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
}
lock_buffer(data_bh);
- error = ext4_journal_get_create_access(handle, data_bh);
+ error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh,
+ EXT4_JTR_NONE);
if (error) {
unlock_buffer(data_bh);
error = -EIO;
@@ -1261,7 +1279,7 @@ out:
int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
struct inode *dir, struct inode *inode)
{
- int ret, inline_size, no_expand;
+ int ret, ret2, inline_size, no_expand;
void *inline_start;
struct ext4_iloc iloc;
@@ -1315,7 +1333,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
out:
ext4_write_unlock_xattr(dir, &no_expand);
- ext4_mark_inode_dirty(handle, dir);
+ ret2 = ext4_mark_inode_dirty(handle, dir);
+ if (unlikely(ret2 && !ret))
+ ret = ret2;
brelse(iloc.bh);
return ret;
}
@@ -1379,8 +1399,8 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
fake.name_len = 1;
strcpy(fake.name, ".");
fake.rec_len = ext4_rec_len_to_disk(
- EXT4_DIR_REC_LEN(fake.name_len),
- inline_size);
+ ext4_dir_rec_len(fake.name_len, NULL),
+ inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
pos = EXT4_INLINE_DOTDOT_OFFSET;
@@ -1389,8 +1409,8 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
fake.name_len = 2;
strcpy(fake.name, "..");
fake.rec_len = ext4_rec_len_to_disk(
- EXT4_DIR_REC_LEN(fake.name_len),
- inline_size);
+ ext4_dir_rec_len(fake.name_len, NULL),
+ inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
pos = EXT4_INLINE_DOTDOT_SIZE;
@@ -1405,7 +1425,12 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
}
}
- ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ if (ext4_hash_in_dirent(dir)) {
+ hinfo->hash = EXT4_DIRENT_HASH(de);
+ hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
+ } else {
+ ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ }
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
@@ -1487,8 +1512,8 @@ int ext4_read_inline_dir(struct file *file,
* So we will use extra_offset and extra_size to indicate them
* during the inline dir iteration.
*/
- dotdot_offset = EXT4_DIR_REC_LEN(1);
- dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+ dotdot_offset = ext4_dir_rec_len(1, NULL);
+ dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL);
extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
extra_size = extra_offset + inline_size;
@@ -1523,7 +1548,7 @@ int ext4_read_inline_dir(struct file *file,
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len, extra_size)
- < EXT4_DIR_REC_LEN(1))
+ < ext4_dir_rec_len(1, NULL))
break;
i += ext4_rec_len_from_disk(de->rec_len,
extra_size);
@@ -1567,6 +1592,35 @@ out:
return ret;
}
+void *ext4_read_inline_link(struct inode *inode)
+{
+ struct ext4_iloc iloc;
+ int ret, inline_size;
+ void *link;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = -ENOMEM;
+ inline_size = ext4_get_inline_size(inode);
+ link = kmalloc(inline_size + 1, GFP_NOFS);
+ if (!link)
+ goto out;
+
+ ret = ext4_read_inline_data(inode, link, inline_size, &iloc);
+ if (ret < 0) {
+ kfree(link);
+ goto out;
+ }
+ nd_terminate_link(link, inode->i_size, ret);
+out:
+ if (ret < 0)
+ link = ERR_PTR(ret);
+ brelse(iloc.bh);
+ return link;
+}
+
struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
struct ext4_dir_entry_2 **parent_de,
int *retval)
@@ -1701,11 +1755,12 @@ int ext4_delete_inline_entry(handle_t *handle,
}
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto out;
- err = ext4_generic_delete_entry(handle, dir, de_del, bh,
+ err = ext4_generic_delete_entry(dir, de_del, bh,
inline_start, inline_size, 0);
if (err)
goto out;
@@ -1758,19 +1813,20 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
void *inline_pos;
unsigned int offset;
struct ext4_dir_entry_2 *de;
- bool ret = true;
+ bool ret = false;
err = ext4_get_inode_loc(dir, &iloc);
if (err) {
- ext4_set_errno(dir->i_sb, -err);
- EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
- err, dir->i_ino);
- return true;
+ EXT4_ERROR_INODE_ERR(dir, -err,
+ "error %d getting inode %lu block",
+ err, dir->i_ino);
+ return false;
}
down_read(&EXT4_I(dir)->xattr_sem);
if (!ext4_has_inline_data(dir)) {
*has_inline_data = 0;
+ ret = true;
goto out;
}
@@ -1779,7 +1835,6 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
ext4_warning(dir->i_sb,
"bad inline directory (dir #%lu) - no `..'",
dir->i_ino);
- ret = true;
goto out;
}
@@ -1798,16 +1853,15 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
dir->i_ino, le32_to_cpu(de->inode),
le16_to_cpu(de->rec_len), de->name_len,
inline_size);
- ret = true;
goto out;
}
if (le32_to_cpu(de->inode)) {
- ret = false;
goto out;
}
offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
}
+ ret = true;
out:
up_read(&EXT4_I(dir)->xattr_sem);
brelse(iloc.bh);
@@ -1857,47 +1911,6 @@ out:
return error;
}
-int ext4_inline_data_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- int *has_inline, __u64 start, __u64 len)
-{
- __u64 physical = 0;
- __u64 inline_len;
- __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
- FIEMAP_EXTENT_LAST;
- int error = 0;
- struct ext4_iloc iloc;
-
- down_read(&EXT4_I(inode)->xattr_sem);
- if (!ext4_has_inline_data(inode)) {
- *has_inline = 0;
- goto out;
- }
- inline_len = min_t(size_t, ext4_get_inline_size(inode),
- i_size_read(inode));
- if (start >= inline_len)
- goto out;
- if (start + len < inline_len)
- inline_len = start + len;
- inline_len -= start;
-
- error = ext4_get_inode_loc(inode, &iloc);
- if (error)
- goto out;
-
- physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
- physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
- physical += offsetof(struct ext4_inode, i_block);
-
- brelse(iloc.bh);
-out:
- up_read(&EXT4_I(inode)->xattr_sem);
- if (physical)
- error = fiemap_fill_next_extent(fieinfo, start, physical,
- inline_len, flags);
- return (error < 0 ? error : 0);
-}
-
int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
{
handle_t *handle;
@@ -1920,6 +1933,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
ext4_write_lock_xattr(inode, &no_expand);
if (!ext4_has_inline_data(inode)) {
+ ext4_write_unlock_xattr(inode, &no_expand);
*has_inline = 0;
ext4_journal_stop(handle);
return 0;
@@ -1937,6 +1951,23 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
EXT4_I(inode)->i_disksize = i_size;
if (i_size < inline_size) {
+ /*
+ * if there's inline data to truncate and this file was
+ * converted to extents after that inline data was written,
+ * the extent status cache must be cleared to avoid leaving
+ * behind stale delayed allocated extent entries
+ */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+retry:
+ err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ if (err == -ENOMEM) {
+ memalloc_retry_wait(GFP_ATOMIC);
+ goto retry;
+ }
+ if (err)
+ goto out_error;
+ }
+
/* Clear the content in the xattr space. */
if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
@@ -1959,8 +1990,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
i.value = value;
i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
- err = ext4_xattr_ibody_inline_set(handle, inode,
- &i, &is);
+ err = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (err)
goto out_error;
}
@@ -2005,6 +2035,18 @@ int ext4_convert_inline_data(struct inode *inode)
if (!ext4_has_inline_data(inode)) {
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
return 0;
+ } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ /*
+ * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is
+ * cleared. This means we are in the middle of moving of
+ * inline data to delay allocated block. Just force writeout
+ * here to finish conversion.
+ */
+ error = filemap_flush(inode->i_mapping);
+ if (error)
+ return error;
+ if (!ext4_has_inline_data(inode))
+ return 0;
}
needed_blocks = ext4_writepage_trans_blocks(inode);
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
index d62d802c9c12..7935ea6cf92c 100644
--- a/fs/ext4/inode-test.c
+++ b/fs/ext4/inode-test.c
@@ -80,6 +80,145 @@ struct timestamp_expectation {
bool lower_bound;
};
+static const struct timestamp_expectation test_data[] = {
+ {
+ .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -1LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {0LL, 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 6,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0xFFFFFFFF,
+ .expected = {.tv_sec = 0x300000000LL,
+ .tv_nsec = MAX_NANOSECONDS},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L},
+ }
+};
+
+static void timestamp_expectation_to_desc(const struct timestamp_expectation *t,
+ char *desc)
+{
+ strscpy(desc, t->test_case_name, KUNIT_PARAM_DESC_SIZE);
+}
+
+KUNIT_ARRAY_PARAM(ext4_inode, test_data, timestamp_expectation_to_desc);
+
static time64_t get_32bit_time(const struct timestamp_expectation * const test)
{
if (test->msb_set) {
@@ -101,166 +240,35 @@ static time64_t get_32bit_time(const struct timestamp_expectation * const test)
*/
static void inode_test_xtimestamp_decoding(struct kunit *test)
{
- const struct timestamp_expectation test_data[] = {
- {
- .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE,
- .msb_set = true,
- .lower_bound = true,
- .extra_bits = 0,
- .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE,
- .msb_set = true,
- .lower_bound = false,
- .extra_bits = 0,
- .expected = {.tv_sec = -1LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 0,
- .expected = {0LL, 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 0,
- .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NEG_LO_1_CASE,
- .msb_set = true,
- .lower_bound = true,
- .extra_bits = 1,
- .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NEG_LO_1_CASE,
- .msb_set = true,
- .lower_bound = false,
- .extra_bits = 1,
- .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 1,
- .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 1,
- .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NEG_HI_1_CASE,
- .msb_set = true,
- .lower_bound = true,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NEG_HI_1_CASE,
- .msb_set = true,
- .lower_bound = false,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 6,
- .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 0xFFFFFFFF,
- .expected = {.tv_sec = 0x300000000LL,
- .tv_nsec = MAX_NANOSECONDS},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 3,
- .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 3,
- .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L},
- }
- };
-
struct timespec64 timestamp;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(test_data); ++i) {
- timestamp.tv_sec = get_32bit_time(&test_data[i]);
- ext4_decode_extra_time(&timestamp,
- cpu_to_le32(test_data[i].extra_bits));
-
- KUNIT_EXPECT_EQ_MSG(test,
- test_data[i].expected.tv_sec,
- timestamp.tv_sec,
- CASE_NAME_FORMAT,
- test_data[i].test_case_name,
- test_data[i].msb_set,
- test_data[i].lower_bound,
- test_data[i].extra_bits);
- KUNIT_EXPECT_EQ_MSG(test,
- test_data[i].expected.tv_nsec,
- timestamp.tv_nsec,
- CASE_NAME_FORMAT,
- test_data[i].test_case_name,
- test_data[i].msb_set,
- test_data[i].lower_bound,
- test_data[i].extra_bits);
- }
+
+ struct timestamp_expectation *test_param =
+ (struct timestamp_expectation *)(test->param_value);
+
+ timestamp.tv_sec = get_32bit_time(test_param);
+ ext4_decode_extra_time(&timestamp,
+ cpu_to_le32(test_param->extra_bits));
+
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_param->expected.tv_sec,
+ timestamp.tv_sec,
+ CASE_NAME_FORMAT,
+ test_param->test_case_name,
+ test_param->msb_set,
+ test_param->lower_bound,
+ test_param->extra_bits);
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_param->expected.tv_nsec,
+ timestamp.tv_nsec,
+ CASE_NAME_FORMAT,
+ test_param->test_case_name,
+ test_param->msb_set,
+ test_param->lower_bound,
+ test_param->extra_bits);
}
static struct kunit_case ext4_inode_test_cases[] = {
- KUNIT_CASE(inode_test_xtimestamp_decoding),
+ KUNIT_CASE_PARAM(inode_test_xtimestamp_decoding, ext4_inode_gen_params),
{}
};
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fa0ff78dc033..2b5ef1b64249 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -20,6 +20,7 @@
*/
#include <linux/fs.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
@@ -101,8 +102,8 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
return provided == calculated;
}
-static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
- struct ext4_inode_info *ei)
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
__u32 csum;
@@ -135,10 +136,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static void ext4_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
@@ -175,16 +173,19 @@ void ext4_evict_inode(struct inode *inode)
*/
int extra_credits = 6;
struct ext4_xattr_inode_array *ea_inode_array = NULL;
+ bool freeze_protected = false;
trace_ext4_evict_inode(inode);
+ if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
+ ext4_evict_ea_inode(inode);
if (inode->i_nlink) {
/*
* When journalling data dirty buffers are tracked only in the
* journal. So although mm thinks everything is clean and
* ready for reaping the inode might still have some pages to
* write in the running transaction or waiting to be
- * checkpointed. Thus calling jbd2_journal_invalidatepage()
+ * checkpointed. Thus calling jbd2_journal_invalidate_folio()
* (via truncate_inode_pages()) to discard these buffers can
* cause data loss. Also even if we did not discard these
* buffers, we would have no way to find them after the inode
@@ -199,8 +200,7 @@ void ext4_evict_inode(struct inode *inode)
*/
if (inode->i_ino != EXT4_JOURNAL_INO &&
ext4_should_journal_data(inode) &&
- (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
- inode->i_data.nrpages) {
+ S_ISREG(inode->i_mode) && inode->i_data.nrpages) {
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
@@ -221,10 +221,25 @@ void ext4_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
/*
+ * For inodes with journalled data, transaction commit could have
+ * dirtied the inode. Flush worker is ignoring it because of I_FREEING
+ * flag but we still need to remove the inode from the writeback lists.
+ */
+ if (!list_empty_careful(&inode->i_io_list)) {
+ WARN_ON_ONCE(!ext4_should_journal_data(inode));
+ inode_io_list_del(inode);
+ }
+
+ /*
* Protect us against freezing - iput() caller didn't have to have any
- * protection against it
+ * protection against it. When we are in a running transaction though,
+ * we are already protected against freezing and we cannot grab further
+ * protection due to lock ordering constraints.
*/
- sb_start_intwrite(inode->i_sb);
+ if (!ext4_journal_current_handle()) {
+ sb_start_intwrite(inode->i_sb);
+ freeze_protected = true;
+ }
if (!IS_NOQUOTA(inode))
extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
@@ -243,7 +258,8 @@ void ext4_evict_inode(struct inode *inode)
* cleaned up.
*/
ext4_orphan_del(NULL, inode);
- sb_end_intwrite(inode->i_sb);
+ if (freeze_protected)
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
@@ -269,10 +285,9 @@ void ext4_evict_inode(struct inode *inode)
if (inode->i_blocks) {
err = ext4_truncate(inode);
if (err) {
- ext4_set_errno(inode->i_sb, -err);
- ext4_error(inode->i_sb,
- "couldn't truncate inode %lu (err %d)",
- inode->i_ino, err);
+ ext4_error_err(inode->i_sb, -err,
+ "couldn't truncate inode %lu (err %d)",
+ inode->i_ino, err);
goto stop_handle;
}
}
@@ -285,7 +300,8 @@ void ext4_evict_inode(struct inode *inode)
stop_handle:
ext4_journal_stop(handle);
ext4_orphan_del(NULL, inode);
- sb_end_intwrite(inode->i_sb);
+ if (freeze_protected)
+ sb_end_intwrite(inode->i_sb);
ext4_xattr_inode_array_free(ea_inode_array);
goto no_delete;
}
@@ -314,10 +330,13 @@ stop_handle:
else
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
- sb_end_intwrite(inode->i_sb);
+ if (freeze_protected)
+ sb_end_intwrite(inode->i_sb);
ext4_xattr_inode_array_free(ea_inode_array);
return;
no_delete:
+ if (!list_empty(&EXT4_I(inode)->i_fc_list))
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
}
@@ -353,7 +372,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
ei->i_reserved_data_blocks -= used;
percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ spin_unlock(&ei->i_block_reservation_lock);
/* Update quota subsystem for data blocks */
if (quota_claim)
@@ -374,7 +393,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
*/
if ((ei->i_reserved_data_blocks == 0) &&
!inode_is_open_for_write(inode))
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
}
static int __check_block_validity(struct inode *inode, const char *func,
@@ -385,8 +404,7 @@ static int __check_block_validity(struct inode *inode, const char *func,
(inode->i_ino ==
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
return 0;
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
- map->m_len)) {
+ if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
"(length %d)", (unsigned long) map->m_lblk,
@@ -433,11 +451,9 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
*/
down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, flags &
- EXT4_GET_BLOCKS_KEEP_SIZE);
+ retval = ext4_ext_map_blocks(handle, inode, map, 0);
} else {
- retval = ext4_ind_map_blocks(handle, inode, map, flags &
- EXT4_GET_BLOCKS_KEEP_SIZE);
+ retval = ext4_ind_map_blocks(handle, inode, map, 0);
}
up_read((&EXT4_I(inode)->i_data_sem));
@@ -494,9 +510,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
#endif
map->m_flags = 0;
- ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
- "logical block %lu\n", inode->i_ino, flags, map->m_len,
- (unsigned long) map->m_lblk);
+ ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
+ flags, map->m_len, (unsigned long) map->m_lblk);
/*
* ext4_map_blocks returns an int, and m_len is an unsigned int
@@ -509,7 +524,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -529,12 +545,21 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
} else {
BUG();
}
+
+ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
+ return retval;
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(handle, inode, map,
&orig_map, flags);
#endif
goto found;
}
+ /*
+ * In the query cache no-wait mode, nothing we can do more if we
+ * cannot find extent in the cache.
+ */
+ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
+ return 0;
/*
* Try to see if we can get the block without requesting a new
@@ -542,11 +567,9 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
*/
down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, flags &
- EXT4_GET_BLOCKS_KEEP_SIZE);
+ retval = ext4_ext_map_blocks(handle, inode, map, 0);
} else {
- retval = ext4_ind_map_blocks(handle, inode, map, flags &
- EXT4_GET_BLOCKS_KEEP_SIZE);
+ retval = ext4_ind_map_blocks(handle, inode, map, 0);
}
if (retval > 0) {
unsigned int status;
@@ -727,6 +750,12 @@ out_sem:
return ret;
}
}
+ if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
+ map->m_flags & EXT4_MAP_MAPPED))
+ ext4_fc_track_range(handle, inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1);
+ if (retval < 0)
+ ext_debug(inode, "failed with err %d\n", retval);
return retval;
}
@@ -802,7 +831,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
inode->i_ino, create);
return _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
}
/* Maximum number of blocks we map for direct IO at once. */
@@ -817,9 +846,12 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
struct ext4_map_blocks map;
struct buffer_head *bh;
int create = map_flags & EXT4_GET_BLOCKS_CREATE;
+ bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT;
int err;
- J_ASSERT(handle != NULL || create == 0);
+ ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ || handle != NULL || create == 0);
+ ASSERT(create == 0 || !nowait);
map.m_lblk = block;
map.m_len = 1;
@@ -830,12 +862,16 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
if (err < 0)
return ERR_PTR(err);
+ if (nowait)
+ return sb_find_get_block(inode->i_sb, map.m_pblk);
+
bh = sb_getblk(inode->i_sb, map.m_pblk);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
if (map.m_flags & EXT4_MAP_NEW) {
- J_ASSERT(create != 0);
- J_ASSERT(handle != NULL);
+ ASSERT(create != 0);
+ ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ || (handle != NULL));
/*
* Now that we do not always journal data, we should
@@ -846,7 +882,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
*/
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (unlikely(err)) {
unlock_buffer(bh);
goto errout;
@@ -872,18 +909,20 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int map_flags)
{
struct buffer_head *bh;
+ int ret;
bh = ext4_getblk(handle, inode, block, map_flags);
if (IS_ERR(bh))
return bh;
if (!bh || ext4_buffer_uptodate(bh))
return bh;
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return bh;
- put_bh(bh);
- return ERR_PTR(-EIO);
+
+ ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
+ if (ret) {
+ put_bh(bh);
+ return ERR_PTR(ret);
+ }
+ return bh;
}
/* Read a contiguous batch of blocks. */
@@ -904,8 +943,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
for (i = 0; i < bh_count; i++)
/* Note that NULL bhs[i] is valid because of holes. */
if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
- &bhs[i]);
+ ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);
if (!wait)
return 0;
@@ -930,12 +968,12 @@ out_brelse:
return err;
}
-int ext4_walk_page_buffers(handle_t *handle,
+int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
- int (*fn)(handle_t *handle,
+ int (*fn)(handle_t *handle, struct inode *inode,
struct buffer_head *bh))
{
struct buffer_head *bh;
@@ -954,7 +992,7 @@ int ext4_walk_page_buffers(handle_t *handle,
*partial = 1;
continue;
}
- err = (*fn)(handle, bh);
+ err = (*fn)(handle, inode, bh);
if (!ret)
ret = err;
}
@@ -985,7 +1023,7 @@ int ext4_walk_page_buffers(handle_t *handle,
* is elevated. We'll still have enough credits for the tiny quotafile
* write.
*/
-int do_journal_get_write_access(handle_t *handle,
+int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
int dirty = buffer_dirty(bh);
@@ -1004,7 +1042,8 @@ int do_journal_get_write_access(handle_t *handle,
if (dirty)
clear_buffer_dirty(bh);
BUFFER_TRACE(bh, "get write access");
- ret = ext4_journal_get_write_access(handle, bh);
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (!ret && dirty)
ret = ext4_handle_dirty_metadata(handle, NULL, bh);
return ret;
@@ -1042,8 +1081,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
+ set_buffer_uptodate(bh);
}
continue;
}
@@ -1068,14 +1106,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
}
}
if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
+ set_buffer_uptodate(bh);
continue;
}
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ext4_read_bh_lock(bh, 0, false);
wait[nr_wait++] = bh;
}
}
@@ -1089,7 +1126,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
}
if (unlikely(err)) {
page_zero_new_buffers(page, from, to);
- } else if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
+ } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
for (i = 0; i < nr_wait; i++) {
int err2;
@@ -1107,7 +1144,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
#endif
static int ext4_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
@@ -1121,7 +1158,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- trace_ext4_write_begin(inode, pos, len, flags);
+ trace_ext4_write_begin(inode, pos, len);
/*
* Reserve one block more for addition to orphan list in case
* we allocate blocks but write fails for some reason
@@ -1133,7 +1170,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
- flags, pagep);
+ pagep);
if (ret < 0)
return ret;
if (ret == 1)
@@ -1148,9 +1185,16 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
* the page (if needed) without using GFP_NOFS.
*/
retry_grab:
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
+ /*
+ * The same as page allocation, we prealloc buffer heads before
+ * starting the handle.
+ */
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, inode->i_sb->s_blocksize, 0);
+
unlock_page(page);
retry_journal:
@@ -1186,8 +1230,8 @@ retry_journal:
ret = __block_write_begin(page, pos, len, ext4_get_block);
#endif
if (!ret && ext4_should_journal_data(inode)) {
- ret = ext4_walk_page_buffers(handle, page_buffers(page),
- from, to, NULL,
+ ret = ext4_walk_page_buffers(handle, inode,
+ page_buffers(page), from, to, NULL,
do_journal_get_write_access);
}
@@ -1199,7 +1243,7 @@ retry_journal:
/*
* __block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
+ * i_size_read because we hold i_rwsem.
*
* Add inode to orphan list in case we crash before
* truncate finishes
@@ -1231,7 +1275,8 @@ retry_journal:
}
/* For write_end() in data=journal mode */
-static int write_end_fn(handle_t *handle, struct buffer_head *bh)
+static int write_end_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
int ret;
if (!buffer_mapped(bh) || buffer_freed(bh))
@@ -1260,22 +1305,14 @@ static int ext4_write_end(struct file *file,
loff_t old_size = inode->i_size;
int ret = 0, ret2;
int i_size_changed = 0;
- int inline_data = ext4_has_inline_data(inode);
bool verity = ext4_verity_in_progress(inode);
trace_ext4_write_end(inode, pos, len, copied);
- if (inline_data) {
- ret = ext4_write_inline_data_end(inode, pos, len,
- copied, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- goto errout;
- }
- copied = ret;
- } else
- copied = block_write_end(file, mapping, pos,
- len, copied, page, fsdata);
+
+ if (ext4_has_inline_data(inode))
+ return ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
/*
* it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
@@ -1296,8 +1333,8 @@ static int ext4_write_end(struct file *file,
* ordering of page lock and transaction start for journaling
* filesystems.
*/
- if (i_size_changed || inline_data)
- ext4_mark_inode_dirty(handle, inode);
+ if (i_size_changed)
+ ret = ext4_mark_inode_dirty(handle, inode);
if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
@@ -1305,7 +1342,7 @@ static int ext4_write_end(struct file *file,
* inode->i_size. So truncate them
*/
ext4_orphan_add(handle, inode);
-errout:
+
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1330,6 +1367,7 @@ errout:
* to call ext4_handle_dirty_metadata() instead.
*/
static void ext4_journalled_zero_new_buffers(handle_t *handle,
+ struct inode *inode,
struct page *page,
unsigned from, unsigned to)
{
@@ -1348,7 +1386,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
size = min(to, block_end) - start;
zero_user(page, start, size);
- write_end_fn(handle, bh);
+ write_end_fn(handle, inode, bh);
}
clear_buffer_new(bh);
}
@@ -1370,7 +1408,6 @@ static int ext4_journalled_write_end(struct file *file,
int partial = 0;
unsigned from, to;
int size_changed = 0;
- int inline_data = ext4_has_inline_data(inode);
bool verity = ext4_verity_in_progress(inode);
trace_ext4_journalled_write_end(inode, pos, len, copied);
@@ -1379,24 +1416,18 @@ static int ext4_journalled_write_end(struct file *file,
BUG_ON(!ext4_handle_valid(handle));
- if (inline_data) {
- ret = ext4_write_inline_data_end(inode, pos, len,
- copied, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- goto errout;
- }
- copied = ret;
- } else if (unlikely(copied < len) && !PageUptodate(page)) {
+ if (ext4_has_inline_data(inode))
+ return ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+ if (unlikely(copied < len) && !PageUptodate(page)) {
copied = 0;
- ext4_journalled_zero_new_buffers(handle, page, from, to);
+ ext4_journalled_zero_new_buffers(handle, inode, page, from, to);
} else {
if (unlikely(copied < len))
- ext4_journalled_zero_new_buffers(handle, page,
+ ext4_journalled_zero_new_buffers(handle, inode, page,
from + copied, to);
- ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
- from + copied, &partial,
+ ret = ext4_walk_page_buffers(handle, inode, page_buffers(page),
+ from, from + copied, &partial,
write_end_fn);
if (!partial)
SetPageUptodate(page);
@@ -1411,7 +1442,7 @@ static int ext4_journalled_write_end(struct file *file,
if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
- if (size_changed || inline_data) {
+ if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
if (!ret)
ret = ret2;
@@ -1424,7 +1455,6 @@ static int ext4_journalled_write_end(struct file *file,
*/
ext4_orphan_add(handle, inode);
-errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1527,14 +1557,15 @@ struct mpage_da_data {
struct ext4_map_blocks map;
struct ext4_io_submit io_submit; /* IO submission data */
unsigned int do_map:1;
+ unsigned int scanned_until_end:1;
};
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
bool invalidate)
{
- int nr_pages, i;
+ unsigned nr, i;
pgoff_t index, end;
- struct pagevec pvec;
+ struct folio_batch fbatch;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
@@ -1542,34 +1573,47 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
if (mpd->first_page >= mpd->next_page)
return;
+ mpd->scanned_until_end = 0;
index = mpd->first_page;
end = mpd->next_page - 1;
if (invalidate) {
ext4_lblk_t start, last;
start = index << (PAGE_SHIFT - inode->i_blkbits);
last = end << (PAGE_SHIFT - inode->i_blkbits);
+
+ /*
+ * avoid racing with extent status tree scans made by
+ * ext4_insert_delayed_block()
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_es_remove_extent(inode, start, last - start + 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
}
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
while (index <= end) {
- nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
- if (nr_pages == 0)
+ nr = filemap_get_folios(mapping, &index, end, &fbatch);
+ if (nr == 0)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr; i++) {
+ struct folio *folio = fbatch.folios[i];
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
+ if (folio->index < mpd->first_page)
+ continue;
+ if (folio->index + folio_nr_pages(folio) - 1 > end)
+ continue;
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(folio_test_writeback(folio));
if (invalidate) {
- if (page_mapped(page))
- clear_page_dirty_for_io(page);
- block_invalidatepage(page, 0, PAGE_SIZE);
- ClearPageUptodate(page);
+ if (folio_mapped(folio))
+ folio_clear_dirty_for_io(folio);
+ block_invalidate_folio(folio, 0,
+ folio_size(folio));
+ folio_clear_uptodate(folio);
}
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
}
}
@@ -1595,7 +1639,8 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
}
@@ -1616,6 +1661,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
bool allocated = false;
+ bool reserved = false;
/*
* If the cluster containing lblk is shared with a delayed,
@@ -1632,6 +1678,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
goto errout;
+ reserved = true;
} else { /* bigalloc */
if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
if (!ext4_es_scan_clu(inode,
@@ -1644,6 +1691,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
goto errout;
+ reserved = true;
} else {
allocated = true;
}
@@ -1654,6 +1702,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
}
ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+ if (ret && reserved)
+ ext4_da_release_space(inode, 1);
errout:
return ret;
@@ -1682,8 +1732,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
invalid_block = ~0;
map->m_flags = 0;
- ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
- "logical block %lu\n", inode->i_ino, map->m_len,
+ ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
/* Lookup extent status tree firstly */
@@ -1828,28 +1877,16 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return 0;
}
-static int bget_one(handle_t *handle, struct buffer_head *bh)
-{
- get_bh(bh);
- return 0;
-}
-
-static int bput_one(handle_t *handle, struct buffer_head *bh)
-{
- put_bh(bh);
- return 0;
-}
-
static int __ext4_journalled_writepage(struct page *page,
unsigned int len)
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
- struct buffer_head *page_bufs = NULL;
handle_t *handle = NULL;
int ret = 0, err = 0;
int inline_data = ext4_has_inline_data(inode);
struct buffer_head *inode_bh = NULL;
+ loff_t size;
ClearPageChecked(page);
@@ -1859,14 +1896,6 @@ static int __ext4_journalled_writepage(struct page *page,
inode_bh = ext4_journalled_write_inline_data(inode, len, page);
if (inode_bh == NULL)
goto out;
- } else {
- page_bufs = page_buffers(page);
- if (!page_bufs) {
- BUG();
- goto out;
- }
- ext4_walk_page_buffers(handle, page_bufs, 0, len,
- NULL, bget_one);
}
/*
* We need to release the page lock before we start the
@@ -1887,7 +1916,8 @@ static int __ext4_journalled_writepage(struct page *page,
lock_page(page);
put_page(page);
- if (page->mapping != mapping) {
+ size = i_size_read(inode);
+ if (page->mapping != mapping || page_offset(page) > size) {
/* The page got truncated from under us */
ext4_journal_stop(handle);
ret = 0;
@@ -1897,22 +1927,29 @@ static int __ext4_journalled_writepage(struct page *page,
if (inline_data) {
ret = ext4_mark_inode_dirty(handle, inode);
} else {
- ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
- do_journal_get_write_access);
+ struct buffer_head *page_bufs = page_buffers(page);
- err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
- write_end_fn);
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
+ else
+ len = PAGE_SIZE;
+
+ ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
+ NULL, do_journal_get_write_access);
+
+ err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
+ NULL, write_end_fn);
}
if (ret == 0)
ret = err;
+ err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
+ if (ret == 0)
+ ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
- if (!ext4_has_inline_data(inode))
- ext4_walk_page_buffers(NULL, page_bufs, 0, len,
- NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
unlock_page(page);
@@ -1965,6 +2002,7 @@ out_no_pagelock:
static int ext4_writepage(struct page *page,
struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
loff_t size;
unsigned int len;
@@ -1974,8 +2012,8 @@ static int ext4_writepage(struct page *page,
bool keep_towrite = false;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
- ext4_invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
+ folio_invalidate(folio, 0, folio_size(folio));
+ folio_unlock(folio);
return -EIO;
}
@@ -1987,6 +2025,15 @@ static int ext4_writepage(struct page *page,
else
len = PAGE_SIZE;
+ /* Should never happen but for bugs in other kernel subsystems */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(inode,
+ "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ return 0;
+ }
+
page_bufs = page_buffers(page);
/*
* We cannot do block allocation or other extent handling in this
@@ -2005,7 +2052,7 @@ static int ext4_writepage(struct page *page,
* for the extremely common case, this is an optimization that
* skips a useless round trip through ext4_bio_write_page().
*/
- if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL,
ext4_bh_delay_or_unwritten)) {
redirty_page_for_writepage(wbc, page);
if ((current->flags & PF_MEMALLOC) ||
@@ -2037,7 +2084,7 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return -ENOMEM;
}
- ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
+ ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite);
ext4_io_submit(&io_submit);
/* Drop io_end reference we got from init */
ext4_put_io_end_defer(io_submit.io_end);
@@ -2071,7 +2118,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
- err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, false);
if (!err)
mpd->wbc->nr_to_write--;
mpd->first_page++;
@@ -2079,7 +2126,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
return err;
}
-#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))
/*
* mballoc gives us at most this number of blocks...
@@ -2189,7 +2236,11 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
if (err < 0)
return err;
}
- return lblk < blocks;
+ if (lblk >= blocks) {
+ mpd->scanned_until_end = 1;
+ return 0;
+ }
+ return 1;
}
/*
@@ -2232,7 +2283,6 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
io_end_vec->size += io_end_size;
- io_end_size = 0;
err = mpage_process_page_bufs(mpd, head, bh, lblk);
if (err > 0)
@@ -2243,7 +2293,7 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
err = PTR_ERR(io_end_vec);
goto out;
}
- io_end_vec->offset = mpd->map.m_lblk << blkbits;
+ io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
}
*map_bh = true;
goto out;
@@ -2257,7 +2307,6 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
} while (lblk++, (bh = bh->b_this_page) != head);
io_end_vec->size += io_end_size;
- io_end_size = 0;
*map_bh = false;
out:
*m_lblk = lblk;
@@ -2281,8 +2330,8 @@ out:
*/
static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
{
- struct pagevec pvec;
- int nr_pages, i;
+ struct folio_batch fbatch;
+ unsigned nr, i;
struct inode *inode = mpd->inode;
int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
@@ -2296,14 +2345,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
lblk = start << bpp_bits;
pblock = mpd->map.m_pblk;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
while (start <= end) {
- nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
- &start, end);
- if (nr_pages == 0)
+ nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch);
+ if (nr == 0)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr; i++) {
+ struct page *page = &fbatch.folios[i]->page;
err = mpage_process_page(mpd, page, &lblk, &pblock,
&map_bh);
@@ -2312,21 +2360,21 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
* mapping, or maybe the page was submitted for IO.
* So we return to call further extent mapping.
*/
- if (err < 0 || map_bh == true)
+ if (err < 0 || map_bh)
goto out;
/* Page fully mapped - let IO run! */
err = mpage_submit_page(mpd, page);
if (err < 0)
goto out;
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
}
/* Extent fully mapped and matches with page boundary. We are done. */
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
return 0;
out:
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
return err;
}
@@ -2359,7 +2407,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (map->m_flags & (1 << BH_Delay))
+ if (map->m_flags & BIT(BH_Delay))
get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
@@ -2420,7 +2468,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
struct super_block *sb = inode->i_sb;
if (ext4_forced_shutdown(EXT4_SB(sb)) ||
- EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
goto invalidate_dirty_pages;
/*
* Let the uper layers retry transient errors.
@@ -2478,10 +2526,9 @@ update_disksize:
up_write(&EXT4_I(inode)->i_data_sem);
err2 = ext4_mark_inode_dirty(handle, inode);
if (err2) {
- ext4_set_errno(inode->i_sb, -err2);
- ext4_error(inode->i_sb,
- "Failed to mark inode %lu dirty",
- inode->i_ino);
+ ext4_error_err(inode->i_sb, -err2,
+ "Failed to mark inode %lu dirty",
+ inode->i_ino);
}
if (!err)
err = err2;
@@ -2548,7 +2595,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
tag);
if (nr_pages == 0)
- goto out;
+ break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -2587,6 +2634,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
+ /*
+ * Should never happen but for buggy code in
+ * other subsystems that call
+ * set_page_dirty() without properly warning
+ * the file system first. See [1] for more
+ * information.
+ *
+ * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
+ */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ continue;
+ }
+
if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
@@ -2603,6 +2666,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
pagevec_release(&pvec);
cond_resched();
}
+ mpd->scanned_until_end = 1;
return 0;
out:
pagevec_release(&pvec);
@@ -2621,7 +2685,6 @@ static int ext4_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
- bool done;
struct blk_plug plug;
bool give_up_on_write = false;
@@ -2655,7 +2718,7 @@ static int ext4_writepages(struct address_space *mapping,
* the stack trace.
*/
if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
- sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+ ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
ret = -EROFS;
goto out_writepages;
}
@@ -2707,7 +2770,6 @@ static int ext4_writepages(struct address_space *mapping,
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
- done = false;
blk_start_plug(&plug);
/*
@@ -2717,6 +2779,7 @@ retry:
* started.
*/
mpd.do_map = 0;
+ mpd.scanned_until_end = 0;
mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
if (!mpd.io_submit.io_end) {
ret = -ENOMEM;
@@ -2732,7 +2795,7 @@ retry:
if (ret < 0)
goto unplug;
- while (!done && mpd.first_page <= mpd.last_page) {
+ while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
/* For each extent of pages we use new io_end */
mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
if (!mpd.io_submit.io_end) {
@@ -2767,26 +2830,15 @@ retry:
trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
ret = mpage_prepare_extent_to_map(&mpd);
- if (!ret) {
- if (mpd.map.m_len)
- ret = mpage_map_and_submit_extent(handle, &mpd,
+ if (!ret && mpd.map.m_len)
+ ret = mpage_map_and_submit_extent(handle, &mpd,
&give_up_on_write);
- else {
- /*
- * We scanned the whole range (or exhausted
- * nr_to_write), submitted what was mapped and
- * didn't find anything needing mapping. We are
- * done.
- */
- done = true;
- }
- }
/*
* Caution: If the handle is synchronous,
* ext4_journal_stop() can wait for transaction commit
* to finish which may depend on writeback of pages to
* complete or on page lock to be released. In that
- * case, we have to wait until after after we have
+ * case, we have to wait until after we have
* submitted all the IO, released page locks we hold,
* and dropped io_end reference (for extent conversion
* to be able to complete) before stopping the handle.
@@ -2908,46 +2960,30 @@ static int ext4_nonda_switch(struct super_block *sb)
return 0;
}
-/* We always reserve for an inode update; the superblock could be there too */
-static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
-{
- if (likely(ext4_has_feature_large_file(inode->i_sb)))
- return 1;
-
- if (pos + len <= 0x7fffffffULL)
- return 1;
-
- /* We might need to update the superblock to set LARGE_FILE */
- return 2;
-}
-
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret, retries = 0;
struct page *page;
pgoff_t index;
struct inode *inode = mapping->host;
- handle_t *handle;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
index = pos >> PAGE_SHIFT;
- if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) ||
- ext4_verity_in_progress(inode)) {
+ if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
return ext4_write_begin(file, mapping, pos,
- len, flags, pagep, fsdata);
+ len, pagep, fsdata);
}
*fsdata = (void *)0;
- trace_ext4_da_write_begin(inode, pos, len, flags);
+ trace_ext4_da_write_begin(inode, pos, len);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
- ret = ext4_da_write_inline_data_begin(mapping, inode,
- pos, len, flags,
+ ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
pagep, fsdata);
if (ret < 0)
return ret;
@@ -2955,41 +2991,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
return 0;
}
- /*
- * grab_cache_page_write_begin() can take a long time if the
- * system is thrashing due to memory pressure, or if the page
- * is being written back. So grab it first before we start
- * the transaction handle. This also allows us to allocate
- * the page (if needed) without using GFP_NOFS.
- */
-retry_grab:
- page = grab_cache_page_write_begin(mapping, index, flags);
+retry:
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
- unlock_page(page);
- /*
- * With delayed allocation, we don't log the i_disksize update
- * if there is delayed block allocation. But we still need
- * to journalling the i_disksize update if writes to the end
- * of file which has an already mapped buffer.
- */
-retry_journal:
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- ext4_da_write_credits(inode, pos, len));
- if (IS_ERR(handle)) {
- put_page(page);
- return PTR_ERR(handle);
- }
-
- lock_page(page);
- if (page->mapping != mapping) {
- /* The page got truncated from under us */
- unlock_page(page);
- put_page(page);
- ext4_journal_stop(handle);
- goto retry_grab;
- }
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
@@ -3001,20 +3007,18 @@ retry_journal:
#endif
if (ret < 0) {
unlock_page(page);
- ext4_journal_stop(handle);
+ put_page(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
+ * i_size_read because we hold inode lock.
*/
if (pos + len > inode->i_size)
ext4_truncate_failed_write(inode);
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry_journal;
-
- put_page(page);
+ goto retry;
return ret;
}
@@ -3051,8 +3055,6 @@ static int ext4_da_write_end(struct file *file,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
- int ret = 0, ret2;
- handle_t *handle = ext4_journal_current_handle();
loff_t new_i_size;
unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
@@ -3062,44 +3064,36 @@ static int ext4_da_write_end(struct file *file,
len, copied, page, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
+
+ if (write_mode != CONVERT_INLINE_DATA &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+ ext4_has_inline_data(inode))
+ return ext4_write_inline_data_end(inode, pos, len, copied, page);
+
start = pos & (PAGE_SIZE - 1);
end = start + copied - 1;
/*
- * generic_write_end() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
+ * Since we are holding inode lock, we are sure i_disksize <=
+ * i_size. We also know that if i_disksize < i_size, there are
+ * delalloc writes pending in the range upto i_size. If the end of
+ * the current write is <= i_size, there's no need to touch
+ * i_disksize since writeback will push i_disksize upto i_size
+ * eventually. If the end of the current write is > i_size and
+ * inside an allocated block (ext4_da_should_update_i_disksize()
+ * check), we need to update i_disksize here as neither
+ * ext4_writepage() nor certain ext4_writepages() paths not
+ * allocating blocks update i_disksize.
+ *
+ * Note that we defer inode dirtying to generic_write_end() /
+ * ext4_da_write_inline_data_end().
*/
new_i_size = pos + copied;
- if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
- if (ext4_has_inline_data(inode) ||
- ext4_da_should_update_i_disksize(page, end)) {
- ext4_update_i_disksize(inode, new_i_size);
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
- */
- ext4_mark_inode_dirty(handle, inode);
- }
- }
+ if (copied && new_i_size > inode->i_size &&
+ ext4_da_should_update_i_disksize(page, end))
+ ext4_update_i_disksize(inode, new_i_size);
- if (write_mode != CONVERT_INLINE_DATA &&
- ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
- ext4_has_inline_data(inode))
- ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
- page);
- else
- ret2 = generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
-
- copied = ret2;
- if (ret2 < 0)
- ret = ret2;
- ret2 = ext4_journal_stop(handle);
- if (!ret)
- ret = ret2;
-
- return ret ? ret : copied;
+ return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
}
/*
@@ -3164,13 +3158,15 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
journal_t *journal;
+ sector_t ret = 0;
int err;
+ inode_lock_shared(inode);
/*
* We can get here for an inline file via the FIBMAP ioctl
*/
if (ext4_has_inline_data(inode))
- return 0;
+ goto out;
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
test_opt(inode->i_sb, DELALLOC)) {
@@ -3205,18 +3201,23 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
journal = EXT4_JOURNAL(inode);
jbd2_journal_lock_updates(journal);
- err = jbd2_journal_flush(journal);
+ err = jbd2_journal_flush(journal, 0);
jbd2_journal_unlock_updates(journal);
if (err)
- return 0;
+ goto out;
}
- return generic_block_bmap(mapping, block, ext4_get_block);
+ ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
+
+out:
+ inode_unlock_shared(inode);
+ return ret;
}
-static int ext4_readpage(struct file *file, struct page *page)
+static int ext4_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
int ret = -EAGAIN;
struct inode *inode = page->mapping->host;
@@ -3226,83 +3227,85 @@ static int ext4_readpage(struct file *file, struct page *page)
ret = ext4_readpage_inline(inode, page);
if (ret == -EAGAIN)
- return ext4_mpage_readpages(page->mapping, NULL, page, 1,
- false);
+ return ext4_mpage_readpages(inode, NULL, page);
return ret;
}
-static int
-ext4_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void ext4_readahead(struct readahead_control *rac)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = rac->mapping->host;
- /* If the file has inline data, no need to do readpages. */
+ /* If the file has inline data, no need to do readahead. */
if (ext4_has_inline_data(inode))
- return 0;
+ return;
- return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true);
+ ext4_mpage_readpages(inode, rac, NULL);
}
-static void ext4_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ext4_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- trace_ext4_invalidatepage(page, offset, length);
+ trace_ext4_invalidate_folio(folio, offset, length);
/* No journalling happens on data buffers when this function is used */
- WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio)));
- block_invalidatepage(page, offset, length);
+ block_invalidate_folio(folio, offset, length);
}
-static int __ext4_journalled_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static int __ext4_journalled_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+ journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
- trace_ext4_journalled_invalidatepage(page, offset, length);
+ trace_ext4_journalled_invalidate_folio(folio, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0 && length == PAGE_SIZE)
- ClearPageChecked(page);
+ if (offset == 0 && length == folio_size(folio))
+ folio_clear_checked(folio);
- return jbd2_journal_invalidatepage(journal, page, offset, length);
+ return jbd2_journal_invalidate_folio(journal, folio, offset, length);
}
/* Wrapper for aops... */
-static void ext4_journalled_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static void ext4_journalled_invalidate_folio(struct folio *folio,
+ size_t offset,
+ size_t length)
{
- WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
+ WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0);
}
-static int ext4_releasepage(struct page *page, gfp_t wait)
+static bool ext4_release_folio(struct folio *folio, gfp_t wait)
{
- journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+ journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
- trace_ext4_releasepage(page);
+ trace_ext4_releasepage(&folio->page);
/* Page has dirty journalled data -> cannot release */
- if (PageChecked(page))
- return 0;
+ if (folio_test_checked(folio))
+ return false;
if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ return jbd2_journal_try_to_free_buffers(journal, folio);
else
- return try_to_free_buffers(page);
+ return try_to_free_buffers(folio);
}
static bool ext4_inode_datasync_dirty(struct inode *inode)
{
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (journal)
- return !jbd2_transaction_committed(journal,
- EXT4_I(inode)->i_datasync_tid);
+ if (journal) {
+ if (jbd2_transaction_committed(journal,
+ EXT4_I(inode)->i_datasync_tid))
+ return false;
+ if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+ return !list_empty(&EXT4_I(inode)->i_fc_list);
+ return true;
+ }
+
/* Any metadata buffers to write? */
if (!list_empty(&inode->i_mapping->private_list))
return true;
@@ -3311,7 +3314,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
struct ext4_map_blocks *map, loff_t offset,
- loff_t length)
+ loff_t length, unsigned int flags)
{
u8 blkbits = inode->i_blkbits;
@@ -3328,11 +3331,17 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+ else
+ iomap->bdev = inode->i_sb->s_bdev;
iomap->offset = (u64) map->m_lblk << blkbits;
iomap->length = (u64) map->m_len << blkbits;
+ if ((map->m_flags & EXT4_MAP_MAPPED) &&
+ !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ iomap->flags |= IOMAP_F_MERGED;
+
/*
* Flags passed to ext4_map_blocks() for direct I/O writes can result
* in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
@@ -3345,9 +3354,13 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_UNWRITTEN) {
iomap->type = IOMAP_UNWRITTEN;
iomap->addr = (u64) map->m_pblk << blkbits;
+ if (flags & IOMAP_DAX)
+ iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
} else if (map->m_flags & EXT4_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
iomap->addr = (u64) map->m_pblk << blkbits;
+ if (flags & IOMAP_DAX)
+ iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
} else {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
@@ -3384,8 +3397,8 @@ retry:
* DAX and direct I/O are the only two operations that are currently
* supported with IOMAP_WRITE.
*/
- WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
- if (IS_DAX(inode))
+ WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT)));
+ if (flags & IOMAP_DAX)
m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
/*
* We use i_size instead of i_disksize here because delalloc writeback
@@ -3393,7 +3406,7 @@ retry:
* i_disksize out to i_size. This could be beyond where direct I/O is
* happening and thus expose allocated blocks to direct I/O reads.
*/
- else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode))
+ else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
m_flags = EXT4_GET_BLOCKS_CREATE;
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -3436,15 +3449,34 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
- if (flags & IOMAP_WRITE)
+ if (flags & IOMAP_WRITE) {
+ /*
+ * We check here if the blocks are already allocated, then we
+ * don't need to start a journal txn and we can directly return
+ * the mapping information. This could boost performance
+ * especially in multi-threaded overwrite requests.
+ */
+ if (offset + length <= i_size_read(inode)) {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
+ goto out;
+ }
ret = ext4_iomap_alloc(inode, &map, flags);
- else
+ } else {
ret = ext4_map_blocks(NULL, inode, &map, 0);
+ }
if (ret < 0)
return ret;
+out:
+ /*
+ * When inline encryption is enabled, sometimes I/O to an encrypted file
+ * has to be broken up to guarantee DUN contiguity. Handle this by
+ * limiting the length of the mapping returned.
+ */
+ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
- ext4_set_iomap(inode, iomap, &map, offset, length);
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
}
@@ -3542,13 +3574,29 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ /*
+ * Fiemap callers may call for offset beyond s_bitmap_maxbytes.
+ * So handle it here itself instead of querying ext4_map_blocks().
+ * Since ext4_map_blocks() will warn about it and will return
+ * -EIO error.
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (offset >= sbi->s_bitmap_maxbytes) {
+ map.m_flags = 0;
+ goto set_iomap;
+ }
+ }
+
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0)
return ret;
if (ret == 0)
delalloc = ext4_iomap_is_delalloc(inode, &map);
- ext4_set_iomap(inode, iomap, &map, offset, length);
+set_iomap:
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);
if (delalloc && iomap->type == IOMAP_HOLE)
iomap->type = IOMAP_DELALLOC;
@@ -3560,87 +3608,100 @@ const struct iomap_ops ext4_iomap_report_ops = {
};
/*
- * Pages can be marked dirty completely asynchronously from ext4's journalling
- * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
- * much here because ->set_page_dirty is called under VFS locks. The page is
- * not necessarily locked.
+ * Whenever the folio is being dirtied, corresponding buffers should already
+ * be attached to the transaction (we take care of this in ext4_page_mkwrite()
+ * and ext4_write_begin()). However we cannot move buffers to dirty transaction
+ * lists here because ->dirty_folio is called under VFS locks and the folio
+ * is not necessarily locked.
*
- * We cannot just dirty the page and leave attached buffers clean, because the
+ * We cannot just dirty the folio and leave attached buffers clean, because the
* buffers' dirty state is "definitive". We cannot just set the buffers dirty
* or jbddirty because all the journalling code will explode.
*
- * So what we do is to mark the page "pending dirty" and next time writepage
+ * So what we do is to mark the folio "pending dirty" and next time writepage
* is called, propagate that into the buffers appropriately.
*/
-static int ext4_journalled_set_page_dirty(struct page *page)
+static bool ext4_journalled_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- SetPageChecked(page);
- return __set_page_dirty_nobuffers(page);
+ WARN_ON_ONCE(!folio_buffers(folio));
+ folio_set_checked(folio);
+ return filemap_dirty_folio(mapping, folio);
}
-static int ext4_set_page_dirty(struct page *page)
+static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page));
- WARN_ON_ONCE(!page_has_buffers(page));
- return __set_page_dirty_buffers(page);
+ WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio));
+ WARN_ON_ONCE(!folio_buffers(folio));
+ return block_dirty_folio(mapping, folio);
+}
+
+static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
+ struct file *file, sector_t *span)
+{
+ return iomap_swapfile_activate(sis, file, span,
+ &ext4_iomap_report_ops);
}
static const struct address_space_operations ext4_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .read_folio = ext4_read_folio,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
- .set_page_dirty = ext4_set_page_dirty,
+ .dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
+ .invalidate_folio = ext4_invalidate_folio,
+ .release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_journalled_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .read_folio = ext4_read_folio,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
- .set_page_dirty = ext4_journalled_set_page_dirty,
+ .dirty_folio = ext4_journalled_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_journalled_invalidatepage,
- .releasepage = ext4_releasepage,
+ .invalidate_folio = ext4_journalled_invalidate_folio,
+ .release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_da_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .read_folio = ext4_read_folio,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
- .set_page_dirty = ext4_set_page_dirty,
+ .dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
+ .invalidate_folio = ext4_invalidate_folio,
+ .release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = noop_set_page_dirty,
+ .dirty_folio = noop_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = noop_invalidatepage,
+ .swap_activate = ext4_iomap_swap_activate,
};
void ext4_set_aops(struct inode *inode)
@@ -3714,13 +3775,10 @@ static int __ext4_block_zero_page_range(handle_t *handle,
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
- wait_on_buffer(bh);
- /* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
+ err = ext4_read_bh_lock(bh, 0, true);
+ if (err)
goto unlock;
- if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
@@ -3733,7 +3791,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
}
if (ext4_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto unlock;
}
@@ -3761,7 +3820,7 @@ unlock:
* starting from file offset 'from'. The range to be zero'd must
* be contained with in one block. If the specified range exceeds
* the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
+ * that corresponds to 'from'
*/
static int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
@@ -3779,8 +3838,8 @@ static int ext4_block_zero_page_range(handle_t *handle,
length = max;
if (IS_DAX(inode)) {
- return iomap_zero_range(inode, from, length, NULL,
- &ext4_iomap_ops);
+ return dax_zero_range(inode, from, length, NULL,
+ &ext4_iomap_ops);
}
return __ext4_block_zero_page_range(handle, mapping, from, length);
}
@@ -3868,6 +3927,8 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
loff_t len)
{
handle_t *handle;
+ int ret;
+
loff_t size = i_size_read(inode);
WARN_ON(!inode_is_locked(inode));
@@ -3881,26 +3942,25 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
if (IS_ERR(handle))
return PTR_ERR(handle);
ext4_update_i_disksize(inode, size);
- ext4_mark_inode_dirty(handle, inode);
+ ret = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
- return 0;
+ return ret;
}
-static void ext4_wait_dax_page(struct ext4_inode_info *ei)
+static void ext4_wait_dax_page(struct inode *inode)
{
- up_write(&ei->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
schedule();
- down_write(&ei->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
}
int ext4_break_layouts(struct inode *inode)
{
- struct ext4_inode_info *ei = EXT4_I(inode);
struct page *page;
int error;
- if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
+ if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
return -EINVAL;
do {
@@ -3911,7 +3971,7 @@ int ext4_break_layouts(struct inode *inode)
error = ___wait_var_event(&page->_refcount,
atomic_read(&page->_refcount) == 1,
TASK_INTERRUPTIBLE, 0, 0,
- ext4_wait_dax_page(ei));
+ ext4_wait_dax_page(inode));
} while (error == 0);
return error;
@@ -3928,27 +3988,20 @@ int ext4_break_layouts(struct inode *inode)
* Returns: 0 on success or negative on failure
*/
-int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset;
+ loff_t first_block_offset, last_block_offset, max_length;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
unsigned int credits;
- int ret = 0;
+ int ret = 0, ret2 = 0;
trace_ext4_punch_hole(inode, offset, length, 0);
- ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
- if (ext4_has_inline_data(inode)) {
- down_write(&EXT4_I(inode)->i_mmap_sem);
- ret = ext4_convert_inline_data(inode);
- up_write(&EXT4_I(inode)->i_mmap_sem);
- if (ret)
- return ret;
- }
-
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
@@ -3976,6 +4029,14 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
offset;
}
+ /*
+ * For punch hole the length + offset needs to be within one block
+ * before last range. Adjust the length if it goes beyond that limit.
+ */
+ max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
+ if (offset + length > max_length)
+ length = max_length - offset;
+
if (offset & (sb->s_blocksize - 1) ||
(offset + length) & (sb->s_blocksize - 1)) {
/*
@@ -3988,14 +4049,18 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
- /* Wait all existing dio workers, newcomers will block on i_mutex */
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
@@ -4037,7 +4102,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (stop_block > first_block) {
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_es_remove_extent(inode, first_block,
stop_block - first_block);
@@ -4055,17 +4120,20 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
up_write(&EXT4_I(inode)->i_data_sem);
}
+ ext4_fc_track_range(handle, inode, first_block, stop_block);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
inode->i_mtime = inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret2))
+ ret = ret2;
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
ext4_journal_stop(handle);
out_dio:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
@@ -4128,23 +4196,21 @@ int ext4_truncate(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int credits;
- int err = 0;
+ int err = 0, err2;
handle_t *handle;
struct address_space *mapping = inode->i_mapping;
/*
* There is a possibility that we're either freeing the inode
* or it's a completely new inode. In those cases we might not
- * have i_mutex locked because it's not necessary.
+ * have i_rwsem locked because it's not necessary.
*/
if (!(inode->i_state & (I_NEW|I_FREEING)))
WARN_ON(!inode_is_locked(inode));
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
- return 0;
-
- ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ goto out_trace;
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
@@ -4153,16 +4219,14 @@ int ext4_truncate(struct inode *inode)
int has_inline = 1;
err = ext4_inline_data_truncate(inode, &has_inline);
- if (err)
- return err;
- if (has_inline)
- return 0;
+ if (err || has_inline)
+ goto out_trace;
}
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
if (ext4_inode_attach_jinode(inode) < 0)
- return 0;
+ goto out_trace;
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4171,8 +4235,10 @@ int ext4_truncate(struct inode *inode)
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_trace;
+ }
if (inode->i_size & (inode->i_sb->s_blocksize - 1))
ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4192,7 +4258,7 @@ int ext4_truncate(struct inode *inode)
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
err = ext4_ext_truncate(handle, inode);
@@ -4218,35 +4284,185 @@ out_stop:
ext4_orphan_del(handle, inode);
inode->i_mtime = inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err2 && !err))
+ err = err2;
ext4_journal_stop(handle);
+out_trace:
trace_ext4_truncate_exit(inode);
return err;
}
+static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
+{
+ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ return inode_peek_iversion_raw(inode);
+ else
+ return inode_peek_iversion(inode);
+}
+
+static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
+ struct ext4_inode_info *ei)
+{
+ struct inode *inode = &(ei->vfs_inode);
+ u64 i_blocks = READ_ONCE(inode->i_blocks);
+ struct super_block *sb = inode->i_sb;
+
+ if (i_blocks <= ~0U) {
+ /*
+ * i_blocks can be represented in a 32 bit variable
+ * as multiple of 512 bytes
+ */
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = 0;
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ return 0;
+ }
+
+ /*
+ * This should never happen since sb->s_maxbytes should not have
+ * allowed this, sb->s_maxbytes was set according to the huge_file
+ * feature in ext4_fill_super().
+ */
+ if (!ext4_has_feature_huge_file(sb))
+ return -EFSCORRUPTED;
+
+ if (i_blocks <= 0xffffffffffffULL) {
+ /*
+ * i_blocks can be represented in a 48 bit variable
+ * as multiple of 512 bytes
+ */
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ } else {
+ ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ /* i_block is stored in file system block size */
+ i_blocks = i_blocks >> (inode->i_blkbits - 9);
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+ }
+ return 0;
+}
+
+static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ uid_t i_uid;
+ gid_t i_gid;
+ projid_t i_projid;
+ int block;
+ int err;
+
+ err = ext4_inode_blocks_set(raw_inode, ei);
+
+ raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
+ i_projid = from_kprojid(&init_user_ns, ei->i_projid);
+ if (!(test_opt(inode->i_sb, NO_UID32))) {
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
+ /*
+ * Fix up interoperability with old kernels. Otherwise,
+ * old inodes get re-used with the upper 16 bits of the
+ * uid/gid intact.
+ */
+ if (ei->i_dtime && list_empty(&ei->i_orphan)) {
+ raw_inode->i_uid_high = 0;
+ raw_inode->i_gid_high = 0;
+ } else {
+ raw_inode->i_uid_high =
+ cpu_to_le16(high_16_bits(i_uid));
+ raw_inode->i_gid_high =
+ cpu_to_le16(high_16_bits(i_gid));
+ }
+ } else {
+ raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
+ raw_inode->i_uid_high = 0;
+ raw_inode->i_gid_high = 0;
+ }
+ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+
+ EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+ EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
+
+ raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
+ raw_inode->i_file_acl_high =
+ cpu_to_le16(ei->i_file_acl >> 32);
+ raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+ ext4_isize_set(raw_inode, ei->i_disksize);
+
+ raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ if (old_valid_dev(inode->i_rdev)) {
+ raw_inode->i_block[0] =
+ cpu_to_le32(old_encode_dev(inode->i_rdev));
+ raw_inode->i_block[1] = 0;
+ } else {
+ raw_inode->i_block[0] = 0;
+ raw_inode->i_block[1] =
+ cpu_to_le32(new_encode_dev(inode->i_rdev));
+ raw_inode->i_block[2] = 0;
+ }
+ } else if (!ext4_has_inline_data(inode)) {
+ for (block = 0; block < EXT4_N_BLOCKS; block++)
+ raw_inode->i_block[block] = ei->i_data[block];
+ }
+
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ u64 ivers = ext4_inode_peek_iversion(inode);
+
+ raw_inode->i_disk_version = cpu_to_le32(ivers);
+ if (ei->i_extra_isize) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ raw_inode->i_version_hi =
+ cpu_to_le32(ivers >> 32);
+ raw_inode->i_extra_isize =
+ cpu_to_le16(ei->i_extra_isize);
+ }
+ }
+
+ if (i_projid != EXT4_DEF_PROJID &&
+ !ext4_has_feature_project(inode->i_sb))
+ err = err ?: -EFSCORRUPTED;
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+ raw_inode->i_projid = cpu_to_le32(i_projid);
+
+ ext4_inode_csum_set(inode, raw_inode, ei);
+ return err;
+}
+
/*
* ext4_get_inode_loc returns with an extra refcount against the inode's
- * underlying buffer_head on success. If 'in_mem' is true, we have all
- * data in memory that is needed to recreate the on-disk version of this
- * inode.
+ * underlying buffer_head on success. If we pass 'inode' and it does not
+ * have in-inode xattr, we have all inode data in memory that is needed
+ * to recreate the on-disk version of this inode.
*/
-static int __ext4_get_inode_loc(struct inode *inode,
- struct ext4_iloc *iloc, int in_mem)
+static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
+ struct inode *inode, struct ext4_iloc *iloc,
+ ext4_fsblk_t *ret_block)
{
struct ext4_group_desc *gdp;
struct buffer_head *bh;
- struct super_block *sb = inode->i_sb;
ext4_fsblk_t block;
struct blk_plug plug;
int inodes_per_block, inode_offset;
iloc->bh = NULL;
- if (inode->i_ino < EXT4_ROOT_INO ||
- inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+ if (ino < EXT4_ROOT_INO ||
+ ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
return -EFSCORRUPTED;
- iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
if (!gdp)
return -EIO;
@@ -4255,7 +4471,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
* Figure out the offset within the block group inode table
*/
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
- inode_offset = ((inode->i_ino - 1) %
+ inode_offset = ((ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
@@ -4263,129 +4479,153 @@ static int __ext4_get_inode_loc(struct inode *inode,
bh = sb_getblk(sb, block);
if (unlikely(!bh))
return -ENOMEM;
- if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
- goto simulate_eio;
- if (!buffer_uptodate(bh)) {
- lock_buffer(bh);
+ if (ext4_buffer_uptodate(bh))
+ goto has_buffer;
- /*
- * If the buffer has the write error flag, we have failed
- * to write out another inode in the same block. In this
- * case, we don't have to read the block because we may
- * read the old inode data successfully.
- */
- if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
- set_buffer_uptodate(bh);
-
- if (buffer_uptodate(bh)) {
- /* someone brought it uptodate while we waited */
- unlock_buffer(bh);
- goto has_buffer;
- }
+ lock_buffer(bh);
+ if (ext4_buffer_uptodate(bh)) {
+ /* Someone brought it uptodate while we waited */
+ unlock_buffer(bh);
+ goto has_buffer;
+ }
- /*
- * If we have all information of the inode in memory and this
- * is the only valid inode in the block, we need not read the
- * block.
- */
- if (in_mem) {
- struct buffer_head *bitmap_bh;
- int i, start;
+ /*
+ * If we have all information of the inode in memory and this
+ * is the only valid inode in the block, we need not read the
+ * block.
+ */
+ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+ struct buffer_head *bitmap_bh;
+ int i, start;
- start = inode_offset & ~(inodes_per_block - 1);
+ start = inode_offset & ~(inodes_per_block - 1);
- /* Is the inode bitmap in cache? */
- bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
- if (unlikely(!bitmap_bh))
- goto make_io;
+ /* Is the inode bitmap in cache? */
+ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
+ if (unlikely(!bitmap_bh))
+ goto make_io;
- /*
- * If the inode bitmap isn't in cache then the
- * optimisation may end up performing two reads instead
- * of one, so skip it.
- */
- if (!buffer_uptodate(bitmap_bh)) {
- brelse(bitmap_bh);
- goto make_io;
- }
- for (i = start; i < start + inodes_per_block; i++) {
- if (i == inode_offset)
- continue;
- if (ext4_test_bit(i, bitmap_bh->b_data))
- break;
- }
+ /*
+ * If the inode bitmap isn't in cache then the
+ * optimisation may end up performing two reads instead
+ * of one, so skip it.
+ */
+ if (!buffer_uptodate(bitmap_bh)) {
brelse(bitmap_bh);
- if (i == start + inodes_per_block) {
- /* all other inodes are free, so skip I/O */
- memset(bh->b_data, 0, bh->b_size);
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- goto has_buffer;
- }
+ goto make_io;
}
+ for (i = start; i < start + inodes_per_block; i++) {
+ if (i == inode_offset)
+ continue;
+ if (ext4_test_bit(i, bitmap_bh->b_data))
+ break;
+ }
+ brelse(bitmap_bh);
+ if (i == start + inodes_per_block) {
+ struct ext4_inode *raw_inode =
+ (struct ext4_inode *) (bh->b_data + iloc->offset);
+
+ /* all other inodes are free, so skip I/O */
+ memset(bh->b_data, 0, bh->b_size);
+ if (!ext4_test_inode_state(inode, EXT4_STATE_NEW))
+ ext4_fill_raw_inode(inode, raw_inode);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ goto has_buffer;
+ }
+ }
make_io:
- /*
- * If we need to do any I/O, try to pre-readahead extra
- * blocks from the inode table.
- */
- blk_start_plug(&plug);
- if (EXT4_SB(sb)->s_inode_readahead_blks) {
- ext4_fsblk_t b, end, table;
- unsigned num;
- __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
-
- table = ext4_inode_table(sb, gdp);
- /* s_inode_readahead_blks is always a power of 2 */
- b = block & ~((ext4_fsblk_t) ra_blks - 1);
- if (table > b)
- b = table;
- end = b + ra_blks;
- num = EXT4_INODES_PER_GROUP(sb);
- if (ext4_has_group_desc_csum(sb))
- num -= ext4_itable_unused_count(sb, gdp);
- table += num / inodes_per_block;
- if (end > table)
- end = table;
- while (b <= end)
- sb_breadahead(sb, b++);
- }
+ /*
+ * If we need to do any I/O, try to pre-readahead extra
+ * blocks from the inode table.
+ */
+ blk_start_plug(&plug);
+ if (EXT4_SB(sb)->s_inode_readahead_blks) {
+ ext4_fsblk_t b, end, table;
+ unsigned num;
+ __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
+
+ table = ext4_inode_table(sb, gdp);
+ /* s_inode_readahead_blks is always a power of 2 */
+ b = block & ~((ext4_fsblk_t) ra_blks - 1);
+ if (table > b)
+ b = table;
+ end = b + ra_blks;
+ num = EXT4_INODES_PER_GROUP(sb);
+ if (ext4_has_group_desc_csum(sb))
+ num -= ext4_itable_unused_count(sb, gdp);
+ table += num / inodes_per_block;
+ if (end > table)
+ end = table;
+ while (b <= end)
+ ext4_sb_breadahead_unmovable(sb, b++);
+ }
- /*
- * There are other valid inodes in the buffer, this inode
- * has in-inode xattrs, or we don't have this inode in memory.
- * Read the block from disk.
- */
- trace_ext4_load_inode(inode);
- get_bh(bh);
- bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
- blk_finish_plug(&plug);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- simulate_eio:
- ext4_set_errno(inode->i_sb, EIO);
- EXT4_ERROR_INODE_BLOCK(inode, block,
- "unable to read itable block");
- brelse(bh);
- return -EIO;
- }
+ /*
+ * There are other valid inodes in the buffer, this inode
+ * has in-inode xattrs, or we don't have this inode in memory.
+ * Read the block from disk.
+ */
+ trace_ext4_load_inode(sb, ino);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
+ blk_finish_plug(&plug);
+ wait_on_buffer(bh);
+ ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO);
+ if (!buffer_uptodate(bh)) {
+ if (ret_block)
+ *ret_block = block;
+ brelse(bh);
+ return -EIO;
}
has_buffer:
iloc->bh = bh;
return 0;
}
+static int __ext4_get_inode_loc_noinmem(struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ ext4_fsblk_t err_blk = 0;
+ int ret;
+
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
+ &err_blk);
+
+ if (ret == -EIO)
+ ext4_error_inode_block(inode, err_blk, EIO,
+ "unable to read itable block");
+
+ return ret;
+}
+
int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
- /* We have all inode data except xattrs in memory here. */
- return __ext4_get_inode_loc(inode, iloc,
- !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
+ ext4_fsblk_t err_blk = 0;
+ int ret;
+
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
+ &err_blk);
+
+ if (ret == -EIO)
+ ext4_error_inode_block(inode, err_blk, EIO,
+ "unable to read itable block");
+
+ return ret;
+}
+
+
+int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+ struct ext4_iloc *iloc)
+{
+ return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
}
-static bool ext4_should_use_dax(struct inode *inode)
+static bool ext4_should_enable_dax(struct inode *inode)
{
- if (!test_opt(inode->i_sb, DAX))
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (test_opt2(inode->i_sb, DAX_NEVER))
return false;
if (!S_ISREG(inode->i_mode))
return false;
@@ -4397,14 +4637,21 @@ static bool ext4_should_use_dax(struct inode *inode)
return false;
if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
return false;
- return true;
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
+ return false;
+ if (test_opt(inode->i_sb, DAX_ALWAYS))
+ return true;
+
+ return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}
-void ext4_set_inode_flags(struct inode *inode)
+void ext4_set_inode_flags(struct inode *inode, bool init)
{
unsigned int flags = EXT4_I(inode)->i_flags;
unsigned int new_fl = 0;
+ WARN_ON_ONCE(IS_DAX(inode) && init);
+
if (flags & EXT4_SYNC_FL)
new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
@@ -4415,8 +4662,13 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (ext4_should_use_dax(inode))
+
+ /* Because of the way inode_set_flags() works we must preserve S_DAX
+ * here if already set. */
+ new_fl |= (inode->i_flags & S_DAX);
+ if (init && ext4_should_enable_dax(inode))
new_fl |= S_DAX;
+
if (flags & EXT4_ENCRYPT_FL)
new_fl |= S_ENCRYPTED;
if (flags & EXT4_CASEFOLD_FL)
@@ -4457,8 +4709,7 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
__le32 *magic = (void *)raw_inode +
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
- EXT4_INODE_SIZE(inode->i_sb) &&
+ if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
return ext4_find_inline_data_nolock(inode);
@@ -4487,13 +4738,6 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
else
inode_set_iversion_queried(inode, val);
}
-static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
-{
- if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return inode_peek_iversion_raw(inode);
- else
- return inode_peek_iversion(inode);
-}
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
@@ -4502,6 +4746,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct inode *inode;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
@@ -4512,12 +4757,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
projid_t i_projid;
if ((!(flags & EXT4_IGET_SPECIAL) &&
- (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) ||
+ ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
+ ino == le32_to_cpu(es->s_usr_quota_inum) ||
+ ino == le32_to_cpu(es->s_grp_quota_inum) ||
+ ino == le32_to_cpu(es->s_prj_quota_inum) ||
+ ino == le32_to_cpu(es->s_orphan_file_inum))) ||
(ino < EXT4_ROOT_INO) ||
- (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
+ (ino > le32_to_cpu(es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
return ERR_PTR(-ESTALE);
- __ext4_error(sb, function, line,
+ __ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
"inode #%lu: comm %s: iget: illegal inode #",
ino, current->comm);
return ERR_PTR(-EFSCORRUPTED);
@@ -4532,7 +4781,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei = EXT4_I(inode);
iloc.bh = NULL;
- ret = __ext4_get_inode_loc(inode, &iloc, 0);
+ ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
if (ret < 0)
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
@@ -4578,11 +4827,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
sizeof(gen));
}
- if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
- ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
- ext4_set_errno(inode->i_sb, EFSBADCRC);
- ext4_error_inode(inode, function, line, 0,
- "iget: checksum invalid");
+ if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
+ ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
+ (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
+ ext4_error_inode_err(inode, function, line, 0,
+ EFSBADCRC, "iget: checksum invalid");
ret = -EFSBADCRC;
goto bad_inode;
}
@@ -4631,7 +4880,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
if (ext4_has_feature_64bit(sb))
@@ -4670,6 +4919,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
for (block = 0; block < EXT4_N_BLOCKS; block++)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ ext4_fc_init_inode(&ei->vfs_inode);
/*
* Set transaction id's of transactions that have to be committed
@@ -4727,7 +4977,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = 0;
if (ei->i_file_acl &&
- !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+ !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad extended attribute block %llu",
ei->i_file_acl);
@@ -4735,9 +4985,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
goto bad_inode;
} else if (!ext4_has_inline_data(inode)) {
/* validate the block references in the inode */
- if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode))) {
+ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode)))) {
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_check_inode(inode);
else
@@ -4765,7 +5016,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
if (IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations;
- ext4_set_aops(inode);
} else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_link = (char *)ei->i_data;
inode->i_op = &ext4_fast_symlink_inode_operations;
@@ -4773,9 +5023,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
sizeof(ei->i_data) - 1);
} else {
inode->i_op = &ext4_symlink_inode_operations;
- ext4_set_aops(inode);
}
- inode_nohighmem(inode);
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
@@ -4807,80 +5055,37 @@ bad_inode:
return ERR_PTR(ret);
}
-static int ext4_inode_blocks_set(handle_t *handle,
- struct ext4_inode *raw_inode,
- struct ext4_inode_info *ei)
+static void __ext4_update_other_inode_time(struct super_block *sb,
+ unsigned long orig_ino,
+ unsigned long ino,
+ struct ext4_inode *raw_inode)
{
- struct inode *inode = &(ei->vfs_inode);
- u64 i_blocks = inode->i_blocks;
- struct super_block *sb = inode->i_sb;
-
- if (i_blocks <= ~0U) {
- /*
- * i_blocks can be represented in a 32 bit variable
- * as multiple of 512 bytes
- */
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = 0;
- ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- return 0;
- }
- if (!ext4_has_feature_huge_file(sb))
- return -EFBIG;
-
- if (i_blocks <= 0xffffffffffffULL) {
- /*
- * i_blocks can be represented in a 48 bit variable
- * as multiple of 512 bytes
- */
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- } else {
- ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- /* i_block is stored in file system block size */
- i_blocks = i_blocks >> (inode->i_blkbits - 9);
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- }
- return 0;
-}
+ struct inode *inode;
-struct other_inode {
- unsigned long orig_ino;
- struct ext4_inode *raw_inode;
-};
+ inode = find_inode_by_ino_rcu(sb, ino);
+ if (!inode)
+ return;
-static int other_inode_match(struct inode * inode, unsigned long ino,
- void *data)
-{
- struct other_inode *oi = (struct other_inode *) data;
+ if (!inode_is_dirtytime_only(inode))
+ return;
- if ((inode->i_ino != ino) ||
- (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_INODE)) ||
- ((inode->i_state & I_DIRTY_TIME) == 0))
- return 0;
spin_lock(&inode->i_lock);
- if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_INODE)) == 0) &&
- (inode->i_state & I_DIRTY_TIME)) {
+ if (inode_is_dirtytime_only(inode)) {
struct ext4_inode_info *ei = EXT4_I(inode);
- inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
- EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
- ext4_inode_csum_set(inode, oi->raw_inode, ei);
+ EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+ EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ ext4_inode_csum_set(inode, raw_inode, ei);
spin_unlock(&ei->i_raw_lock);
- trace_ext4_other_inode_update_time(inode, oi->orig_ino);
- return -1;
+ trace_ext4_other_inode_update_time(inode, orig_ino);
+ return;
}
spin_unlock(&inode->i_lock);
- return -1;
}
/*
@@ -4890,24 +5095,24 @@ static int other_inode_match(struct inode * inode, unsigned long ino,
static void ext4_update_other_inodes_time(struct super_block *sb,
unsigned long orig_ino, char *buf)
{
- struct other_inode oi;
unsigned long ino;
int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
int inode_size = EXT4_INODE_SIZE(sb);
- oi.orig_ino = orig_ino;
/*
* Calculate the first inode in the inode table block. Inode
* numbers are one-based. That is, the first inode in a block
* (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
*/
ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
+ rcu_read_lock();
for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
if (ino == orig_ino)
continue;
- oi.raw_inode = (struct ext4_inode *) buf;
- (void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+ __ext4_update_other_inode_time(sb, orig_ino, ino,
+ (struct ext4_inode *)buf);
}
+ rcu_read_unlock();
}
/*
@@ -4925,134 +5130,62 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
struct super_block *sb = inode->i_sb;
- int err = 0, rc, block;
+ int err;
int need_datasync = 0, set_large_file = 0;
- uid_t i_uid;
- gid_t i_gid;
- projid_t i_projid;
spin_lock(&ei->i_raw_lock);
- /* For fields not tracked in the in-memory inode,
- * initialise them to zero for new inodes. */
+ /*
+ * For fields not tracked in the in-memory inode, initialise them
+ * to zero for new inodes.
+ */
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
- raw_inode->i_mode = cpu_to_le16(inode->i_mode);
- i_uid = i_uid_read(inode);
- i_gid = i_gid_read(inode);
- i_projid = from_kprojid(&init_user_ns, ei->i_projid);
- if (!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
-/*
- * Fix up interoperability with old kernels. Otherwise, old inodes get
- * re-used with the upper 16 bits of the uid/gid intact
- */
- if (ei->i_dtime && list_empty(&ei->i_orphan)) {
- raw_inode->i_uid_high = 0;
- raw_inode->i_gid_high = 0;
- } else {
- raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(i_uid));
- raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(i_gid));
- }
- } else {
- raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
- raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
- raw_inode->i_uid_high = 0;
- raw_inode->i_gid_high = 0;
- }
- raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-
- EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
- EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
-
- err = ext4_inode_blocks_set(handle, raw_inode, ei);
- if (err) {
- spin_unlock(&ei->i_raw_lock);
- goto out_brelse;
- }
- raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
- if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
- raw_inode->i_file_acl_high =
- cpu_to_le16(ei->i_file_acl >> 32);
- raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode))
need_datasync = 1;
- }
if (ei->i_disksize > 0x7fffffffULL) {
if (!ext4_has_feature_large_file(sb) ||
- EXT4_SB(sb)->s_es->s_rev_level ==
- cpu_to_le32(EXT4_GOOD_OLD_REV))
+ EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV))
set_large_file = 1;
}
- raw_inode->i_generation = cpu_to_le32(inode->i_generation);
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (old_valid_dev(inode->i_rdev)) {
- raw_inode->i_block[0] =
- cpu_to_le32(old_encode_dev(inode->i_rdev));
- raw_inode->i_block[1] = 0;
- } else {
- raw_inode->i_block[0] = 0;
- raw_inode->i_block[1] =
- cpu_to_le32(new_encode_dev(inode->i_rdev));
- raw_inode->i_block[2] = 0;
- }
- } else if (!ext4_has_inline_data(inode)) {
- for (block = 0; block < EXT4_N_BLOCKS; block++)
- raw_inode->i_block[block] = ei->i_data[block];
- }
- if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
- u64 ivers = ext4_inode_peek_iversion(inode);
-
- raw_inode->i_disk_version = cpu_to_le32(ivers);
- if (ei->i_extra_isize) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- raw_inode->i_version_hi =
- cpu_to_le32(ivers >> 32);
- raw_inode->i_extra_isize =
- cpu_to_le16(ei->i_extra_isize);
- }
+ err = ext4_fill_raw_inode(inode, raw_inode);
+ spin_unlock(&ei->i_raw_lock);
+ if (err) {
+ EXT4_ERROR_INODE(inode, "corrupted inode contents");
+ goto out_brelse;
}
- BUG_ON(!ext4_has_feature_project(inode->i_sb) &&
- i_projid != EXT4_DEF_PROJID);
-
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
- EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
- raw_inode->i_projid = cpu_to_le32(i_projid);
-
- ext4_inode_csum_set(inode, raw_inode, ei);
- spin_unlock(&ei->i_raw_lock);
if (inode->i_sb->s_flags & SB_LAZYTIME)
ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
bh->b_data);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, NULL, bh);
- if (!err)
- err = rc;
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out_error;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
if (set_large_file) {
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb,
+ EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE);
if (err)
- goto out_brelse;
+ goto out_error;
+ lock_buffer(EXT4_SB(sb)->s_sbh);
ext4_set_feature_large_file(sb);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(EXT4_SB(sb)->s_sbh);
ext4_handle_sync(handle);
- err = ext4_handle_dirty_super(handle, sb);
+ err = ext4_handle_dirty_metadata(handle, NULL,
+ EXT4_SB(sb)->s_sbh);
}
ext4_update_inode_fsync_trans(handle, inode, need_datasync);
+out_error:
+ ext4_std_error(inode->i_sb, err);
out_brelse:
brelse(bh);
- ext4_std_error(inode->i_sb, err);
return err;
}
@@ -5103,7 +5236,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (EXT4_SB(inode->i_sb)->s_journal) {
if (ext4_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+ ext4_debug("called recursively, non-PF_MEMALLOC!\n");
dump_stack();
return -EIO;
}
@@ -5116,12 +5249,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
- err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+ err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
EXT4_I(inode)->i_sync_tid);
} else {
struct ext4_iloc iloc;
- err = __ext4_get_inode_loc(inode, &iloc, 0);
+ err = __ext4_get_inode_loc_noinmem(inode, &iloc);
if (err)
return err;
/*
@@ -5131,9 +5264,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
- ext4_set_errno(inode->i_sb, EIO);
- EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
- "IO error syncing inode");
+ ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
+ "IO error syncing inode");
err = -EIO;
}
brelse(iloc.bh);
@@ -5142,13 +5274,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
- * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
- * buffers that are attached to a page stradding i_size and are undergoing
+ * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate
+ * buffers that are attached to a folio straddling i_size and are undergoing
* commit. In that case we have to wait for commit to finish and try again.
*/
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
- struct page *page;
unsigned offset;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = 0;
@@ -5156,25 +5287,25 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
offset = inode->i_size & (PAGE_SIZE - 1);
/*
- * If the page is fully truncated, we don't need to wait for any commit
- * (and we even should not as __ext4_journalled_invalidatepage() may
- * strip all buffers from the page but keep the page dirty which can then
- * confuse e.g. concurrent ext4_writepage() seeing dirty page without
+ * If the folio is fully truncated, we don't need to wait for any commit
+ * (and we even should not as __ext4_journalled_invalidate_folio() may
+ * strip all buffers from the folio but keep the folio dirty which can then
+ * confuse e.g. concurrent ext4_writepage() seeing dirty folio without
* buffers). Also we don't need to wait for any commit if all buffers in
- * the page remain valid. This is most beneficial for the common case of
+ * the folio remain valid. This is most beneficial for the common case of
* blocksize == PAGESIZE.
*/
if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
return;
while (1) {
- page = find_lock_page(inode->i_mapping,
+ struct folio *folio = filemap_lock_folio(inode->i_mapping,
inode->i_size >> PAGE_SHIFT);
- if (!page)
+ if (!folio)
return;
- ret = __ext4_journalled_invalidatepage(page, offset,
- PAGE_SIZE - offset);
- unlock_page(page);
- put_page(page);
+ ret = __ext4_journalled_invalidate_folio(folio, offset,
+ folio_size(folio) - offset);
+ folio_unlock(folio);
+ folio_put(folio);
if (ret != -EBUSY)
return;
commit_tid = 0;
@@ -5209,14 +5340,16 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
* transaction are already on disk (truncate waits for pages under
* writeback).
*
- * Called with inode->i_mutex down.
+ * Called with inode->i_rwsem down.
*/
-int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int error, rc = 0;
int orphan = 0;
const unsigned int ia_valid = attr->ia_valid;
+ bool inc_ivers = true;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
@@ -5229,7 +5362,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ATTR_GID | ATTR_TIMES_SET))))
return -EPERM;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(mnt_userns, dentry, attr);
if (error)
return error;
@@ -5241,13 +5374,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
error = dquot_initialize(inode);
if (error)
return error;
}
- if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
- (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+
+ if (i_uid_needs_update(mnt_userns, attr, inode) ||
+ i_gid_needs_update(mnt_userns, attr, inode)) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
@@ -5264,7 +5398,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* counts xattr inode references.
*/
down_read(&EXT4_I(inode)->xattr_sem);
- error = dquot_transfer(inode, attr);
+ error = dquot_transfer(mnt_userns, inode, attr);
up_read(&EXT4_I(inode)->xattr_sem);
if (error) {
@@ -5273,30 +5407,34 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
/* Update corresponding info in inode so that everything is in
* one transaction */
- if (attr->ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (attr->ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
error = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+ if (unlikely(error)) {
+ return error;
+ }
}
if (attr->ia_valid & ATTR_SIZE) {
handle_t *handle;
loff_t oldsize = inode->i_size;
+ loff_t old_disksize;
int shrink = (attr->ia_size < inode->i_size);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (attr->ia_size > sbi->s_bitmap_maxbytes)
+ if (attr->ia_size > sbi->s_bitmap_maxbytes) {
return -EFBIG;
+ }
}
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode)) {
return -EINVAL;
+ }
- if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
- inode_inc_iversion(inode);
+ if (attr->ia_size == inode->i_size)
+ inc_ivers = false;
if (shrink) {
if (ext4_should_order_data(inode)) {
@@ -5312,12 +5450,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
inode_dio_wait(inode);
}
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
rc = ext4_break_layouts(inode);
if (rc) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
- return rc;
+ filemap_invalidate_unlock(inode->i_mapping);
+ goto err_out;
}
if (attr->ia_size != inode->i_size) {
@@ -5338,7 +5476,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
inode->i_mtime = current_time(inode);
inode->i_ctime = inode->i_mtime;
}
+
+ if (shrink)
+ ext4_fc_track_range(handle, inode,
+ (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+ inode->i_sb->s_blocksize_bits,
+ EXT_MAX_BLOCKS - 1);
+ else
+ ext4_fc_track_range(
+ handle, inode,
+ (oldsize > 0 ? oldsize - 1 : oldsize) >>
+ inode->i_sb->s_blocksize_bits,
+ (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+ inode->i_sb->s_blocksize_bits);
+
down_write(&EXT4_I(inode)->i_data_sem);
+ old_disksize = EXT4_I(inode)->i_disksize;
EXT4_I(inode)->i_disksize = attr->ia_size;
rc = ext4_mark_inode_dirty(handle, inode);
if (!error)
@@ -5350,6 +5503,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
*/
if (!error)
i_size_write(inode, attr->ia_size);
+ else
+ EXT4_I(inode)->i_disksize = old_disksize;
up_write(&EXT4_I(inode)->i_data_sem);
ext4_journal_stop(handle);
if (error)
@@ -5377,11 +5532,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
error = rc;
}
out_mmap_sem:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
}
if (!error) {
- setattr_copy(inode, attr);
+ if (inc_ivers)
+ inode_inc_iversion(inode);
+ setattr_copy(mnt_userns, inode, attr);
mark_inode_dirty(inode);
}
@@ -5393,17 +5550,34 @@ out_mmap_sem:
ext4_orphan_del(NULL, inode);
if (!error && (ia_valid & ATTR_MODE))
- rc = posix_acl_chmod(inode, inode->i_mode);
+ rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
err_out:
- ext4_std_error(inode->i_sb, error);
+ if (error)
+ ext4_std_error(inode->i_sb, error);
if (!error)
error = rc;
return error;
}
-int ext4_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
+u32 ext4_dio_alignment(struct inode *inode)
+{
+ if (fsverity_active(inode))
+ return 0;
+ if (ext4_should_journal_data(inode))
+ return 0;
+ if (ext4_has_inline_data(inode))
+ return 0;
+ if (IS_ENCRYPTED(inode)) {
+ if (!fscrypt_dio_supported(inode))
+ return 0;
+ return i_blocksize(inode);
+ }
+ return 1; /* use the iomap defaults */
+}
+
+int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
struct ext4_inode *raw_inode;
@@ -5417,6 +5591,27 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
}
+ /*
+ * Return the DIO alignment restrictions if requested. We only return
+ * this information when requested, since on encrypted files it might
+ * take a fair bit of work to get if the file wasn't opened recently.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ u32 dio_align = ext4_dio_alignment(inode);
+
+ stat->result_mask |= STATX_DIOALIGN;
+ if (dio_align == 1) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+
+ /* iomap defaults */
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ } else {
+ stat->dio_mem_align = dio_align;
+ stat->dio_offset_align = dio_align;
+ }
+ }
+
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
@@ -5438,17 +5633,18 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
return 0;
}
-int ext4_file_getattr(const struct path *path, struct kstat *stat,
+int ext4_file_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
u64 delalloc_blocks;
- ext4_getattr(path, stat, request_mask, query_flags);
+ ext4_getattr(mnt_userns, path, stat, request_mask, query_flags);
/*
* If there is inline data in the inode, the inode will normally not
@@ -5580,8 +5776,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
put_bh(iloc->bh);
return -EIO;
}
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
+ ext4_fc_track_inode(handle, inode);
/* the do_update_inode consumes one bh->b_count */
get_bh(iloc->bh);
@@ -5609,7 +5804,8 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
err = ext4_get_inode_loc(inode, iloc);
if (!err) {
BUFFER_TRACE(iloc->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, iloc->bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ iloc->bh, EXT4_JTR_NONE);
if (err) {
brelse(iloc->bh);
iloc->bh = NULL;
@@ -5732,7 +5928,8 @@ int ext4_expand_extra_isize(struct inode *inode,
ext4_write_lock_xattr(inode, &no_expand);
BUFFER_TRACE(iloc->bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, iloc->bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh,
+ EXT4_JTR_NONE);
if (error) {
brelse(iloc->bh);
goto out_unlock;
@@ -5764,7 +5961,8 @@ out_unlock:
* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
* we start and wait on commits.
*/
-int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
+int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
+ const char *func, unsigned int line)
{
struct ext4_iloc iloc;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -5774,13 +5972,18 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
trace_ext4_mark_inode_dirty(inode, _RET_IP_);
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
- return err;
+ goto out;
if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize,
iloc, handle);
- return ext4_mark_iloc_dirty(handle, inode, &iloc);
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+out:
+ if (unlikely(err))
+ ext4_error_inode_err(inode, func, line, 0, err,
+ "mark_inode_dirty error");
+ return err;
}
/*
@@ -5796,26 +5999,16 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* If the inode is marked synchronous, we don't honour that here - doing
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
- *
- * If only the I_DIRTY_TIME flag is set, we can skip everything. If
- * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
- * to copy into the on-disk inode structure are the timestamp files.
*/
void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
- if (flags == I_DIRTY_TIME)
- return;
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
- goto out;
-
+ return;
ext4_mark_inode_dirty(handle, inode);
-
ext4_journal_stop(handle);
-out:
- return;
}
int ext4_change_inode_journal_flag(struct inode *inode, int val)
@@ -5853,10 +6046,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
* data (and journalled aops don't know how to handle these cases).
*/
if (val) {
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
err = filemap_write_and_wait(inode->i_mapping);
if (err < 0) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return err;
}
}
@@ -5875,7 +6068,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (val)
ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
else {
- err = jbd2_journal_flush(journal);
+ err = jbd2_journal_flush(journal, 0);
if (err < 0) {
jbd2_journal_unlock_updates(journal);
percpu_up_write(&sbi->s_writepages_rwsem);
@@ -5889,7 +6082,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
percpu_up_write(&sbi->s_writepages_rwsem);
if (val)
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
/* Finally we can mark the inode as dirty. */
@@ -5897,6 +6090,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (IS_ERR(handle))
return PTR_ERR(handle);
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
err = ext4_mark_inode_dirty(handle, inode);
ext4_handle_sync(handle);
ext4_journal_stop(handle);
@@ -5905,7 +6100,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
-static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
return !buffer_mapped(bh);
}
@@ -5931,15 +6127,23 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
err = ext4_convert_inline_data(inode);
if (err)
goto out_ret;
+ /*
+ * On data journalling we skip straight to the transaction handle:
+ * there's no delalloc; page truncated will be checked later; the
+ * early return w/ all buffers mapped (calculates size/len) can't
+ * be used; and there's no dioread_nolock, so only ext4_get_block.
+ */
+ if (ext4_should_journal_data(inode))
+ goto retry_alloc;
+
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
- !ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
err = block_page_mkwrite(vma, vmf,
@@ -5965,9 +6169,12 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
/*
* Return if we have all the buffers mapped. This avoids the need to do
* journal_start/journal_stop which can block and take a long time
+ *
+ * This cannot be done for data journalling, as we have to add the
+ * inode to the transaction's list to writeprotect pages on commit.
*/
if (page_has_buffers(page)) {
- if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+ if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page),
0, len, NULL,
ext4_bh_unmapped)) {
/* Wait so that we don't change page under IO */
@@ -5989,16 +6196,45 @@ retry_alloc:
ret = VM_FAULT_SIGBUS;
goto out;
}
- err = block_page_mkwrite(vma, vmf, get_block);
- if (!err && ext4_should_journal_data(inode)) {
- if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_SIZE, NULL, do_journal_get_write_access)) {
- unlock_page(page);
+ /*
+ * Data journalling can't use block_page_mkwrite() because it
+ * will set_buffer_dirty() before do_journal_get_write_access()
+ * thus might hit warning messages for dirty metadata buffers.
+ */
+ if (!ext4_should_journal_data(inode)) {
+ err = block_page_mkwrite(vma, vmf, get_block);
+ } else {
+ lock_page(page);
+ size = i_size_read(inode);
+ /* Page got truncated from under us? */
+ if (page->mapping != mapping || page_offset(page) > size) {
+ ret = VM_FAULT_NOPAGE;
+ goto out_error;
+ }
+
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
+ else
+ len = PAGE_SIZE;
+
+ err = __block_write_begin(page, 0, len, ext4_get_block);
+ if (!err) {
ret = VM_FAULT_SIGBUS;
- ext4_journal_stop(handle);
- goto out;
+ if (ext4_walk_page_buffers(handle, inode,
+ page_buffers(page), 0, len, NULL,
+ do_journal_get_write_access))
+ goto out_error;
+ if (ext4_walk_page_buffers(handle, inode,
+ page_buffers(page), 0, len, NULL,
+ write_end_fn))
+ goto out_error;
+ if (ext4_jbd2_inode_add_write(handle, inode,
+ page_offset(page), len))
+ goto out_error;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ } else {
+ unlock_page(page);
}
- ext4_set_inode_state(inode, EXT4_STATE_JDATA);
}
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -6006,19 +6242,11 @@ retry_alloc:
out_ret:
ret = block_page_mkwrite_return(err);
out:
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
return ret;
-}
-
-vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- vm_fault_t ret;
-
- down_read(&EXT4_I(inode)->i_mmap_sem);
- ret = filemap_fault(vmf);
- up_read(&EXT4_I(inode)->i_mmap_sem);
-
- return ret;
+out_error:
+ unlock_page(page);
+ ext4_journal_stop(handle);
+ goto out;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a0ec750018dd..95dfea28bf4e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -16,17 +16,268 @@
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/random.h>
-#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/delay.h>
#include <linux/iversion.h>
+#include <linux/fileattr.h>
+#include <linux/uuid.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include <linux/fsmap.h>
#include "fsmap.h"
#include <trace/events/ext4.h>
-/**
+typedef void ext4_update_sb_callback(struct ext4_super_block *es,
+ const void *arg);
+
+/*
+ * Superblock modification callback function for changing file system
+ * label
+ */
+static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg)
+{
+ /* Sanity check, this should never happen */
+ BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX);
+
+ memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX);
+}
+
+/*
+ * Superblock modification callback function for changing file system
+ * UUID.
+ */
+static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg)
+{
+ memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE);
+}
+
+static
+int ext4_update_primary_sb(struct super_block *sb, handle_t *handle,
+ ext4_update_sb_callback func,
+ const void *arg)
+{
+ int err = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh = sbi->s_sbh;
+ struct ext4_super_block *es = sbi->s_es;
+
+ trace_ext4_update_sb(sb, bh->b_blocknr, 1);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb,
+ bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+
+ lock_buffer(bh);
+ func(es, arg);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(bh);
+
+ if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) {
+ ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
+ "superblock detected");
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out_err;
+ err = sync_dirty_buffer(bh);
+out_err:
+ ext4_std_error(sb, err);
+ return err;
+}
+
+/*
+ * Update one backup superblock in the group 'grp' using the callback
+ * function 'func' and argument 'arg'. If the handle is NULL the
+ * modification is not journalled.
+ *
+ * Returns: 0 when no modification was done (no superblock in the group)
+ * 1 when the modification was successful
+ * <0 on error
+ */
+static int ext4_update_backup_sb(struct super_block *sb,
+ handle_t *handle, ext4_group_t grp,
+ ext4_update_sb_callback func, const void *arg)
+{
+ int err = 0;
+ ext4_fsblk_t sb_block;
+ struct buffer_head *bh;
+ unsigned long offset = 0;
+ struct ext4_super_block *es;
+
+ if (!ext4_bg_has_super(sb, grp))
+ return 0;
+
+ /*
+ * For the group 0 there is always 1k padding, so we have
+ * either adjust offset, or sb_block depending on blocksize
+ */
+ if (grp == 0) {
+ sb_block = 1 * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(sb_block, sb->s_blocksize);
+ } else {
+ sb_block = ext4_group_first_block_no(sb, grp);
+ offset = 0;
+ }
+
+ trace_ext4_update_sb(sb, sb_block, handle ? 1 : 0);
+
+ bh = ext4_sb_bread(sb, sb_block, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ if (handle) {
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb,
+ bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_bh;
+ }
+
+ es = (struct ext4_super_block *) (bh->b_data + offset);
+ lock_buffer(bh);
+ if (ext4_has_metadata_csum(sb) &&
+ es->s_checksum != ext4_superblock_csum(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
+ "superblock %llu", sb_block);
+ unlock_buffer(bh);
+ goto out_bh;
+ }
+ func(es, arg);
+ if (ext4_has_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(sb, es);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ if (err)
+ goto out_bh;
+
+ if (handle) {
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out_bh;
+ } else {
+ BUFFER_TRACE(bh, "marking dirty");
+ mark_buffer_dirty(bh);
+ }
+ err = sync_dirty_buffer(bh);
+
+out_bh:
+ brelse(bh);
+ ext4_std_error(sb, err);
+ return (err) ? err : 1;
+}
+
+/*
+ * Update primary and backup superblocks using the provided function
+ * func and argument arg.
+ *
+ * Only the primary superblock and at most two backup superblock
+ * modifications are journalled; the rest is modified without journal.
+ * This is safe because e2fsck will re-write them if there is a problem,
+ * and we're very unlikely to ever need more than two backups.
+ */
+static
+int ext4_update_superblocks_fn(struct super_block *sb,
+ ext4_update_sb_callback func,
+ const void *arg)
+{
+ handle_t *handle;
+ ext4_group_t ngroups;
+ unsigned int three = 1;
+ unsigned int five = 5;
+ unsigned int seven = 7;
+ int err = 0, ret, i;
+ ext4_group_t grp, primary_grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /*
+ * We can't update superblocks while the online resize is running
+ */
+ if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
+ &sbi->s_ext4_flags)) {
+ ext4_msg(sb, KERN_ERR, "Can't modify superblock while"
+ "performing online resize");
+ return -EBUSY;
+ }
+
+ /*
+ * We're only going to update primary superblock and two
+ * backup superblocks in this transaction.
+ */
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 3);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out;
+ }
+
+ /* Update primary superblock */
+ err = ext4_update_primary_sb(sb, handle, func, arg);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "Failed to update primary "
+ "superblock");
+ goto out_journal;
+ }
+
+ primary_grp = ext4_get_group_number(sb, sbi->s_sbh->b_blocknr);
+ ngroups = ext4_get_groups_count(sb);
+
+ /*
+ * Update backup superblocks. We have to start from group 0
+ * because it might not be where the primary superblock is
+ * if the fs is mounted with -o sb=<backup_sb_block>
+ */
+ i = 0;
+ grp = 0;
+ while (grp < ngroups) {
+ /* Skip primary superblock */
+ if (grp == primary_grp)
+ goto next_grp;
+
+ ret = ext4_update_backup_sb(sb, handle, grp, func, arg);
+ if (ret < 0) {
+ /* Ignore bad checksum; try to update next sb */
+ if (ret == -EFSBADCRC)
+ goto next_grp;
+ err = ret;
+ goto out_journal;
+ }
+
+ i += ret;
+ if (handle && i > 1) {
+ /*
+ * We're only journalling primary superblock and
+ * two backup superblocks; the rest is not
+ * journalled.
+ */
+ err = ext4_journal_stop(handle);
+ if (err)
+ goto out;
+ handle = NULL;
+ }
+next_grp:
+ grp = ext4_list_backups(sb, &three, &five, &seven);
+ }
+
+out_journal:
+ if (handle) {
+ ret = ext4_journal_stop(handle);
+ if (ret && !err)
+ err = ret;
+ }
+out:
+ clear_bit_unlock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags);
+ smp_mb__after_atomic();
+ return err ? err : 0;
+}
+
+/*
* Swap memory between @a and @b for @len bytes.
*
* @a: pointer to first memory area
@@ -47,7 +298,7 @@ static void memswap(void *a, void *b, size_t len)
}
}
-/**
+/*
* Swap i_data and associated attributes between @inode1 and @inode2.
* This function is used for the primary swap between inode1 and inode2
* and also to revert this primary swap in case of errors.
@@ -86,7 +337,7 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
i_size_write(inode2, isize);
}
-static void reset_inode_seed(struct inode *inode)
+void ext4_reset_inode_seed(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -101,16 +352,18 @@ static void reset_inode_seed(struct inode *inode)
ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
}
-/**
+/*
* Swap the information from the given @inode and the inode
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
* important fields of the inodes.
*
* @sb: the super block of the filesystem
+ * @mnt_userns: user namespace of the mount the inode was found from
* @inode: the inode to swap with EXT4_BOOT_LOADER_INO
*
*/
static long swap_inode_boot_loader(struct super_block *sb,
+ struct user_namespace *mnt_userns,
struct inode *inode)
{
handle_t *handle;
@@ -139,12 +392,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
}
if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
- !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+ !inode_owner_or_capable(mnt_userns, inode) ||
+ !capable(CAP_SYS_ADMIN)) {
err = -EPERM;
goto journal_err_out;
}
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
err = filemap_write_and_wait(inode->i_mapping);
if (err)
goto err_out;
@@ -165,6 +419,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
err = -EINVAL;
goto err_out;
}
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle);
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
@@ -196,13 +451,14 @@ static long swap_inode_boot_loader(struct super_block *sb,
swap_inode_data(inode, inode_bl);
inode->i_ctime = inode_bl->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
- inode->i_generation = prandom_u32();
- inode_bl->i_generation = prandom_u32();
- reset_inode_seed(inode);
- reset_inode_seed(inode_bl);
+ inode->i_generation = get_random_u32();
+ inode_bl->i_generation = get_random_u32();
+ ext4_reset_inode_seed(inode);
+ ext4_reset_inode_seed(inode_bl);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
@@ -250,25 +506,13 @@ err_out1:
ext4_double_up_write_data_sem(inode, inode_bl);
err_out:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
journal_err_out:
unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
return err;
}
-#ifdef CONFIG_FS_ENCRYPTION
-static int uuid_is_zero(__u8 u[16])
-{
- int i;
-
- for (i = 0; i < 16; i++)
- if (u[i])
- return 0;
- return 1;
-}
-#endif
-
/*
* If immutable is set and we are not clearing it, we're not allowed to change
* anything else in the inode. Don't error out if we're only trying to set
@@ -292,6 +536,44 @@ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid,
return 0;
}
+static void ext4_dax_dontcache(struct inode *inode, unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (test_opt2(inode->i_sb, DAX_NEVER) ||
+ test_opt(inode->i_sb, DAX_ALWAYS))
+ return;
+
+ if ((ei->i_flags ^ flags) & EXT4_DAX_FL)
+ d_mark_dontcache(inode);
+}
+
+static bool dax_compatible(struct inode *inode, unsigned int oldflags,
+ unsigned int flags)
+{
+ /* Allow the DAX flag to be changed on inline directories */
+ if (S_ISDIR(inode->i_mode)) {
+ flags &= ~EXT4_INLINE_DATA_FL;
+ oldflags &= ~EXT4_INLINE_DATA_FL;
+ }
+
+ if (flags & EXT4_DAX_FL) {
+ if ((oldflags & EXT4_DAX_MUT_EXCL) ||
+ ext4_test_inode_state(inode,
+ EXT4_STATE_VERITY_IN_PROGRESS)) {
+ return false;
+ }
+ }
+
+ if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL))
+ return false;
+
+ return true;
+}
+
static int ext4_ioctl_setflags(struct inode *inode,
unsigned int flags)
{
@@ -300,7 +582,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
int err = -EPERM, migrate = 0;
struct ext4_iloc iloc;
unsigned int oldflags, mask, i;
- unsigned int jflag;
struct super_block *sb = inode->i_sb;
/* Is it quota file? Do not allow user to mess with it */
@@ -308,37 +589,23 @@ static int ext4_ioctl_setflags(struct inode *inode,
goto flags_out;
oldflags = ei->i_flags;
-
- /* The JOURNAL_DATA flag is modifiable only by root */
- jflag = flags & EXT4_JOURNAL_DATA_FL;
-
- err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
- if (err)
- goto flags_out;
-
/*
* The JOURNAL_DATA flag can only be changed by
* the relevant capability.
*/
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
- if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
- migrate = 1;
- if (flags & EXT4_EOFBLOCKS_FL) {
- /* we don't support adding EOFBLOCKS flag */
- if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
- err = -EOPNOTSUPP;
- goto flags_out;
- }
- } else if (oldflags & EXT4_EOFBLOCKS_FL) {
- err = ext4_truncate(inode);
- if (err)
- goto flags_out;
+ if (!dax_compatible(inode, oldflags, flags)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
}
+ if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
+ migrate = 1;
+
if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) {
if (!ext4_has_feature_casefold(sb)) {
err = -EOPNOTSUPP;
@@ -381,6 +648,8 @@ static int ext4_ioctl_setflags(struct inode *inode,
if (err)
goto flags_err;
+ ext4_dax_dontcache(inode, flags);
+
for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
if (!(mask & EXT4_FL_USER_MODIFIABLE))
continue;
@@ -393,8 +662,10 @@ static int ext4_ioctl_setflags(struct inode *inode,
ext4_clear_inode_flag(inode, i);
}
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
+
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
@@ -402,17 +673,18 @@ flags_err:
if (err)
goto flags_out;
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
/*
* Changes to the journaling mode can cause unsafe changes to
- * S_DAX if we are using the DAX mount option.
+ * S_DAX if the inode is DAX
*/
- if (test_opt(inode->i_sb, DAX)) {
+ if (IS_DAX(inode)) {
err = -EBUSY;
goto flags_out;
}
- err = ext4_change_inode_journal_flag(inode, jflag);
+ err = ext4_change_inode_journal_flag(inode,
+ flags & EXT4_JOURNAL_DATA_FL);
if (err)
goto flags_out;
}
@@ -428,9 +700,8 @@ flags_out:
}
#ifdef CONFIG_QUOTA
-static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
{
- struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
struct ext4_inode_info *ei = EXT4_I(inode);
int err, rc;
@@ -505,6 +776,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
EXT4_I(inode)->i_projid = kprojid;
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
out_dirty:
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
@@ -514,7 +786,7 @@ out_stop:
return err;
}
#else
-static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
{
if (projid != EXT4_DEF_PROJID)
return -EOPNOTSUPP;
@@ -522,51 +794,6 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
}
#endif
-/* Transfer internal flags to xflags */
-static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
-{
- __u32 xflags = 0;
-
- if (iflags & EXT4_SYNC_FL)
- xflags |= FS_XFLAG_SYNC;
- if (iflags & EXT4_IMMUTABLE_FL)
- xflags |= FS_XFLAG_IMMUTABLE;
- if (iflags & EXT4_APPEND_FL)
- xflags |= FS_XFLAG_APPEND;
- if (iflags & EXT4_NODUMP_FL)
- xflags |= FS_XFLAG_NODUMP;
- if (iflags & EXT4_NOATIME_FL)
- xflags |= FS_XFLAG_NOATIME;
- if (iflags & EXT4_PROJINHERIT_FL)
- xflags |= FS_XFLAG_PROJINHERIT;
- return xflags;
-}
-
-#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
- FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
- FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
-
-/* Transfer xflags flags to internal */
-static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
-{
- unsigned long iflags = 0;
-
- if (xflags & FS_XFLAG_SYNC)
- iflags |= EXT4_SYNC_FL;
- if (xflags & FS_XFLAG_IMMUTABLE)
- iflags |= EXT4_IMMUTABLE_FL;
- if (xflags & FS_XFLAG_APPEND)
- iflags |= EXT4_APPEND_FL;
- if (xflags & FS_XFLAG_NODUMP)
- iflags |= EXT4_NODUMP_FL;
- if (xflags & FS_XFLAG_NOATIME)
- iflags |= EXT4_NOATIME_FL;
- if (xflags & FS_XFLAG_PROJINHERIT)
- iflags |= EXT4_PROJINHERIT_FL;
-
- return iflags;
-}
-
static int ext4_shutdown(struct super_block *sb, unsigned long arg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -591,7 +818,7 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg)
case EXT4_GOING_FLAGS_DEFAULT:
freeze_bdev(sb->s_bdev);
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
- thaw_bdev(sb->s_bdev, sb);
+ thaw_bdev(sb->s_bdev);
break;
case EXT4_GOING_FLAGS_LOGFLUSH:
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
@@ -672,10 +899,9 @@ static int ext4_ioc_getfsmap(struct super_block *sb,
info.gi_sb = sb;
info.gi_data = arg;
error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info);
- if (error == EXT4_QUERY_RANGE_ABORT) {
- error = 0;
+ if (error == EXT4_QUERY_RANGE_ABORT)
aborted = true;
- } else if (error)
+ else if (error)
return error;
/* If we didn't abort, set the "last" flag in the last fmx */
@@ -720,7 +946,7 @@ static long ext4_ioctl_group_add(struct file *file,
err = ext4_group_add(sb, input);
if (EXT4_SB(sb)->s_journal) {
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
@@ -730,42 +956,56 @@ static long ext4_ioctl_group_add(struct file *file,
test_opt(sb, INIT_INODE_TABLE))
err = ext4_register_li_request(sb, input->group);
group_add_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, false);
+ if (err == 0)
+ err = err2;
return err;
}
-static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
+int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
+ struct inode *inode = d_inode(dentry);
struct ext4_inode_info *ei = EXT4_I(inode);
+ u32 flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
- simple_fill_fsxattr(fa, ext4_iflags_to_xflags(ei->i_flags &
- EXT4_FL_USER_VISIBLE));
+ if (S_ISREG(inode->i_mode))
+ flags &= ~FS_PROJINHERIT_FL;
+ fileattr_fill_flags(fa, flags);
if (ext4_has_feature_project(inode->i_sb))
fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
+
+ return 0;
}
-/* copied from fs/ioctl.c */
-static int fiemap_check_ranges(struct super_block *sb,
- u64 start, u64 len, u64 *new_len)
+int ext4_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
{
- u64 maxbytes = (u64) sb->s_maxbytes;
+ struct inode *inode = d_inode(dentry);
+ u32 flags = fa->flags;
+ int err = -EOPNOTSUPP;
- *new_len = len;
-
- if (len == 0)
- return -EINVAL;
-
- if (start > maxbytes)
- return -EFBIG;
+ if (flags & ~EXT4_FL_USER_VISIBLE)
+ goto out;
/*
- * Shrink request scope to what the fs can actually handle.
+ * chattr(1) grabs flags via GETFLAGS, modifies the result and
+ * passes that to SETFLAGS. So we cannot easily make SETFLAGS
+ * more restrictive than just silently masking off visible but
+ * not settable flags as we always did.
*/
- if (len > maxbytes || (maxbytes - len) < start)
- *new_len = maxbytes - start;
-
- return 0;
+ flags &= EXT4_FL_USER_MODIFIABLE;
+ if (ext4_mask_flags(inode->i_mode, flags) != flags)
+ goto out;
+ err = ext4_ioctl_check_immutable(inode, fa->fsx_projid, flags);
+ if (err)
+ goto out;
+ err = ext4_ioctl_setflags(inode, flags);
+ if (err)
+ goto out;
+ err = ext4_ioctl_setproject(inode, fa->fsx_projid);
+out:
+ return err;
}
/* So that the fiemap access checks can't overflow on 32 bit machines. */
@@ -777,8 +1017,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
struct fiemap_extent_info fieinfo = { 0, };
struct inode *inode = file_inode(filp);
- struct super_block *sb = inode->i_sb;
- u64 len;
int error;
if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
@@ -787,24 +1025,12 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
return -EINVAL;
- error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
- &len);
- if (error)
- return error;
-
fieinfo.fi_flags = fiemap.fm_flags;
fieinfo.fi_extents_max = fiemap.fm_extent_count;
fieinfo.fi_extents_start = ufiemap->fm_extents;
- if (fiemap.fm_extent_count != 0 &&
- !access_ok(fieinfo.fi_extents_start,
- fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
- return -EFAULT;
-
- if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
- filemap_write_and_wait(inode->i_mapping);
-
- error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len);
+ error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start,
+ fiemap.fm_length);
fiemap.fm_flags = fieinfo.fi_flags;
fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
@@ -813,58 +1039,187 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
return error;
}
-long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
{
- struct inode *inode = file_inode(filp);
- struct super_block *sb = inode->i_sb;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned int flags;
+ int err = 0;
+ __u32 flags = 0;
+ unsigned int flush_flags = 0;
+ struct super_block *sb = file_inode(filp)->i_sb;
- ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
+ if (copy_from_user(&flags, (__u32 __user *)arg,
+ sizeof(__u32)))
+ return -EFAULT;
- switch (cmd) {
- case FS_IOC_GETFSMAP:
- return ext4_ioc_getfsmap(sb, (void __user *)arg);
- case EXT4_IOC_GETFLAGS:
- flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
- if (S_ISREG(inode->i_mode))
- flags &= ~EXT4_PROJINHERIT_FL;
- return put_user(flags, (int __user *) arg);
- case EXT4_IOC_SETFLAGS: {
- int err;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
- if (!inode_owner_or_capable(inode))
- return -EACCES;
+ /* check for invalid bits set */
+ if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) ||
+ ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ (flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
+ return -EINVAL;
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
+ if (!EXT4_SB(sb)->s_journal)
+ return -ENODEV;
- if (flags & ~EXT4_FL_USER_VISIBLE)
- return -EOPNOTSUPP;
- /*
- * chattr(1) grabs flags via GETFLAGS, modifies the result and
- * passes that to SETFLAGS. So we cannot easily make SETFLAGS
- * more restrictive than just silently masking off visible but
- * not settable flags as we always did.
- */
- flags &= EXT4_FL_USER_MODIFIABLE;
- if (ext4_mask_flags(inode->i_mode, flags) != flags)
- return -EOPNOTSUPP;
+ if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev))
+ return -EOPNOTSUPP;
- err = mnt_want_write_file(filp);
- if (err)
- return err;
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+ return 0;
- inode_lock(inode);
- err = ext4_ioctl_check_immutable(inode,
- from_kprojid(&init_user_ns, ei->i_projid),
- flags);
- if (!err)
- err = ext4_ioctl_setflags(inode, flags);
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return err;
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD)
+ flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) {
+ flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT;
+ pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow");
+ }
+
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+
+ return err;
+}
+
+static int ext4_ioctl_setlabel(struct file *filp, const char __user *user_label)
+{
+ size_t len;
+ int ret = 0;
+ char new_label[EXT4_LABEL_MAX + 1];
+ struct super_block *sb = file_inode(filp)->i_sb;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * Copy the maximum length allowed for ext4 label with one more to
+ * find the required terminating null byte in order to test the
+ * label length. The on disk label doesn't need to be null terminated.
+ */
+ if (copy_from_user(new_label, user_label, EXT4_LABEL_MAX + 1))
+ return -EFAULT;
+
+ len = strnlen(new_label, EXT4_LABEL_MAX + 1);
+ if (len > EXT4_LABEL_MAX)
+ return -EINVAL;
+
+ /*
+ * Clear the buffer after the new label
+ */
+ memset(new_label + len, 0, EXT4_LABEL_MAX - len);
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setlabel, new_label);
+
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label)
+{
+ char label[EXT4_LABEL_MAX + 1];
+
+ /*
+ * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because
+ * FSLABEL_MAX must include terminating null byte, while s_volume_name
+ * does not have to.
+ */
+ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX);
+
+ memset(label, 0, sizeof(label));
+ lock_buffer(sbi->s_sbh);
+ strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX);
+ unlock_buffer(sbi->s_sbh);
+
+ if (copy_to_user(user_label, label, sizeof(label)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi,
+ struct fsuuid __user *ufsuuid)
+{
+ struct fsuuid fsuuid;
+ __u8 uuid[UUID_SIZE];
+
+ if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+ return -EFAULT;
+
+ if (fsuuid.fsu_len == 0) {
+ fsuuid.fsu_len = UUID_SIZE;
+ if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len)))
+ return -EFAULT;
+ return -EINVAL;
}
+
+ if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+ return -EINVAL;
+
+ lock_buffer(sbi->s_sbh);
+ memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE);
+ unlock_buffer(sbi->s_sbh);
+
+ if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
+ return -EFAULT;
+ return 0;
+}
+
+static int ext4_ioctl_setuuid(struct file *filp,
+ const struct fsuuid __user *ufsuuid)
+{
+ int ret = 0;
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct fsuuid fsuuid;
+ __u8 uuid[UUID_SIZE];
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * If any checksums (group descriptors or metadata) are being used
+ * then the checksum seed feature is required to change the UUID.
+ */
+ if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb))
+ && !ext4_has_feature_csum_seed(sb))
+ || ext4_has_feature_stable_inodes(sb))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+ return -EFAULT;
+
+ if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+ return -EINVAL;
+
+ if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE))
+ return -EFAULT;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid);
+ mnt_drop_write_file(filp);
+
+ return ret;
+}
+
+static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
+
+ ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
+
+ switch (cmd) {
+ case FS_IOC_GETFSMAP:
+ return ext4_ioc_getfsmap(sb, (void __user *)arg);
case EXT4_IOC_GETVERSION:
case EXT4_IOC_GETVERSION_OLD:
return put_user(inode->i_generation, (int __user *) arg);
@@ -875,7 +1230,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
__u32 generation;
int err;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
if (ext4_has_metadata_csum(inode->i_sb)) {
@@ -901,6 +1256,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
@@ -939,14 +1295,16 @@ setversion_out:
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
group_extend_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, false);
+ if (err == 0)
+ err = err2;
return err;
}
@@ -1014,7 +1372,7 @@ mext_out:
case EXT4_IOC_MIGRATE:
{
int err;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
err = mnt_want_write_file(filp);
@@ -1036,7 +1394,7 @@ mext_out:
case EXT4_IOC_ALLOC_DA_BLKS:
{
int err;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
err = mnt_want_write_file(filp);
@@ -1055,7 +1413,7 @@ mext_out:
err = mnt_want_write_file(filp);
if (err)
return err;
- err = swap_inode_boot_loader(sb, inode);
+ err = swap_inode_boot_loader(sb, mnt_userns, inode);
mnt_drop_write_file(filp);
return err;
}
@@ -1080,8 +1438,9 @@ mext_out:
err = ext4_resize_fs(sb, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
@@ -1093,20 +1452,21 @@ mext_out:
err = ext4_register_li_request(sb, o_group);
resizefs_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, true);
+ if (err == 0)
+ err = err2;
return err;
}
case FITRIM:
{
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sb->s_bdev))
return -EOPNOTSUPP;
/*
@@ -1120,8 +1480,6 @@ resizefs_out:
sizeof(range)))
return -EFAULT;
- range.minlen = max((unsigned int)range.minlen,
- q->limits.discard_granularity);
ret = ext4_trim_fs(sb, &range);
if (ret < 0)
return ret;
@@ -1135,52 +1493,15 @@ resizefs_out:
case EXT4_IOC_PRECACHE_EXTENTS:
return ext4_ext_precache(inode);
- case EXT4_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
- case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
-#ifdef CONFIG_FS_ENCRYPTION
- int err, err2;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- handle_t *handle;
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg);
- if (!ext4_has_feature_encrypt(sb))
- return -EOPNOTSUPP;
- if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
- err = mnt_want_write_file(filp);
- if (err)
- return err;
- handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto pwsalt_err_exit;
- }
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err)
- goto pwsalt_err_journal;
- generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
- err = ext4_handle_dirty_metadata(handle, NULL,
- sbi->s_sbh);
- pwsalt_err_journal:
- err2 = ext4_journal_stop(handle);
- if (err2 && !err)
- err = err2;
- pwsalt_err_exit:
- mnt_drop_write_file(filp);
- if (err)
- return err;
- }
- if (copy_to_user((void __user *) arg,
- sbi->s_es->s_encrypt_pw_salt, 16))
- return -EFAULT;
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
- }
- case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
@@ -1210,9 +1531,14 @@ resizefs_out:
return -EOPNOTSUPP;
return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_nonce(filp, (void __user *)arg);
+
case EXT4_IOC_CLEAR_ES_CACHE:
{
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ext4_clear_inode_es(inode);
return 0;
@@ -1237,60 +1563,6 @@ resizefs_out:
case EXT4_IOC_GET_ES_CACHE:
return ext4_ioctl_get_es_cache(filp, arg);
- case EXT4_IOC_FSGETXATTR:
- {
- struct fsxattr fa;
-
- ext4_fill_fsxattr(inode, &fa);
-
- if (copy_to_user((struct fsxattr __user *)arg,
- &fa, sizeof(fa)))
- return -EFAULT;
- return 0;
- }
- case EXT4_IOC_FSSETXATTR:
- {
- struct fsxattr fa, old_fa;
- int err;
-
- if (copy_from_user(&fa, (struct fsxattr __user *)arg,
- sizeof(fa)))
- return -EFAULT;
-
- /* Make sure caller has proper permission */
- if (!inode_owner_or_capable(inode))
- return -EACCES;
-
- if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS)
- return -EOPNOTSUPP;
-
- flags = ext4_xflags_to_iflags(fa.fsx_xflags);
- if (ext4_mask_flags(inode->i_mode, flags) != flags)
- return -EOPNOTSUPP;
-
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- inode_lock(inode);
- ext4_fill_fsxattr(inode, &old_fa);
- err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
- if (err)
- goto out;
- flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
- (flags & EXT4_FL_XFLAG_VISIBLE);
- err = ext4_ioctl_check_immutable(inode, fa.fsx_projid, flags);
- if (err)
- goto out;
- err = ext4_ioctl_setflags(inode, flags);
- if (err)
- goto out;
- err = ext4_ioctl_setproject(filp, fa.fsx_projid);
-out:
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return err;
- }
case EXT4_IOC_SHUTDOWN:
return ext4_shutdown(sb, arg);
@@ -1304,22 +1576,41 @@ out:
return -EOPNOTSUPP;
return fsverity_ioctl_measure(filp, (void __user *)arg);
+ case FS_IOC_READ_VERITY_METADATA:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_read_metadata(filp,
+ (const void __user *)arg);
+
+ case EXT4_IOC_CHECKPOINT:
+ return ext4_ioctl_checkpoint(filp, arg);
+
+ case FS_IOC_GETFSLABEL:
+ return ext4_ioctl_getlabel(EXT4_SB(sb), (void __user *)arg);
+
+ case FS_IOC_SETFSLABEL:
+ return ext4_ioctl_setlabel(filp,
+ (const void __user *)arg);
+
+ case EXT4_IOC_GETFSUUID:
+ return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg);
+ case EXT4_IOC_SETFSUUID:
+ return ext4_ioctl_setuuid(filp, (const void __user *)arg);
default:
return -ENOTTY;
}
}
+long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ return __ext4_ioctl(filp, cmd, arg);
+}
+
#ifdef CONFIG_COMPAT
long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
- case EXT4_IOC32_GETFLAGS:
- cmd = EXT4_IOC_GETFLAGS;
- break;
- case EXT4_IOC32_SETFLAGS:
- cmd = EXT4_IOC_SETFLAGS;
- break;
case EXT4_IOC32_GETVERSION:
cmd = EXT4_IOC_GETVERSION;
break;
@@ -1362,23 +1653,28 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_RESIZE_FS:
case FITRIM:
case EXT4_IOC_PRECACHE_EXTENTS:
- case EXT4_IOC_SET_ENCRYPTION_POLICY:
- case EXT4_IOC_GET_ENCRYPTION_PWSALT:
- case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
case FS_IOC_GET_ENCRYPTION_POLICY_EX:
case FS_IOC_ADD_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ case FS_IOC_GET_ENCRYPTION_NONCE:
case EXT4_IOC_SHUTDOWN:
case FS_IOC_GETFSMAP:
case FS_IOC_ENABLE_VERITY:
case FS_IOC_MEASURE_VERITY:
+ case FS_IOC_READ_VERITY_METADATA:
case EXT4_IOC_CLEAR_ES_CACHE:
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
- case EXT4_IOC_FSGETXATTR:
- case EXT4_IOC_FSSETXATTR:
+ case EXT4_IOC_CHECKPOINT:
+ case FS_IOC_GETFSLABEL:
+ case FS_IOC_SETFSLABEL:
+ case EXT4_IOC_GETFSUUID:
+ case EXT4_IOC_SETFSUUID:
break;
default:
return -ENOIOCTLCMD;
@@ -1386,3 +1682,21 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif
+
+static void set_overhead(struct ext4_super_block *es, const void *arg)
+{
+ es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg));
+}
+
+int ext4_update_overhead(struct super_block *sb, bool force)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (sb_rdonly(sb))
+ return 0;
+ if (!force &&
+ (sbi->s_overhead == 0 ||
+ sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters)))
+ return 0;
+ return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead);
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 51a78eb65f3c..9dad93059945 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -18,13 +18,6 @@
#include <linux/backing-dev.h>
#include <trace/events/ext4.h>
-#ifdef CONFIG_EXT4_DEBUG
-ushort ext4_mballoc_debug __read_mostly;
-
-module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
-MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
-#endif
-
/*
* MUSTDO:
* - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -131,14 +124,56 @@ MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
* /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
* stripe=<value> option the group prealloc request is normalized to the
- * the smallest multiple of the stripe value (sbi->s_stripe) which is
+ * smallest multiple of the stripe value (sbi->s_stripe) which is
* greater than the default mb_group_prealloc.
*
+ * If "mb_optimize_scan" mount option is set, we maintain in memory group info
+ * structures in two data structures:
+ *
+ * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
+ *
+ * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
+ *
+ * This is an array of lists where the index in the array represents the
+ * largest free order in the buddy bitmap of the participating group infos of
+ * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
+ * number of buddy bitmap orders possible) number of lists. Group-infos are
+ * placed in appropriate lists.
+ *
+ * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
+ *
+ * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
+ *
+ * This is an array of lists where in the i-th list there are groups with
+ * average fragment size >= 2^i and < 2^(i+1). The average fragment size
+ * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
+ * Note that we don't bother with a special list for completely empty groups
+ * so we only have MB_NUM_ORDERS(sb) lists.
+ *
+ * When "mb_optimize_scan" mount option is set, mballoc consults the above data
+ * structures to decide the order in which groups are to be traversed for
+ * fulfilling an allocation request.
+ *
+ * At CR = 0, we look for groups which have the largest_free_order >= the order
+ * of the request. We directly look at the largest free order list in the data
+ * structure (1) above where largest_free_order = order of the request. If that
+ * list is empty, we look at remaining list in the increasing order of
+ * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time.
+ *
+ * At CR = 1, we only consider groups where average fragment size > request
+ * size. So, we lookup a group which has average fragment size just above or
+ * equal to request size using our average fragment size group lists (data
+ * structure 2) in O(1) time.
+ *
+ * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
+ * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
+ *
* The regular allocator (using the buddy cache) supports a few tunables.
*
* /sys/fs/ext4/<partition>/mb_min_to_scan
* /sys/fs/ext4/<partition>/mb_max_to_scan
* /sys/fs/ext4/<partition>/mb_order2_req
+ * /sys/fs/ext4/<partition>/mb_linear_limit
*
* The regular allocator uses buddy scan only if the request len is power of
* 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
@@ -156,6 +191,16 @@ MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
* can be used for allocation. ext4_mb_good_group explains how the groups are
* checked.
*
+ * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
+ * get traversed linearly. That may result in subsequent allocations being not
+ * close to each other. And so, the underlying device may get filled up in a
+ * non-linear fashion. While that may not matter on non-rotational devices, for
+ * rotational devices that may result in higher seek times. "mb_linear_limit"
+ * tells mballoc how many groups mballoc should search linearly before
+ * performing consulting above data structures for more efficient lookups. For
+ * non rotational devices, this value defaults to 0 and for rotational devices
+ * this is set to MB_DEFAULT_LINEAR_LIMIT.
+ *
* Both the prealloc space are getting populated as above. So for the first
* request we will hit the buddy cache which will result in this prealloc
* space getting filled. The prealloc space is then later used for the
@@ -306,6 +351,8 @@ MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
* - bitlock on a group (group)
* - object (inode/locality) (object)
* - per-pa lock (pa)
+ * - cr0 lists lock (cr0)
+ * - cr1 tree lock (cr1)
*
* Paths:
* - new pa
@@ -335,6 +382,9 @@ MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
* group
* object
*
+ * - allocation path (ext4_mb_regular_allocator)
+ * group
+ * cr0/cr1
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
@@ -356,6 +406,43 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
+
+static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
+ ext4_group_t group, int cr);
+
+static int ext4_try_to_trim_range(struct super_block *sb,
+ struct ext4_buddy *e4b, ext4_grpblk_t start,
+ ext4_grpblk_t max, ext4_grpblk_t minblocks);
+
+/*
+ * The algorithm using this percpu seq counter goes below:
+ * 1. We sample the percpu discard_pa_seq counter before trying for block
+ * allocation in ext4_mb_new_blocks().
+ * 2. We increment this percpu discard_pa_seq counter when we either allocate
+ * or free these blocks i.e. while marking those blocks as used/free in
+ * mb_mark_used()/mb_free_blocks().
+ * 3. We also increment this percpu seq counter when we successfully identify
+ * that the bb_prealloc_list is not empty and hence proceed for discarding
+ * of those PAs inside ext4_mb_discard_group_preallocations().
+ *
+ * Now to make sure that the regular fast path of block allocation is not
+ * affected, as a small optimization we only sample the percpu seq counter
+ * on that cpu. Only when the block allocation fails and when freed blocks
+ * found were 0, that is when we sample percpu seq counter for all cpus using
+ * below function ext4_get_discard_pa_seq_sum(). This happens after making
+ * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
+ */
+static DEFINE_PER_CPU(u64, discard_pa_seq);
+static inline u64 ext4_get_discard_pa_seq_sum(void)
+{
+ int __cpu;
+ u64 __seq = 0;
+
+ for_each_possible_cpu(__cpu)
+ __seq += per_cpu(discard_pa_seq, __cpu);
+ return __seq;
+}
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -493,6 +580,8 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
+ if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+ return;
if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
unsigned char *b1, *b2;
int i;
@@ -511,6 +600,31 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
}
}
+static void mb_group_bb_bitmap_alloc(struct super_block *sb,
+ struct ext4_group_info *grp, ext4_group_t group)
+{
+ struct buffer_head *bh;
+
+ grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+ if (!grp->bb_bitmap)
+ return;
+
+ bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR_OR_NULL(bh)) {
+ kfree(grp->bb_bitmap);
+ grp->bb_bitmap = NULL;
+ return;
+ }
+
+ memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
+ put_bh(bh);
+}
+
+static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
+{
+ kfree(grp->bb_bitmap);
+}
+
#else
static inline void mb_free_blocks_double(struct inode *inode,
struct ext4_buddy *e4b, int first, int count)
@@ -526,6 +640,17 @@ static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
return;
}
+
+static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
+ struct ext4_group_info *grp, ext4_group_t group)
+{
+ return;
+}
+
+static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
+{
+ return;
+}
#endif
#ifdef AGGRESSIVE_CHECK
@@ -558,11 +683,8 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
void *buddy;
void *buddy2;
- {
- static int mb_check_counter;
- if (mb_check_counter++ % 100 != 0)
- return 0;
- }
+ if (e4b->bd_info->bb_check_counter++ % 10)
+ return 0;
while (order > 1) {
buddy = mb_find_buddy(e4b, order, &max);
@@ -576,13 +698,10 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
for (i = 0; i < max; i++) {
if (mb_test_bit(i, buddy)) {
- /* only single bit in buddy2 may be 1 */
+ /* only single bit in buddy2 may be 0 */
if (!mb_test_bit(i << 1, buddy2)) {
MB_CHECK_ASSERT(
mb_test_bit((i<<1)+1, buddy2));
- } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
- MB_CHECK_ASSERT(
- mb_test_bit(i << 1, buddy2));
}
continue;
}
@@ -686,6 +805,221 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
}
}
+static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
+{
+ int order;
+
+ /*
+ * We don't bother with a special lists groups with only 1 block free
+ * extents and for completely empty groups.
+ */
+ order = fls(len) - 2;
+ if (order < 0)
+ return 0;
+ if (order == MB_NUM_ORDERS(sb))
+ order--;
+ return order;
+}
+
+/* Move group to appropriate avg_fragment_size list */
+static void
+mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int new_order;
+
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
+ return;
+
+ new_order = mb_avg_fragment_size_order(sb,
+ grp->bb_free / grp->bb_fragments);
+ if (new_order == grp->bb_avg_fragment_size_order)
+ return;
+
+ if (grp->bb_avg_fragment_size_order != -1) {
+ write_lock(&sbi->s_mb_avg_fragment_size_locks[
+ grp->bb_avg_fragment_size_order]);
+ list_del(&grp->bb_avg_fragment_size_node);
+ write_unlock(&sbi->s_mb_avg_fragment_size_locks[
+ grp->bb_avg_fragment_size_order]);
+ }
+ grp->bb_avg_fragment_size_order = new_order;
+ write_lock(&sbi->s_mb_avg_fragment_size_locks[
+ grp->bb_avg_fragment_size_order]);
+ list_add_tail(&grp->bb_avg_fragment_size_node,
+ &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
+ write_unlock(&sbi->s_mb_avg_fragment_size_locks[
+ grp->bb_avg_fragment_size_order]);
+}
+
+/*
+ * Choose next group by traversing largest_free_order lists. Updates *new_cr if
+ * cr level needs an update.
+ */
+static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
+ int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_group_info *iter, *grp;
+ int i;
+
+ if (ac->ac_status == AC_STATUS_FOUND)
+ return;
+
+ if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
+ atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
+
+ grp = NULL;
+ for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+ if (list_empty(&sbi->s_mb_largest_free_orders[i]))
+ continue;
+ read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
+ if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
+ read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+ continue;
+ }
+ grp = NULL;
+ list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
+ bb_largest_free_order_node) {
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
+ if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
+ grp = iter;
+ break;
+ }
+ }
+ read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+ if (grp)
+ break;
+ }
+
+ if (!grp) {
+ /* Increment cr and search again */
+ *new_cr = 1;
+ } else {
+ *group = grp->bb_group;
+ ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
+ }
+}
+
+/*
+ * Choose next group by traversing average fragment size list of suitable
+ * order. Updates *new_cr if cr level needs an update.
+ */
+static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
+ int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_group_info *grp = NULL, *iter;
+ int i;
+
+ if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
+ if (sbi->s_mb_stats)
+ atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
+ }
+
+ for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
+ i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+ if (list_empty(&sbi->s_mb_avg_fragment_size[i]))
+ continue;
+ read_lock(&sbi->s_mb_avg_fragment_size_locks[i]);
+ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) {
+ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+ continue;
+ }
+ list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
+ bb_avg_fragment_size_node) {
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
+ if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) {
+ grp = iter;
+ break;
+ }
+ }
+ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+ if (grp)
+ break;
+ }
+
+ if (grp) {
+ *group = grp->bb_group;
+ ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
+ } else {
+ *new_cr = 2;
+ }
+}
+
+static inline int should_optimize_scan(struct ext4_allocation_context *ac)
+{
+ if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
+ return 0;
+ if (ac->ac_criteria >= 2)
+ return 0;
+ if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
+ return 0;
+ return 1;
+}
+
+/*
+ * Return next linear group for allocation. If linear traversal should not be
+ * performed, this function just returns the same group
+ */
+static int
+next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
+{
+ if (!should_optimize_scan(ac))
+ goto inc_and_return;
+
+ if (ac->ac_groups_linear_remaining) {
+ ac->ac_groups_linear_remaining--;
+ goto inc_and_return;
+ }
+
+ return group;
+inc_and_return:
+ /*
+ * Artificially restricted ngroups for non-extent
+ * files makes group > ngroups possible on first loop.
+ */
+ return group + 1 >= ngroups ? 0 : group + 1;
+}
+
+/*
+ * ext4_mb_choose_next_group: choose next group for allocation.
+ *
+ * @ac Allocation Context
+ * @new_cr This is an output parameter. If the there is no good group
+ * available at current CR level, this field is updated to indicate
+ * the new cr level that should be used.
+ * @group This is an input / output parameter. As an input it indicates the
+ * next group that the allocator intends to use for allocation. As
+ * output, this field indicates the next group that should be used as
+ * determined by the optimization functions.
+ * @ngroups Total number of groups
+ */
+static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
+ int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+ *new_cr = ac->ac_criteria;
+
+ if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
+ *group = next_linear_group(ac, *group, ngroups);
+ return;
+ }
+
+ if (*new_cr == 0) {
+ ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
+ } else if (*new_cr == 1) {
+ ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
+ } else {
+ /*
+ * TODO: For CR=2, we can arrange groups in an rb tree sorted by
+ * bb_free. But until that happens, we should never come here.
+ */
+ WARN_ON(1);
+ }
+}
+
/*
* Cache the order of the largest free extent we have available in this block
* group.
@@ -693,17 +1027,34 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
int i;
- int bits;
-
- grp->bb_largest_free_order = -1; /* uninit */
- bits = sb->s_blocksize_bits + 1;
- for (i = bits; i >= 0; i--) {
- if (grp->bb_counters[i] > 0) {
- grp->bb_largest_free_order = i;
+ for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
+ if (grp->bb_counters[i] > 0)
break;
- }
+ /* No need to move between order lists? */
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
+ i == grp->bb_largest_free_order) {
+ grp->bb_largest_free_order = i;
+ return;
+ }
+
+ if (grp->bb_largest_free_order >= 0) {
+ write_lock(&sbi->s_mb_largest_free_orders_locks[
+ grp->bb_largest_free_order]);
+ list_del_init(&grp->bb_largest_free_order_node);
+ write_unlock(&sbi->s_mb_largest_free_orders_locks[
+ grp->bb_largest_free_order]);
+ }
+ grp->bb_largest_free_order = i;
+ if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
+ write_lock(&sbi->s_mb_largest_free_orders_locks[
+ grp->bb_largest_free_order]);
+ list_add_tail(&grp->bb_largest_free_order_node,
+ &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
+ write_unlock(&sbi->s_mb_largest_free_orders_locks[
+ grp->bb_largest_free_order]);
}
}
@@ -754,32 +1105,13 @@ void ext4_mb_generate_buddy(struct super_block *sb,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_set_largest_free_order(sb, grp);
+ mb_update_avg_fragment_size(sb, grp);
clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
period = get_cycles() - period;
- spin_lock(&sbi->s_bal_lock);
- sbi->s_mb_buddies_generated++;
- sbi->s_mb_generation_time += period;
- spin_unlock(&sbi->s_bal_lock);
-}
-
-static void mb_regenerate_buddy(struct ext4_buddy *e4b)
-{
- int count;
- int order = 1;
- void *buddy;
-
- while ((buddy = mb_find_buddy(e4b, order++, &count))) {
- ext4_set_bits(buddy, 0, count);
- }
- e4b->bd_info->bb_fragments = 0;
- memset(e4b->bd_info->bb_counters, 0,
- sizeof(*e4b->bd_info->bb_counters) *
- (e4b->bd_sb->s_blocksize_bits + 2));
-
- ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
- e4b->bd_bitmap, e4b->bd_group);
+ atomic_inc(&sbi->s_mb_buddies_generated);
+ atomic64_add(period, &sbi->s_mb_generation_time);
}
/* The buddy information is attached the buddy cache inode
@@ -820,14 +1152,14 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
char *bitmap;
struct ext4_group_info *grinfo;
- mb_debug(1, "init page %lu\n", page->index);
-
inode = page->mapping->host;
sb = inode->i_sb;
ngroups = ext4_get_groups_count(sb);
blocksize = i_blocksize(inode);
blocks_per_page = PAGE_SIZE / blocksize;
+ mb_debug(sb, "init page %lu\n", page->index);
+
groups_per_page = blocks_per_page >> 1;
if (groups_per_page == 0)
groups_per_page = 1;
@@ -861,13 +1193,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
bh[i] = NULL;
continue;
}
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
if (IS_ERR(bh[i])) {
err = PTR_ERR(bh[i]);
bh[i] = NULL;
goto out;
}
- mb_debug(1, "read bitmap for group %u\n", group);
+ mb_debug(sb, "read bitmap for group %u\n", group);
}
/* wait for I/O completion */
@@ -912,14 +1244,14 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
- mb_debug(1, "put buddy for group %u in page %lu/%x\n",
+ mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
trace_ext4_mb_buddy_bitmap_load(sb, group);
grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
sizeof(*grinfo->bb_counters) *
- (sb->s_blocksize_bits+2));
+ (MB_NUM_ORDERS(sb)));
/*
* incore got set to the group block bitmap below
*/
@@ -932,7 +1264,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
} else {
/* this is block of bitmap */
BUG_ON(incore != NULL);
- mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
+ mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
trace_ext4_mb_bitmap_load(sb, group);
@@ -1038,7 +1370,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
int ret = 0;
might_sleep();
- mb_debug(1, "init group %u\n", group);
+ mb_debug(sb, "init group %u\n", group);
this_grp = ext4_get_group_info(sb, group);
/*
* This ensures that we don't reinit the buddy cache
@@ -1110,7 +1442,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
struct inode *inode = sbi->s_buddy_cache;
might_sleep();
- mb_debug(1, "load group %u\n", group);
+ mb_debug(sb, "load group %u\n", group);
blocks_per_page = PAGE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
@@ -1218,9 +1550,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- BUG_ON(e4b->bd_bitmap_page == NULL);
- BUG_ON(e4b->bd_buddy_page == NULL);
-
return 0;
err:
@@ -1252,22 +1581,18 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
- int order = 1;
- int bb_incr = 1 << (e4b->bd_blkbits - 1);
+ int order = 1, max;
void *bb;
BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
- bb = e4b->bd_buddy;
while (order <= e4b->bd_blkbits + 1) {
- block = block >> 1;
- if (!mb_test_bit(block, bb)) {
+ bb = mb_find_buddy(e4b, order, &max);
+ if (!mb_test_bit(block >> order, bb)) {
/* this block is part of buddy of order 'order' */
return order;
}
- bb += bb_incr;
- bb_incr >>= 1;
order++;
}
return 0;
@@ -1318,7 +1643,7 @@ static int mb_test_and_clear_bits(void *bm, int cur, int len)
return zero_bit;
}
-void ext4_set_bits(void *bm, int cur, int len)
+void mb_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1336,9 +1661,6 @@ void ext4_set_bits(void *bm, int cur, int len)
}
}
-/*
- * _________________________________________________________________ */
-
static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
{
if (mb_test_bit(*bit + side, bitmap)) {
@@ -1430,6 +1752,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
+ this_cpu_inc(discard_pa_seq);
e4b->bd_info->bb_free += count;
if (first < e4b->bd_info->bb_first_free)
e4b->bd_info->bb_first_free = first;
@@ -1449,15 +1772,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(sbi, block);
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block "
- "(bit %u); block bitmap corrupt.",
- block);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing already freed block (bit %u); block bitmap corrupt.",
+ block);
+ ext4_mark_group_bitmap_corrupted(
+ sb, e4b->bd_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
- mb_regenerate_buddy(e4b);
+ }
goto done;
}
@@ -1487,6 +1811,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
done:
mb_set_largest_free_order(sb, e4b->bd_info);
+ mb_update_avg_fragment_size(sb, e4b->bd_info);
mb_check_buddy(e4b);
}
@@ -1542,10 +1867,11 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
/* Should never happen! (but apparently sometimes does?!?) */
WARN_ON(1);
- ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
- "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
- block, order, needed, ex->fe_group, ex->fe_start,
- ex->fe_len, ex->fe_logical);
+ ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
+ "corruption or bug in mb_find_extent "
+ "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
+ block, order, needed, ex->fe_group, ex->fe_start,
+ ex->fe_len, ex->fe_logical);
ex->fe_len = 0;
ex->fe_start = 0;
ex->fe_group = 0;
@@ -1564,6 +1890,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
unsigned ret = 0;
int len0 = len;
void *buddy;
+ bool split = false;
BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
BUG_ON(e4b->bd_group != ex->fe_group);
@@ -1571,6 +1898,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
mb_check_buddy(e4b);
mb_mark_used_double(e4b, start, len);
+ this_cpu_inc(discard_pa_seq);
e4b->bd_info->bb_free -= len;
if (e4b->bd_info->bb_first_free == start)
e4b->bd_info->bb_first_free += len;
@@ -1587,12 +1915,16 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
/* let's maintain buddy itself */
while (len) {
- ord = mb_find_order_for_block(e4b, start);
+ if (!split)
+ ord = mb_find_order_for_block(e4b, start);
if (((start >> ord) << ord) == start && len >= (1 << ord)) {
/* the whole chunk may be allocated at once! */
mlen = 1 << ord;
- buddy = mb_find_buddy(e4b, ord, &max);
+ if (!split)
+ buddy = mb_find_buddy(e4b, ord, &max);
+ else
+ split = false;
BUG_ON((start >> ord) >= max);
mb_set_bit(start >> ord, buddy);
e4b->bd_info->bb_counters[ord]--;
@@ -1619,10 +1951,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
mb_clear_bit(cur + 1, buddy);
e4b->bd_info->bb_counters[ord]++;
e4b->bd_info->bb_counters[ord]++;
+ split = true;
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
- ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
+ mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
+ mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
mb_check_buddy(e4b);
return ret;
@@ -1670,11 +2004,15 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
spin_unlock(&sbi->s_md_lock);
}
-}
+ /*
+ * As we've just preallocated more space than
+ * user requested originally, we store allocated
+ * space in a special descriptor.
+ */
+ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ ext4_mb_new_preallocation(ac);
-/*
- * regular allocator, for general purposes allocation
- */
+}
static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b,
@@ -1893,7 +2231,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
int max;
BUG_ON(ac->ac_2order <= 0);
- for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
+ for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
if (grp->bb_counters[i] == 0)
continue;
@@ -1901,8 +2239,15 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
BUG_ON(buddy == NULL);
k = mb_find_next_zero_bit(buddy, max, 0);
- BUG_ON(k >= max);
-
+ if (k >= max) {
+ ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
+ "%d free clusters of order %d. But found 0",
+ grp->bb_counters[i], i);
+ ext4_mark_group_bitmap_corrupted(ac->ac_sb,
+ e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+ break;
+ }
ac->ac_found++;
ac->ac_b_ex.fe_len = 1 << i;
@@ -1911,7 +2256,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
- BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
+ BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
if (EXT4_SB(sb)->s_mb_stats)
atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
@@ -1936,7 +2281,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
int free;
free = e4b->bd_info->bb_free;
- BUG_ON(free <= 0);
+ if (WARN_ON(free <= 0))
+ return;
i = e4b->bd_info->bb_first_free;
@@ -1947,7 +2293,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
/*
* IF we have corrupt bitmap, we won't find any
* free blocks even though group info says we
- * we have free blocks
+ * have free blocks
*/
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
@@ -1959,7 +2305,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
}
mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
- BUG_ON(ex.fe_len <= 0);
+ if (WARN_ON(ex.fe_len <= 0))
+ break;
if (free < ex.fe_len) {
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
@@ -2026,39 +2373,29 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
}
/*
- * This is now called BEFORE we load the buddy bitmap.
+ * This is also called BEFORE we load the buddy bitmap.
* Returns either 1 or 0 indicating that the group is either suitable
- * for the allocation or not. In addition it can also return negative
- * error code when something goes wrong.
+ * for the allocation or not.
*/
-static int ext4_mb_good_group(struct ext4_allocation_context *ac,
+static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
ext4_group_t group, int cr)
{
- unsigned free, fragments;
+ ext4_grpblk_t free, fragments;
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
BUG_ON(cr < 0 || cr >= 4);
- free = grp->bb_free;
- if (free == 0)
- return 0;
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
- return 0;
-
if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
- return 0;
+ return false;
- /* We only do this if the grp has never been initialized */
- if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
- if (ret)
- return ret;
- }
+ free = grp->bb_free;
+ if (free == 0)
+ return false;
fragments = grp->bb_fragments;
if (fragments == 0)
- return 0;
+ return false;
switch (cr) {
case 0:
@@ -2068,42 +2405,201 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
- return 0;
+ return false;
+
+ if (free < ac->ac_g_ex.fe_len)
+ return false;
- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
- (free / fragments) >= ac->ac_g_ex.fe_len)
- return 1;
+ if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
+ return true;
if (grp->bb_largest_free_order < ac->ac_2order)
- return 0;
+ return false;
- return 1;
+ return true;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
- return 1;
+ return true;
break;
case 2:
if (free >= ac->ac_g_ex.fe_len)
- return 1;
+ return true;
break;
case 3:
- return 1;
+ return true;
default:
BUG();
}
- return 0;
+ return false;
+}
+
+/*
+ * This could return negative error code if something goes wrong
+ * during ext4_mb_init_group(). This should not be called with
+ * ext4_lock_group() held.
+ *
+ * Note: because we are conditionally operating with the group lock in
+ * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
+ * function using __acquire and __release. This means we need to be
+ * super careful before messing with the error path handling via "goto
+ * out"!
+ */
+static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
+ ext4_group_t group, int cr)
+{
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
+ ext4_grpblk_t free;
+ int ret = 0;
+
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
+ if (should_lock) {
+ ext4_lock_group(sb, group);
+ __release(ext4_group_lock_ptr(sb, group));
+ }
+ free = grp->bb_free;
+ if (free == 0)
+ goto out;
+ if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ goto out;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ goto out;
+ if (should_lock) {
+ __acquire(ext4_group_lock_ptr(sb, group));
+ ext4_unlock_group(sb, group);
+ }
+
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ struct ext4_group_desc *gdp =
+ ext4_get_group_desc(sb, group, NULL);
+ int ret;
+
+ /* cr=0/1 is a very optimistic search to find large
+ * good chunks almost for free. If buddy data is not
+ * ready, then this optimization makes no sense. But
+ * we never skip the first block group in a flex_bg,
+ * since this gets used for metadata block allocation,
+ * and we want to make sure we locate metadata blocks
+ * in the first block group in the flex_bg if possible.
+ */
+ if (cr < 2 &&
+ (!sbi->s_log_groups_per_flex ||
+ ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
+ return 0;
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+
+ if (should_lock) {
+ ext4_lock_group(sb, group);
+ __release(ext4_group_lock_ptr(sb, group));
+ }
+ ret = ext4_mb_good_group(ac, group, cr);
+out:
+ if (should_lock) {
+ __acquire(ext4_group_lock_ptr(sb, group));
+ ext4_unlock_group(sb, group);
+ }
+ return ret;
+}
+
+/*
+ * Start prefetching @nr block bitmaps starting at @group.
+ * Return the next group which needs to be prefetched.
+ */
+ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
+ unsigned int nr, int *cnt)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct buffer_head *bh;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ /*
+ * Prefetch block groups with free blocks; but don't
+ * bother if it is marked uninitialized on disk, since
+ * it won't require I/O to read. Also only try to
+ * prefetch once, so we avoid getblk() call, which can
+ * be expensive.
+ */
+ if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+ EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
+ if (bh && !IS_ERR(bh)) {
+ if (!buffer_uptodate(bh) && cnt)
+ (*cnt)++;
+ brelse(bh);
+ }
+ }
+ if (++group >= ngroups)
+ group = 0;
+ }
+ blk_finish_plug(&plug);
+ return group;
+}
+
+/*
+ * Prefetching reads the block bitmap into the buffer cache; but we
+ * need to make sure that the buddy bitmap in the page cache has been
+ * initialized. Note that ext4_mb_init_group() will block if the I/O
+ * is not yet completed, or indeed if it was not initiated by
+ * ext4_mb_prefetch did not start the I/O.
+ *
+ * TODO: We should actually kick off the buddy bitmap setup in a work
+ * queue when the buffer I/O is completed, so that we don't block
+ * waiting for the block allocation bitmap read to finish when
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
+ */
+void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr)
+{
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ if (!group)
+ group = ext4_get_groups_count(sb);
+ group--;
+ grp = ext4_get_group_info(sb, group);
+
+ if (EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ if (ext4_mb_init_group(sb, group, GFP_NOFS))
+ break;
+ }
+ }
}
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t ngroups, group, i;
- int cr;
+ ext4_group_t prefetch_grp = 0, ngroups, group, i;
+ int cr = -1, new_cr;
int err = 0, first_err = 0;
+ unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
+ int lost;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
@@ -2123,8 +2619,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
goto out;
/*
- * ac->ac2_order is set only if the fe_len is a power of 2
- * if ac2_order is set we also set criteria to 0 so that we
+ * ac->ac_2order is set only if the fe_len is a power of 2
+ * if ac->ac_2order is set we also set criteria to 0 so that we
* try exact allocation using buddy.
*/
i = fls(ac->ac_g_ex.fe_len);
@@ -2136,13 +2632,13 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* We also support searching for power-of-two requests only for
* requests upto maximum buddy size we have constructed.
*/
- if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
+ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
/*
* This should tell if fe_len is exactly power of 2
*/
if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
ac->ac_2order = array_index_nospec(i - 1,
- sb->s_blocksize_bits + 2);
+ MB_NUM_ORDERS(sb));
}
/* if stream allocation is enabled, use global goal */
@@ -2168,19 +2664,44 @@ repeat:
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
+ ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
+ prefetch_grp = group;
- for (i = 0; i < ngroups; group++, i++) {
+ for (i = 0, new_cr = cr; i < ngroups; i++,
+ ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
int ret = 0;
+
cond_resched();
+ if (new_cr != cr) {
+ cr = new_cr;
+ goto repeat;
+ }
+
/*
- * Artificially restricted ngroups for non-extent
- * files makes group > ngroups possible on first loop.
+ * Batch reads of the block allocation bitmaps
+ * to get multiple READs in flight; limit
+ * prefetching at cr=0/1, otherwise mballoc can
+ * spend a lot of time loading imperfect groups
*/
- if (group >= ngroups)
- group = 0;
+ if ((prefetch_grp == group) &&
+ (cr > 1 ||
+ prefetch_ios < sbi->s_mb_prefetch_limit)) {
+ unsigned int curr_ios = prefetch_ios;
+
+ nr = sbi->s_mb_prefetch;
+ if (ext4_has_feature_flex_bg(sb)) {
+ nr = 1 << sbi->s_log_groups_per_flex;
+ nr -= group & (nr - 1);
+ nr = min(nr, sbi->s_mb_prefetch);
+ }
+ prefetch_grp = ext4_mb_prefetch(sb, group,
+ nr, &prefetch_ios);
+ if (prefetch_ios == curr_ios)
+ nr = 0;
+ }
/* This now checks without needing the buddy page */
- ret = ext4_mb_good_group(ac, group, cr);
+ ret = ext4_mb_good_group_nolock(ac, group, cr);
if (ret <= 0) {
if (!first_err)
first_err = ret;
@@ -2198,11 +2719,9 @@ repeat:
* block group
*/
ret = ext4_mb_good_group(ac, group, cr);
- if (ret <= 0) {
+ if (ret == 0) {
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
- if (!first_err)
- first_err = ret;
continue;
}
@@ -2221,6 +2740,9 @@ repeat:
if (ac->ac_status != AC_STATUS_CONTINUE)
break;
}
+ /* Processed all groups and haven't found blocks */
+ if (sbi->s_mb_stats && i == ngroups)
+ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
}
if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2229,34 +2751,47 @@ repeat:
* We've been searching too long. Let's try to allocate
* the best chunk we've found so far
*/
-
ext4_mb_try_best_found(ac, &e4b);
if (ac->ac_status != AC_STATUS_FOUND) {
/*
* Someone more lucky has already allocated it.
* The only thing we can do is just take first
* found block(s)
- printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
*/
+ lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
+ mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len, lost);
+
ac->ac_b_ex.fe_group = 0;
ac->ac_b_ex.fe_start = 0;
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
cr = 3;
- atomic_inc(&sbi->s_mb_lost_chunks);
goto repeat;
}
}
+
+ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
+ atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
out:
if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
err = first_err;
+
+ mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
+ ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
+ ac->ac_flags, cr, err);
+
+ if (nr)
+ ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+
return err;
}
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group;
if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
@@ -2267,7 +2802,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group;
++*pos;
@@ -2279,7 +2814,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
{
- struct super_block *sb = PDE_DATA(file_inode(seq->file));
+ struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group = (ext4_group_t) ((unsigned long) v);
int i;
int err, buddy_loaded = 0;
@@ -2323,7 +2858,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
sg.info.bb_counters[i] : 0);
- seq_printf(seq, " ]\n");
+ seq_puts(seq, " ]\n");
return 0;
}
@@ -2339,6 +2874,148 @@ const struct seq_operations ext4_mb_seq_groups_ops = {
.show = ext4_mb_seq_groups_show,
};
+int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ seq_puts(seq, "mballoc:\n");
+ if (!sbi->s_mb_stats) {
+ seq_puts(seq, "\tmb stats collection turned off.\n");
+ seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
+ return 0;
+ }
+ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+
+ seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned));
+
+ seq_puts(seq, "\tcr0_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[0]));
+ seq_printf(seq, "\t\tbad_suggestions: %u\n",
+ atomic_read(&sbi->s_bal_cr0_bad_suggestions));
+
+ seq_puts(seq, "\tcr1_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[1]));
+ seq_printf(seq, "\t\tbad_suggestions: %u\n",
+ atomic_read(&sbi->s_bal_cr1_bad_suggestions));
+
+ seq_puts(seq, "\tcr2_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[2]));
+
+ seq_puts(seq, "\tcr3_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[3]));
+ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
+ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+
+ seq_printf(seq, "\tbuddies_generated: %u/%u\n",
+ atomic_read(&sbi->s_mb_buddies_generated),
+ ext4_get_groups_count(sb));
+ seq_printf(seq, "\tbuddies_time_used: %llu\n",
+ atomic64_read(&sbi->s_mb_generation_time));
+ seq_printf(seq, "\tpreallocated: %u\n",
+ atomic_read(&sbi->s_mb_preallocated));
+ seq_printf(seq, "\tdiscarded: %u\n",
+ atomic_read(&sbi->s_mb_discarded));
+ return 0;
+}
+
+static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
+__acquires(&EXT4_SB(sb)->s_mb_rb_lock)
+{
+ struct super_block *sb = pde_data(file_inode(seq->file));
+ unsigned long position;
+
+ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
+ return NULL;
+ position = *pos + 1;
+ return (void *) ((unsigned long) position);
+}
+
+static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct super_block *sb = pde_data(file_inode(seq->file));
+ unsigned long position;
+
+ ++*pos;
+ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
+ return NULL;
+ position = *pos + 1;
+ return (void *) ((unsigned long) position);
+}
+
+static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
+{
+ struct super_block *sb = pde_data(file_inode(seq->file));
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned long position = ((unsigned long) v);
+ struct ext4_group_info *grp;
+ unsigned int count;
+
+ position--;
+ if (position >= MB_NUM_ORDERS(sb)) {
+ position -= MB_NUM_ORDERS(sb);
+ if (position == 0)
+ seq_puts(seq, "avg_fragment_size_lists:\n");
+
+ count = 0;
+ read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
+ list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
+ bb_avg_fragment_size_node)
+ count++;
+ read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
+ seq_printf(seq, "\tlist_order_%u_groups: %u\n",
+ (unsigned int)position, count);
+ return 0;
+ }
+
+ if (position == 0) {
+ seq_printf(seq, "optimize_scan: %d\n",
+ test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
+ seq_puts(seq, "max_free_order_lists:\n");
+ }
+ count = 0;
+ read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
+ list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
+ bb_largest_free_order_node)
+ count++;
+ read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
+ seq_printf(seq, "\tlist_order_%u_groups: %u\n",
+ (unsigned int)position, count);
+
+ return 0;
+}
+
+static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
+{
+}
+
+const struct seq_operations ext4_mb_seq_structs_summary_ops = {
+ .start = ext4_mb_seq_structs_summary_start,
+ .next = ext4_mb_seq_structs_summary_next,
+ .stop = ext4_mb_seq_structs_summary_stop,
+ .show = ext4_mb_seq_structs_summary_show,
+};
+
static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
{
int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
@@ -2379,7 +3056,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
if (old_groupinfo)
ext4_kvfree_array_rcu(old_groupinfo);
- ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
+ ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
sbi->s_group_info_size);
return 0;
}
@@ -2441,22 +3118,13 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root = RB_ROOT;
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
+ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */
+ meta_group_info[i]->bb_group = group;
-#ifdef DOUBLE_CHECK
- {
- struct buffer_head *bh;
- meta_group_info[i]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_NOFS);
- BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
- bh = ext4_read_block_bitmap(sb, group);
- BUG_ON(IS_ERR_OR_NULL(bh));
- memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
- sb->s_blocksize);
- put_bh(bh);
- }
-#endif
-
+ mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
return 0;
exit_group_info:
@@ -2510,6 +3178,34 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freebuddy;
}
+ if (ext4_has_feature_flex_bg(sb)) {
+ /* a single flex group is supposed to be read by a single IO.
+ * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
+ * unsigned integer, so the maximum shift is 32.
+ */
+ if (sbi->s_es->s_log_groups_per_flex >= 32) {
+ ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
+ goto err_freebuddy;
+ }
+ sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
+ BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+ } else {
+ sbi->s_mb_prefetch = 32;
+ }
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+ /* now many real IOs to prefetch within a single allocation at cr=0
+ * given cr=0 is an CPU-related optimization we shouldn't try to
+ * load too many groups, at some point we should start to use what
+ * we've got in memory.
+ * with an average random access time 5ms, it'd take a second to get
+ * 200 groups (* N with flex_bg), so let's make this limit 4
+ */
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
return 0;
err_freebuddy:
@@ -2579,6 +3275,57 @@ static int ext4_groupinfo_create_slab(size_t size)
return 0;
}
+static void ext4_discard_work(struct work_struct *work)
+{
+ struct ext4_sb_info *sbi = container_of(work,
+ struct ext4_sb_info, s_discard_work);
+ struct super_block *sb = sbi->s_sb;
+ struct ext4_free_data *fd, *nfd;
+ struct ext4_buddy e4b;
+ struct list_head discard_list;
+ ext4_group_t grp, load_grp;
+ int err = 0;
+
+ INIT_LIST_HEAD(&discard_list);
+ spin_lock(&sbi->s_md_lock);
+ list_splice_init(&sbi->s_discard_list, &discard_list);
+ spin_unlock(&sbi->s_md_lock);
+
+ load_grp = UINT_MAX;
+ list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
+ /*
+ * If filesystem is umounting or no memory or suffering
+ * from no space, give up the discard
+ */
+ if ((sb->s_flags & SB_ACTIVE) && !err &&
+ !atomic_read(&sbi->s_retry_alloc_pending)) {
+ grp = fd->efd_group;
+ if (grp != load_grp) {
+ if (load_grp != UINT_MAX)
+ ext4_mb_unload_buddy(&e4b);
+
+ err = ext4_mb_load_buddy(sb, grp, &e4b);
+ if (err) {
+ kmem_cache_free(ext4_free_data_cachep, fd);
+ load_grp = UINT_MAX;
+ continue;
+ } else {
+ load_grp = grp;
+ }
+ }
+
+ ext4_lock_group(sb, grp);
+ ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
+ fd->efd_start_cluster + fd->efd_count - 1, 1);
+ ext4_unlock_group(sb, grp);
+ }
+ kmem_cache_free(ext4_free_data_cachep, fd);
+ }
+
+ if (load_grp != UINT_MAX)
+ ext4_mb_unload_buddy(&e4b);
+}
+
int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2587,7 +3334,7 @@ int ext4_mb_init(struct super_block *sb)
unsigned max;
int ret;
- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
+ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_offsets == NULL) {
@@ -2595,7 +3342,7 @@ int ext4_mb_init(struct super_block *sb)
goto out;
}
- i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
+ i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
ret = -ENOMEM;
@@ -2621,18 +3368,58 @@ int ext4_mb_init(struct super_block *sb)
offset_incr = offset_incr >> 1;
max = max >> 1;
i++;
- } while (i <= sb->s_blocksize_bits + 1);
+ } while (i < MB_NUM_ORDERS(sb));
+
+ sbi->s_mb_avg_fragment_size =
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!sbi->s_mb_avg_fragment_size) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sbi->s_mb_avg_fragment_size_locks =
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
+ GFP_KERNEL);
+ if (!sbi->s_mb_avg_fragment_size_locks) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
+ INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
+ rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
+ }
+ sbi->s_mb_largest_free_orders =
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!sbi->s_mb_largest_free_orders) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sbi->s_mb_largest_free_orders_locks =
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
+ GFP_KERNEL);
+ if (!sbi->s_mb_largest_free_orders_locks) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
+ INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
+ rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
+ }
spin_lock_init(&sbi->s_md_lock);
- spin_lock_init(&sbi->s_bal_lock);
sbi->s_mb_free_pending = 0;
INIT_LIST_HEAD(&sbi->s_freed_data_list);
+ INIT_LIST_HEAD(&sbi->s_discard_list);
+ INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
+ atomic_set(&sbi->s_retry_alloc_pending, 0);
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
@@ -2674,6 +3461,10 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&lg->lg_prealloc_lock);
}
+ if (bdev_nonrot(sb->s_bdev))
+ sbi->s_mb_max_linear_groups = 0;
+ else
+ sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
/* init file for buddy data */
ret = ext4_mb_init_backend(sb);
if (ret != 0)
@@ -2685,6 +3476,10 @@ out_free_locality_groups:
free_percpu(sbi->s_locality_groups);
sbi->s_locality_groups = NULL;
out:
+ kfree(sbi->s_mb_avg_fragment_size);
+ kfree(sbi->s_mb_avg_fragment_size_locks);
+ kfree(sbi->s_mb_largest_free_orders);
+ kfree(sbi->s_mb_largest_free_orders_locks);
kfree(sbi->s_mb_offsets);
sbi->s_mb_offsets = NULL;
kfree(sbi->s_mb_maxs);
@@ -2693,7 +3488,7 @@ out:
}
/* need to called with the ext4 group lock held */
-static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
+static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
{
struct ext4_prealloc_space *pa;
struct list_head *cur, *tmp;
@@ -2705,9 +3500,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
count++;
kmem_cache_free(ext4_pspace_cachep, pa);
}
- if (count)
- mb_debug(1, "mballoc: %u PAs left\n", count);
-
+ return count;
}
int ext4_mb_release(struct super_block *sb)
@@ -2718,16 +3511,26 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_group_info *grinfo, ***group_info;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+ int count;
+
+ if (test_opt(sb, DISCARD)) {
+ /*
+ * wait the discard work to drain all of ext4_free_data
+ */
+ flush_work(&sbi->s_discard_work);
+ WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
+ }
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
cond_resched();
grinfo = ext4_get_group_info(sb, i);
-#ifdef DOUBLE_CHECK
- kfree(grinfo->bb_bitmap);
-#endif
+ mb_group_bb_bitmap_free(grinfo);
ext4_lock_group(sb, i);
- ext4_mb_cleanup_pa(grinfo);
+ count = ext4_mb_cleanup_pa(grinfo);
+ if (count)
+ mb_debug(sb, "mballoc: %d PAs left\n",
+ count);
ext4_unlock_group(sb, i);
kmem_cache_free(cachep, grinfo);
}
@@ -2741,6 +3544,10 @@ int ext4_mb_release(struct super_block *sb)
kvfree(group_info);
rcu_read_unlock();
}
+ kfree(sbi->s_mb_avg_fragment_size);
+ kfree(sbi->s_mb_avg_fragment_size_locks);
+ kfree(sbi->s_mb_largest_free_orders);
+ kfree(sbi->s_mb_largest_free_orders_locks);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
iput(sbi->s_buddy_cache);
@@ -2751,17 +3558,18 @@ int ext4_mb_release(struct super_block *sb)
atomic_read(&sbi->s_bal_reqs),
atomic_read(&sbi->s_bal_success));
ext4_msg(sb, KERN_INFO,
- "mballoc: %u extents scanned, %u goal hits, "
+ "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
"%u 2^N hits, %u breaks, %u lost",
atomic_read(&sbi->s_bal_ex_scanned),
+ atomic_read(&sbi->s_bal_groups_scanned),
atomic_read(&sbi->s_bal_goals),
atomic_read(&sbi->s_bal_2orders),
atomic_read(&sbi->s_bal_breaks),
atomic_read(&sbi->s_mb_lost_chunks));
ext4_msg(sb, KERN_INFO,
- "mballoc: %lu generated and it took %Lu",
- sbi->s_mb_buddies_generated,
- sbi->s_mb_generation_time);
+ "mballoc: %u generated and it took %llu",
+ atomic_read(&sbi->s_mb_buddies_generated),
+ atomic64_read(&sbi->s_mb_generation_time));
ext4_msg(sb, KERN_INFO,
"mballoc: %u preallocated, %u discarded",
atomic_read(&sbi->s_mb_preallocated),
@@ -2788,7 +3596,7 @@ static inline int ext4_issue_discard(struct super_block *sb,
return __blkdev_issue_discard(sb->s_bdev,
(sector_t)discard_block << (sb->s_blocksize_bits - 9),
(sector_t)count << (sb->s_blocksize_bits - 9),
- GFP_NOFS, 0, biop);
+ GFP_NOFS, biop);
} else
return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
@@ -2800,7 +3608,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
- mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+ mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
entry->efd_count, entry->efd_group, entry);
err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
@@ -2837,10 +3645,10 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
put_page(e4b.bd_bitmap_page);
}
ext4_unlock_group(sb, entry->efd_group);
- kmem_cache_free(ext4_free_data_cachep, entry);
ext4_mb_unload_buddy(&e4b);
- mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
+ mb_debug(sb, "freed %d blocks in %d structures\n", count,
+ count2);
}
/*
@@ -2851,10 +3659,9 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_free_data *entry, *tmp;
- struct bio *discard_bio = NULL;
struct list_head freed_data_list;
struct list_head *cut_pos = NULL;
- int err;
+ bool wake;
INIT_LIST_HEAD(&freed_data_list);
@@ -2869,30 +3676,20 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
cut_pos);
spin_unlock(&sbi->s_md_lock);
- if (test_opt(sb, DISCARD)) {
- list_for_each_entry(entry, &freed_data_list, efd_list) {
- err = ext4_issue_discard(sb, entry->efd_group,
- entry->efd_start_cluster,
- entry->efd_count,
- &discard_bio);
- if (err && err != -EOPNOTSUPP) {
- ext4_msg(sb, KERN_WARNING, "discard request in"
- " group:%d block:%d count:%d failed"
- " with %d", entry->efd_group,
- entry->efd_start_cluster,
- entry->efd_count, err);
- } else if (err == -EOPNOTSUPP)
- break;
- }
+ list_for_each_entry(entry, &freed_data_list, efd_list)
+ ext4_free_data_in_buddy(sb, entry);
- if (discard_bio) {
- submit_bio_wait(discard_bio);
- bio_put(discard_bio);
- }
+ if (test_opt(sb, DISCARD)) {
+ spin_lock(&sbi->s_md_lock);
+ wake = list_empty(&sbi->s_discard_list);
+ list_splice_tail(&freed_data_list, &sbi->s_discard_list);
+ spin_unlock(&sbi->s_md_lock);
+ if (wake)
+ queue_work(system_unbound_wq, &sbi->s_discard_work);
+ } else {
+ list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
-
- list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
- ext4_free_data_in_buddy(sb, entry);
}
int __init ext4_init_mballoc(void)
@@ -2900,23 +3697,26 @@ int __init ext4_init_mballoc(void)
ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
SLAB_RECLAIM_ACCOUNT);
if (ext4_pspace_cachep == NULL)
- return -ENOMEM;
+ goto out;
ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
SLAB_RECLAIM_ACCOUNT);
- if (ext4_ac_cachep == NULL) {
- kmem_cache_destroy(ext4_pspace_cachep);
- return -ENOMEM;
- }
+ if (ext4_ac_cachep == NULL)
+ goto out_pa_free;
ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
SLAB_RECLAIM_ACCOUNT);
- if (ext4_free_data_cachep == NULL) {
- kmem_cache_destroy(ext4_pspace_cachep);
- kmem_cache_destroy(ext4_ac_cachep);
- return -ENOMEM;
- }
+ if (ext4_free_data_cachep == NULL)
+ goto out_ac_free;
+
return 0;
+
+out_ac_free:
+ kmem_cache_destroy(ext4_ac_cachep);
+out_pa_free:
+ kmem_cache_destroy(ext4_pspace_cachep);
+out:
+ return -ENOMEM;
}
void ext4_exit_mballoc(void)
@@ -2963,7 +3763,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
if (err)
goto out_err;
@@ -2976,14 +3777,14 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ext4_free_group_clusters(sb, gdp));
BUFFER_TRACE(gdp_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdp_bh);
+ err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
if (err)
goto out_err;
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (!ext4_data_block_valid(sbi, block, len)) {
+ if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
"fs metadata", block, block+len);
/* File system mounted not to panic on error
@@ -2991,7 +3792,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
* We leak some of the blocks here.
*/
ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3010,7 +3811,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
}
#endif
- ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len);
if (ext4_has_group_desc_csum(sb) &&
(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
@@ -3053,6 +3854,118 @@ out_err:
}
/*
+ * Idempotent helper for Ext4 fast commit replay path to set the state of
+ * blocks in bitmaps and update counters.
+ */
+void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
+ int len, int state)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ int i, err;
+ int already;
+ unsigned int clen, clen_changed, thisgrp_len;
+
+ while (len > 0) {
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
+
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ * In case of flex_bg, this can happen that (block, len) may
+ * span across more than one group. In that case we need to
+ * get the corresponding group metadata to work with.
+ * For this we have goto again loop.
+ */
+ thisgrp_len = min_t(unsigned int, (unsigned int)len,
+ EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
+ clen = EXT4_NUM_B2C(sbi, thisgrp_len);
+
+ if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
+ ext4_error(sb, "Marking blocks in system zone - "
+ "Block = %llu, len = %u",
+ block, thisgrp_len);
+ bitmap_bh = NULL;
+ break;
+ }
+
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
+ break;
+ }
+
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ break;
+
+ ext4_lock_group(sb, group);
+ already = 0;
+ for (i = 0; i < clen; i++)
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
+ !state)
+ already++;
+
+ clen_changed = clen - already;
+ if (state)
+ mb_set_bits(bitmap_bh->b_data, blkoff, clen);
+ else
+ mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ }
+ if (state)
+ clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
+ else
+ clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
+
+ ext4_free_group_clusters_set(sb, gdp, clen);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+
+ ext4_unlock_group(sb, group);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
+ s_flex_groups, flex_group);
+
+ if (state)
+ atomic64_sub(clen_changed, &fg->free_clusters);
+ else
+ atomic64_add(clen_changed, &fg->free_clusters);
+
+ }
+
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ break;
+ sync_dirty_buffer(bitmap_bh);
+ err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(gdp_bh);
+ if (err)
+ break;
+
+ block += thisgrp_len;
+ len -= thisgrp_len;
+ brelse(bitmap_bh);
+ BUG_ON(len < 0);
+ }
+
+ if (err)
+ brelse(bitmap_bh);
+}
+
+/*
* here we normalize request for locality group
* Group request are normalized to s_mb_group_prealloc, which goes to
* s_strip if we set the same via mount option.
@@ -3068,8 +3981,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
BUG_ON(lg == NULL);
ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
- mb_debug(1, "#%u: goal %u blocks for locality group\n",
- current->pid, ac->ac_g_ex.fe_len);
+ mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
}
/*
@@ -3162,6 +4074,15 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
size = size >> bsbits;
start = start_off >> bsbits;
+ /*
+ * For tiny groups (smaller than 8MB) the chosen allocation
+ * alignment may be larger than group size. Make sure the
+ * alignment does not move allocation to a different group which
+ * makes mballoc fail assertions later.
+ */
+ start = max(start, rounddown(ac->ac_o_ex.fe_logical,
+ (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
+
/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
size -= ar->lleft + 1 - start;
@@ -3234,7 +4155,22 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
}
rcu_read_unlock();
- if (start + size <= ac->ac_o_ex.fe_logical &&
+ /*
+ * In this function "start" and "size" are normalized for better
+ * alignment and length such that we could preallocate more blocks.
+ * This normalization is done such that original request of
+ * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
+ * "size" boundaries.
+ * (Note fe_len can be relaxed since FS block allocation API does not
+ * provide gurantee on number of contiguous blocks allocation since that
+ * depends upon free space left, etc).
+ * In case of inode pa, later we use the allocated blocks
+ * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated
+ * range of goal/best blocks [start, size] to put it at the
+ * ac_o_ex.fe_logical extent of this inode.
+ * (See ext4_mb_use_inode_pa() for more details)
+ */
+ if (start + size <= ac->ac_o_ex.fe_logical ||
start > ac->ac_o_ex.fe_logical) {
ext4_msg(ac->ac_sb, KERN_ERR,
"start %lu, size %lu, fe_logical %lu",
@@ -3267,20 +4203,21 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
- (unsigned) orig_size, (unsigned) start);
+ mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
+ orig_size, start);
}
static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
+ if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
atomic_inc(&sbi->s_bal_reqs);
atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
atomic_inc(&sbi->s_bal_success);
atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+ atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
atomic_inc(&sbi->s_bal_goals);
@@ -3357,7 +4294,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
- mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
+ mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
}
/*
@@ -3381,7 +4318,8 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
* in on-disk bitmap -- see ext4_mb_release_context()
* Other CPUs are prevented from allocating from this pa by lg_mutex
*/
- mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
+ pa->pa_lstart-len, len, pa);
}
/*
@@ -3416,7 +4354,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
/*
* search goal blocks in preallocated space
*/
-static noinline_for_stack int
+static noinline_for_stack bool
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
@@ -3428,7 +4366,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* only data can be preallocated */
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
- return 0;
+ return false;
/* first, try per-file preallocation */
rcu_read_lock();
@@ -3455,7 +4393,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
spin_unlock(&pa->pa_lock);
ac->ac_criteria = 10;
rcu_read_unlock();
- return 1;
+ return true;
}
spin_unlock(&pa->pa_lock);
}
@@ -3463,12 +4401,12 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* can we use group allocation? */
if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
- return 0;
+ return false;
/* inode may have no locality group for some reason */
lg = ac->ac_lg;
if (lg == NULL)
- return 0;
+ return false;
order = fls(ac->ac_o_ex.fe_len) - 1;
if (order > PREALLOC_TB_SIZE - 1)
/* The max size of hash table is PREALLOC_TB_SIZE */
@@ -3497,9 +4435,9 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
if (cpa) {
ext4_mb_use_group_pa(ac, cpa);
ac->ac_criteria = 20;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -3520,7 +4458,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
while (n) {
entry = rb_entry(n, struct ext4_free_data, efd_node);
- ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
+ mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
n = rb_next(n);
}
return;
@@ -3561,10 +4499,30 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
if (unlikely(len == 0))
continue;
BUG_ON(groupnr != group);
- ext4_set_bits(bitmap, start, len);
+ mb_set_bits(bitmap, start, len);
preallocated += len;
}
- mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
+ mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
+}
+
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
+ struct ext4_prealloc_space *pa)
+{
+ struct ext4_inode_info *ei;
+
+ if (pa->pa_deleted) {
+ ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
+ pa->pa_type, pa->pa_pstart, pa->pa_lstart,
+ pa->pa_len);
+ return;
+ }
+
+ pa->pa_deleted = 1;
+
+ if (pa->pa_type == MB_INODE_PA) {
+ ei = EXT4_I(pa->pa_inode);
+ atomic_dec(&ei->i_prealloc_active);
+ }
}
static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3599,7 +4557,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
return;
}
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
grp_blk = pa->pa_pstart;
@@ -3640,7 +4598,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
/*
* creates new preallocated space for given inode
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -3653,10 +4611,9 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+ BUG_ON(ac->ac_pa == NULL);
- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
- if (pa == NULL)
- return -ENOMEM;
+ pa = ac->ac_pa;
if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
int winl;
@@ -3700,15 +4657,14 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
- atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_INODE_PA;
- mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
+ pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_inode_pa(ac, pa);
ext4_mb_use_inode_pa(ac, pa);
@@ -3720,21 +4676,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_obj_lock = &ei->i_prealloc_lock;
pa->pa_inode = ac->ac_inode;
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
spin_lock(pa->pa_obj_lock);
list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
-
- return 0;
+ atomic_inc(&ei->i_prealloc_active);
}
/*
* creates new preallocated space for locality group inodes belongs to
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -3746,11 +4699,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+ BUG_ON(ac->ac_pa == NULL);
- BUG_ON(ext4_pspace_cachep == NULL);
- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
- if (pa == NULL)
- return -ENOMEM;
+ pa = ac->ac_pa;
/* preallocation can change ac_b_ex, thus we store actually
* allocated blocks for history */
@@ -3760,15 +4711,14 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_lstart = pa->pa_pstart;
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
- atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_GROUP_PA;
- mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
+ pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_group_pa(ac, pa);
ext4_mb_use_group_pa(ac, pa);
@@ -3781,26 +4731,20 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_obj_lock = &lg->lg_prealloc_lock;
pa->pa_inode = NULL;
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
/*
* We will later add the new pa to the right bucket
* after updating the pa_free in ext4_mb_release_context
*/
- return 0;
}
-static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
{
- int err;
-
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
- err = ext4_mb_new_group_pa(ac);
+ ext4_mb_new_group_pa(ac);
else
- err = ext4_mb_new_inode_pa(ac);
- return err;
+ ext4_mb_new_inode_pa(ac);
}
/*
@@ -3835,7 +4779,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- mb_debug(1, " free preallocated %u/%u in group %u\n",
+ mb_debug(sb, "free preallocated %u/%u in group %u\n",
(unsigned) ext4_group_first_block_no(sb, group) + bit,
(unsigned) next - bit, (unsigned) group);
free += next - bit;
@@ -3849,10 +4793,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
}
if (free != pa->pa_free) {
ext4_msg(e4b->bd_sb, KERN_CRIT,
- "pa %p: logic %lu, phys. %lu, len %lu",
+ "pa %p: logic %lu, phys. %lu, len %d",
pa, (unsigned long) pa->pa_lstart,
(unsigned long) pa->pa_pstart,
- (unsigned long) pa->pa_len);
+ pa->pa_len);
ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
free, pa->pa_free);
/*
@@ -3895,7 +4839,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
*/
static noinline_for_stack int
ext4_mb_discard_group_preallocations(struct super_block *sb,
- ext4_group_t group, int needed)
+ ext4_group_t group, int *busy)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct buffer_head *bitmap_bh = NULL;
@@ -3903,21 +4847,19 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
struct list_head list;
struct ext4_buddy e4b;
int err;
- int busy = 0;
int free = 0;
- mb_debug(1, "discard preallocation for group %u\n", group);
-
+ mb_debug(sb, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
- return 0;
+ goto out_dbg;
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
- ext4_set_errno(sb, -err);
- ext4_error(sb, "Error %d reading block bitmap for %u",
- err, group);
- return 0;
+ ext4_error_err(sb, -err,
+ "Error %d reading block bitmap for %u",
+ err, group);
+ goto out_dbg;
}
err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -3925,21 +4867,17 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
ext4_warning(sb, "Error %d loading buddy information for %u",
err, group);
put_bh(bitmap_bh);
- return 0;
+ goto out_dbg;
}
- if (needed == 0)
- needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
-
INIT_LIST_HEAD(&list);
-repeat:
ext4_lock_group(sb, group);
list_for_each_entry_safe(pa, tmp,
&grp->bb_prealloc_list, pa_group_list) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
spin_unlock(&pa->pa_lock);
- busy = 1;
+ *busy = 1;
continue;
}
if (pa->pa_deleted) {
@@ -3948,7 +4886,10 @@ repeat:
}
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
+
+ if (!free)
+ this_cpu_inc(discard_pa_seq);
/* we can trust pa_free ... */
free += pa->pa_free;
@@ -3959,20 +4900,6 @@ repeat:
list_add(&pa->u.pa_tmp_list, &list);
}
- /* if we still need more blocks and some PAs were used, try again */
- if (free < needed && busy) {
- busy = 0;
- ext4_unlock_group(sb, group);
- cond_resched();
- goto repeat;
- }
-
- /* found anything to free? */
- if (list_empty(&list)) {
- BUG_ON(free != 0);
- goto out;
- }
-
/* now free all selected PAs */
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
@@ -3990,10 +4917,12 @@ repeat:
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
-out:
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
+out_dbg:
+ mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
+ free, group, grp->bb_free);
return free;
}
@@ -4006,7 +4935,7 @@ out:
*
* FIXME!! Make sure it is valid at all the call sites
*/
-void ext4_discard_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
@@ -4022,16 +4951,24 @@ void ext4_discard_preallocations(struct inode *inode)
return;
}
- mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
- trace_ext4_discard_preallocations(inode);
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
+ mb_debug(sb, "discard preallocation for inode %lu\n",
+ inode->i_ino);
+ trace_ext4_discard_preallocations(inode,
+ atomic_read(&ei->i_prealloc_active), needed);
INIT_LIST_HEAD(&list);
+ if (needed == 0)
+ needed = UINT_MAX;
+
repeat:
/* first, collect all pa's in the inode */
spin_lock(&ei->i_prealloc_lock);
- while (!list_empty(&ei->i_prealloc_list)) {
- pa = list_entry(ei->i_prealloc_list.next,
+ while (!list_empty(&ei->i_prealloc_list) && needed) {
+ pa = list_entry(ei->i_prealloc_list.prev,
struct ext4_prealloc_space, pa_inode_list);
BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
spin_lock(&pa->pa_lock);
@@ -4048,10 +4985,11 @@ repeat:
}
if (pa->pa_deleted == 0) {
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
list_add(&pa->u.pa_tmp_list, &list);
+ needed--;
continue;
}
@@ -4083,18 +5021,16 @@ repeat:
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
- ext4_set_errno(sb, -err);
- ext4_error(sb, "Error %d loading buddy information for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
+ err, group);
continue;
}
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
- ext4_set_errno(sb, -err);
- ext4_error(sb, "Error %d reading block bitmap for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
+ err, group);
ext4_mb_unload_buddy(&e4b);
continue;
}
@@ -4112,22 +5048,74 @@ repeat:
}
}
+static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa;
+
+ BUG_ON(ext4_pspace_cachep == NULL);
+ pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
+ if (!pa)
+ return -ENOMEM;
+ atomic_set(&pa->pa_count, 1);
+ ac->ac_pa = pa;
+ return 0;
+}
+
+static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+
+ BUG_ON(!pa);
+ ac->ac_pa = NULL;
+ WARN_ON(!atomic_dec_and_test(&pa->pa_count));
+ kmem_cache_free(ext4_pspace_cachep, pa);
+}
+
#ifdef CONFIG_EXT4_DEBUG
+static inline void ext4_mb_show_pa(struct super_block *sb)
+{
+ ext4_group_t i, ngroups;
+
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ return;
+
+ ngroups = ext4_get_groups_count(sb);
+ mb_debug(sb, "groups: ");
+ for (i = 0; i < ngroups; i++) {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+ struct ext4_prealloc_space *pa;
+ ext4_grpblk_t start;
+ struct list_head *cur;
+ ext4_lock_group(sb, i);
+ list_for_each(cur, &grp->bb_prealloc_list) {
+ pa = list_entry(cur, struct ext4_prealloc_space,
+ pa_group_list);
+ spin_lock(&pa->pa_lock);
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+ NULL, &start);
+ spin_unlock(&pa->pa_lock);
+ mb_debug(sb, "PA:%u:%d:%d\n", i, start,
+ pa->pa_len);
+ }
+ ext4_unlock_group(sb, i);
+ mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
+ grp->bb_fragments);
+ }
+}
+
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- ext4_group_t ngroups, i;
- if (!ext4_mballoc_debug ||
- (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
return;
- ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
+ mb_debug(sb, "Can't allocate:"
" Allocation context details:");
- ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
+ mb_debug(sb, "status %u flags 0x%x",
ac->ac_status, ac->ac_flags);
- ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
- "goal %lu/%lu/%lu@%lu, "
+ mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
+ "goal %lu/%lu/%lu@%lu, "
"best %lu/%lu/%lu@%lu cr %d",
(unsigned long)ac->ac_o_ex.fe_group,
(unsigned long)ac->ac_o_ex.fe_start,
@@ -4142,37 +5130,17 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_len,
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
- ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
- ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
- ngroups = ext4_get_groups_count(sb);
- for (i = 0; i < ngroups; i++) {
- struct ext4_group_info *grp = ext4_get_group_info(sb, i);
- struct ext4_prealloc_space *pa;
- ext4_grpblk_t start;
- struct list_head *cur;
- ext4_lock_group(sb, i);
- list_for_each(cur, &grp->bb_prealloc_list) {
- pa = list_entry(cur, struct ext4_prealloc_space,
- pa_group_list);
- spin_lock(&pa->pa_lock);
- ext4_get_group_no_and_offset(sb, pa->pa_pstart,
- NULL, &start);
- spin_unlock(&pa->pa_lock);
- printk(KERN_ERR "PA:%u:%d:%u \n", i,
- start, pa->pa_len);
- }
- ext4_unlock_group(sb, i);
-
- if (grp->bb_free == 0)
- continue;
- printk(KERN_ERR "%u: %d/%d \n",
- i, grp->bb_free, grp->bb_fragments);
- }
- printk(KERN_ERR "\n");
+ mb_debug(sb, "%u found", ac->ac_found);
+ ext4_mb_show_pa(sb);
}
#else
+static inline void ext4_mb_show_pa(struct super_block *sb)
+{
+ return;
+}
static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
+ ext4_mb_show_pa(ac->ac_sb);
return;
}
#endif
@@ -4189,6 +5157,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits = ac->ac_sb->s_blocksize_bits;
loff_t size, isize;
+ bool inode_pa_eligible, group_pa_eligible;
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;
@@ -4196,25 +5165,27 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
+ group_pa_eligible = sbi->s_mb_group_prealloc > 0;
+ inode_pa_eligible = true;
size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
+ /* No point in using inode preallocation for closed files */
if ((size == isize) && !ext4_fs_is_busy(sbi) &&
- !inode_is_open_for_write(ac->ac_inode)) {
- ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
- return;
- }
+ !inode_is_open_for_write(ac->ac_inode))
+ inode_pa_eligible = false;
- if (sbi->s_mb_group_prealloc <= 0) {
- ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
- return;
- }
-
- /* don't use group allocation for large files */
size = max(size, isize);
- if (size > sbi->s_mb_stream_request) {
- ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ /* Don't use group allocation for large files */
+ if (size > sbi->s_mb_stream_request)
+ group_pa_eligible = false;
+
+ if (!group_pa_eligible) {
+ if (inode_pa_eligible)
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ else
+ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
}
@@ -4271,11 +5242,11 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
- /* we have to define context: we'll we work with a file or
+ /* we have to define context: we'll work with a file or
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
- mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+ mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
"left: %u/%u, right %u/%u to %swritable\n",
(unsigned) ar->len, (unsigned) ar->logical,
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4296,13 +5267,14 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
struct list_head discard_list;
struct ext4_prealloc_space *pa, *tmp;
- mb_debug(1, "discard locality group preallocation\n");
+ mb_debug(sb, "discard locality group preallocation\n");
INIT_LIST_HEAD(&discard_list);
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
- pa_inode_list) {
+ pa_inode_list,
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
/*
@@ -4321,7 +5293,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
BUG_ON(pa->pa_type != MB_GROUP_PA);
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
@@ -4347,9 +5319,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
- ext4_set_errno(sb, -err);
- ext4_error(sb, "Error %d loading buddy information for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
+ err, group);
continue;
}
ext4_lock_group(sb, group);
@@ -4386,7 +5357,8 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/* Add the prealloc space to lg */
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
- pa_inode_list) {
+ pa_inode_list,
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
spin_unlock(&tmp_pa->pa_lock);
@@ -4420,10 +5392,29 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
}
/*
+ * if per-inode prealloc list is too long, trim some PA
+ */
+static void ext4_mb_trim_inode_pa(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int count, delta;
+
+ count = atomic_read(&ei->i_prealloc_active);
+ delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
+ if (count > sbi->s_mb_max_inode_prealloc + delta) {
+ count -= sbi->s_mb_max_inode_prealloc;
+ ext4_discard_preallocations(inode, count);
+ }
+}
+
+/*
* release all resource we used in allocation
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
+ struct inode *inode = ac->ac_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
@@ -4435,21 +5426,31 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
+
+ /*
+ * We want to add the pa to the right bucket.
+ * Remove it from the list and while adding
+ * make sure the list to which we are adding
+ * doesn't grow big.
+ */
+ if (likely(pa->pa_free)) {
+ spin_lock(pa->pa_obj_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ spin_unlock(pa->pa_obj_lock);
+ ext4_mb_add_n_trim(ac);
+ }
}
- }
- if (pa) {
- /*
- * We want to add the pa to the right bucket.
- * Remove it from the list and while adding
- * make sure the list to which we are adding
- * doesn't grow big.
- */
- if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
+
+ if (pa->pa_type == MB_INODE_PA) {
+ /*
+ * treat per-inode prealloc list as a lru list, then try
+ * to trim the least recently used PA.
+ */
spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
+ list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
- ext4_mb_add_n_trim(ac);
}
+
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
@@ -4459,6 +5460,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
+ ext4_mb_trim_inode_pa(inode);
return 0;
}
@@ -4466,18 +5468,56 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
{
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int ret;
- int freed = 0;
+ int freed = 0, busy = 0;
+ int retry = 0;
trace_ext4_mb_discard_preallocations(sb, needed);
+
+ if (needed == 0)
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
+ repeat:
for (i = 0; i < ngroups && needed > 0; i++) {
- ret = ext4_mb_discard_group_preallocations(sb, i, needed);
+ ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
freed += ret;
needed -= ret;
+ cond_resched();
+ }
+
+ if (needed > 0 && busy && ++retry < 3) {
+ busy = 0;
+ goto repeat;
}
return freed;
}
+static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
+ struct ext4_allocation_context *ac, u64 *seq)
+{
+ int freed;
+ u64 seq_retry = 0;
+ bool ret = false;
+
+ freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
+ if (freed) {
+ ret = true;
+ goto out_dbg;
+ }
+ seq_retry = ext4_get_discard_pa_seq_sum();
+ if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
+ ac->ac_flags |= EXT4_MB_STRICT_CHECK;
+ *seq = seq_retry;
+ ret = true;
+ }
+
+out_dbg:
+ mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
+ return ret;
+}
+
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
+ struct ext4_allocation_request *ar, int *errp);
+
/*
* Main entry point into mballoc to allocate blocks
* it tries to use preallocation first, then falls back
@@ -4486,19 +5526,22 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
struct ext4_allocation_request *ar, int *errp)
{
- int freed;
struct ext4_allocation_context *ac = NULL;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
unsigned int reserv_clstrs = 0;
+ int retries = 0;
+ u64 seq;
might_sleep();
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
trace_ext4_request_blocks(ar);
+ if (sbi->s_mount_state & EXT4_FC_REPLAY)
+ return ext4_mb_new_blocks_simple(handle, ar, errp);
/* Allow to use superuser reservation for quota file */
if (ext4_is_quota_file(ar->inode))
@@ -4517,6 +5560,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
ar->len = ar->len >> 1;
}
if (!ar->len) {
+ ext4_mb_show_pa(sb);
*errp = -ENOSPC;
return 0;
}
@@ -4554,26 +5598,32 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
+ seq = this_cpu_read(discard_pa_seq);
if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
+
+ *errp = ext4_mb_pa_alloc(ac);
+ if (*errp)
+ goto errout;
repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
- if (*errp)
- goto discard_and_exit;
-
- /* as we've just preallocated more space than
- * user requested originally, we store allocated
- * space in a special descriptor */
- if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- *errp = ext4_mb_new_preallocation(ac);
+ /*
+ * pa allocated above is added to grp->bb_prealloc_list only
+ * when we were able to allocate some block i.e. when
+ * ac->ac_status == AC_STATUS_FOUND.
+ * And error from above mean ac->ac_status != AC_STATUS_FOUND
+ * So we have to free this pa here itself.
+ */
if (*errp) {
- discard_and_exit:
+ ext4_mb_pa_free(ac);
ext4_discard_allocated_blocks(ac);
goto errout;
}
+ if (ac->ac_status == AC_STATUS_FOUND &&
+ ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
+ ext4_mb_pa_free(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4585,9 +5635,14 @@ repeat:
ar->len = ac->ac_b_ex.fe_len;
}
} else {
- freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
- if (freed)
+ if (++retries < 3 &&
+ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
goto repeat;
+ /*
+ * If block allocation fails then the pa allocated above
+ * needs to be freed here itself.
+ */
+ ext4_mb_pa_free(ac);
*errp = -ENOSPC;
}
@@ -4686,6 +5741,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
ext4_group_first_block_no(sb, group) +
EXT4_C2B(sbi, cluster),
"Block already on to-be-freed list");
+ kmem_cache_free(ext4_free_data_cachep, new_entry);
return 0;
}
}
@@ -4715,18 +5771,122 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
return 0;
}
+/*
+ * Simple allocator for Ext4 fast commit replay path. It searches for blocks
+ * linearly starting at the goal block and also excludes the blocks which
+ * are going to be in use after fast commit replay.
+ */
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
+ struct ext4_allocation_request *ar, int *errp)
+{
+ struct buffer_head *bitmap_bh;
+ struct super_block *sb = ar->inode->i_sb;
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
+ ext4_fsblk_t goal, block;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ goal = ar->goal;
+ if (goal < le32_to_cpu(es->s_first_data_block) ||
+ goal >= ext4_blocks_count(es))
+ goal = le32_to_cpu(es->s_first_data_block);
+
+ ar->len = 0;
+ ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
+ for (; group < ext4_get_groups_count(sb); group++) {
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ *errp = PTR_ERR(bitmap_bh);
+ pr_warn("Failed to read block bitmap\n");
+ return 0;
+ }
+
+ ext4_get_group_no_and_offset(sb,
+ max(ext4_group_first_block_no(sb, group), goal),
+ NULL, &blkoff);
+ while (1) {
+ i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
+ blkoff);
+ if (i >= max)
+ break;
+ if (ext4_fc_replay_check_excluded(sb,
+ ext4_group_first_block_no(sb, group) + i)) {
+ blkoff = i + 1;
+ } else
+ break;
+ }
+ brelse(bitmap_bh);
+ if (i < max)
+ break;
+ }
+
+ if (group >= ext4_get_groups_count(sb) || i >= max) {
+ *errp = -ENOSPC;
+ return 0;
+ }
+
+ block = ext4_group_first_block_no(sb, group) + i;
+ ext4_mb_mark_bb(sb, block, 1, 1);
+ ar->len = 1;
+
+ return block;
+}
+
+static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
+ unsigned long count)
+{
+ struct buffer_head *bitmap_bh;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ int already_freed = 0, err, i;
+
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ pr_warn("Failed to read block bitmap\n");
+ return;
+ }
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ return;
+
+ for (i = 0; i < count; i++) {
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
+ already_freed++;
+ }
+ mb_clear_bits(bitmap_bh->b_data, blkoff, count);
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ return;
+ ext4_free_group_clusters_set(
+ sb, gdp, ext4_free_group_clusters(sb, gdp) +
+ count - already_freed);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(bitmap_bh);
+ sync_dirty_buffer(gdp_bh);
+ brelse(bitmap_bh);
+}
+
/**
- * ext4_free_blocks() -- Free given blocks and update quota
+ * ext4_mb_clear_bb() -- helper function for freeing blocks.
+ * Used by ext4_free_blocks()
* @handle: handle for this transaction
* @inode: inode
- * @bh: optional buffer of the block to be freed
* @block: starting physical block to be freed
* @count: number of blocks to be freed
* @flags: flags used by ext4_free_blocks
*/
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t block,
- unsigned long count, int flags)
+static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t block, unsigned long count,
+ int flags)
{
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
@@ -4741,75 +5901,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
int err = 0;
int ret;
- might_sleep();
- if (bh) {
- if (block)
- BUG_ON(block != bh->b_blocknr);
- else
- block = bh->b_blocknr;
- }
-
sbi = EXT4_SB(sb);
+
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
- !ext4_data_block_valid(sbi, block, count)) {
- ext4_error(sb, "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
+ !ext4_inode_block_valid(inode, block, count)) {
+ ext4_error(sb, "Freeing blocks in system zone - "
+ "Block = %llu, count = %lu", block, count);
+ /* err = 0. ext4_std_error should be a no op */
goto error_return;
}
-
- ext4_debug("freeing block %llu\n", block);
- trace_ext4_free_blocks(inode, block, count, flags);
-
- if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- BUG_ON(count > 1);
-
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, bh, block);
- }
-
- /*
- * If the extent to be freed does not begin on a cluster
- * boundary, we need to deal with partial clusters at the
- * beginning and end of the extent. Normally we will free
- * blocks at the beginning or the end unless we are explicitly
- * requested to avoid doing so.
- */
- overflow = EXT4_PBLK_COFF(sbi, block);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
- overflow = sbi->s_cluster_ratio - overflow;
- block += overflow;
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else {
- block -= overflow;
- count += overflow;
- }
- }
- overflow = EXT4_LBLK_COFF(sbi, count);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else
- count += sbi->s_cluster_ratio - overflow;
- }
-
- if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- int i;
- int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
-
- for (i = 0; i < count; i++) {
- cond_resched();
- if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
- ext4_forget(handle, is_metadata, inode, bh, block + i);
- }
- }
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
do_more:
overflow = 0;
@@ -4827,6 +5928,8 @@ do_more:
overflow = EXT4_C2B(sbi, bit) + count -
EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
@@ -4841,13 +5944,8 @@ do_more:
goto error_return;
}
- if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
- in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
- in_range(block, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group)) {
-
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
@@ -4855,7 +5953,8 @@ do_more:
}
BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
if (err)
goto error_return;
@@ -4865,7 +5964,7 @@ do_more:
* using it
*/
BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
+ err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
if (err)
goto error_return;
#ifdef AGGRESSIVE_CHECK
@@ -4917,7 +6016,7 @@ do_more:
NULL);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
- " group:%d block:%d count:%lu failed"
+ " group:%u block:%d count:%lu failed"
" with %d", block_group, bit, count,
err);
} else
@@ -4969,6 +6068,8 @@ do_more:
block += count;
count = overflow;
put_bh(bitmap_bh);
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
goto do_more;
}
error_return:
@@ -4978,6 +6079,108 @@ error_return:
}
/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @bh: optional buffer of the block to be freed
+ * @block: starting physical block to be freed
+ * @count: number of blocks to be freed
+ * @flags: flags used by ext4_free_blocks
+ */
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned int overflow;
+ struct ext4_sb_info *sbi;
+
+ sbi = EXT4_SB(sb);
+
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ ext4_free_blocks_simple(inode, block, count);
+ return;
+ }
+
+ might_sleep();
+ if (bh) {
+ if (block)
+ BUG_ON(block != bh->b_blocknr);
+ else
+ block = bh->b_blocknr;
+ }
+
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
+ ext4_error(sb, "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
+ return;
+ }
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
+
+ ext4_debug("freeing block %llu\n", block);
+ trace_ext4_free_blocks(inode, block, count, flags);
+
+ if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ BUG_ON(count > 1);
+
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block);
+ }
+
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
+ }
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
+ }
+
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ int i;
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
+
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ if (is_metadata)
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
+ }
+ }
+
+ ext4_mb_clear_bb(handle, inode, block, count, flags);
+ return;
+}
+
+/**
* ext4_group_add_blocks() -- Add given blocks to an existing group
* @handle: handle to this transaction
* @sb: super block
@@ -5033,11 +6236,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
goto error_return;
}
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
+ if (!ext4_sb_block_valid(sb, NULL, block, count)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
@@ -5046,7 +6245,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
}
BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
if (err)
goto error_return;
@@ -5056,7 +6256,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
* using it
*/
BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
+ err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
if (err)
goto error_return;
@@ -5122,19 +6322,19 @@ error_return:
* @sb: super block for the file system
* @start: starting block of the free extent in the alloc. group
* @count: number of blocks to TRIM
- * @group: alloc. group we are working with
* @e4b: ext4 buddy for the group
*
* Trim "count" blocks starting at "start" in the "group". To assure that no
* one will allocate those blocks, mark it as used in buddy bitmap. This must
* be called with under the group lock.
*/
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+static int ext4_trim_extent(struct super_block *sb,
+ int start, int count, struct ext4_buddy *e4b)
__releases(bitlock)
__acquires(bitlock)
{
struct ext4_free_extent ex;
+ ext4_group_t group = e4b->bd_group;
int ret = 0;
trace_ext4_trim_extent(sb, group, start, count);
@@ -5157,51 +6357,20 @@ __acquires(bitlock)
return ret;
}
-/**
- * ext4_trim_all_free -- function to trim all free space in alloc. group
- * @sb: super block for file system
- * @group: group to be trimmed
- * @start: first group block to examine
- * @max: last group block to examine
- * @minblocks: minimum extent block count
- *
- * ext4_trim_all_free walks through group's buddy bitmap searching for free
- * extents. When the free block is found, ext4_trim_extent is called to TRIM
- * the extent.
- *
- *
- * ext4_trim_all_free walks through group's block bitmap searching for free
- * extents. When the free extent is found, mark it as used in group buddy
- * bitmap. Then issue a TRIM command on this extent and free the extent in
- * the group buddy bitmap. This is done until whole group is scanned.
- */
-static ext4_grpblk_t
-ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
- ext4_grpblk_t start, ext4_grpblk_t max,
- ext4_grpblk_t minblocks)
+static int ext4_try_to_trim_range(struct super_block *sb,
+ struct ext4_buddy *e4b, ext4_grpblk_t start,
+ ext4_grpblk_t max, ext4_grpblk_t minblocks)
+__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
+__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
+ ext4_grpblk_t next, count, free_count;
void *bitmap;
- ext4_grpblk_t next, count = 0, free_count = 0;
- struct ext4_buddy e4b;
- int ret = 0;
- trace_ext4_trim_all_free(sb, group, start, max);
-
- ret = ext4_mb_load_buddy(sb, group, &e4b);
- if (ret) {
- ext4_warning(sb, "Error %d loading buddy information for %u",
- ret, group);
- return ret;
- }
- bitmap = e4b.bd_bitmap;
-
- ext4_lock_group(sb, group);
- if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
- minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
- goto out;
-
- start = (e4b.bd_info->bb_first_free > start) ?
- e4b.bd_info->bb_first_free : start;
+ bitmap = e4b->bd_bitmap;
+ start = (e4b->bd_info->bb_first_free > start) ?
+ e4b->bd_info->bb_first_free : start;
+ count = 0;
+ free_count = 0;
while (start <= max) {
start = mb_find_next_zero_bit(bitmap, max + 1, start);
@@ -5210,11 +6379,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
next = mb_find_next_bit(bitmap, max + 1, start);
if ((next - start) >= minblocks) {
- ret = ext4_trim_extent(sb, start,
- next - start, group, &e4b);
+ int ret = ext4_trim_extent(sb, start, next - start, e4b);
+
if (ret && ret != -EOPNOTSUPP)
break;
- ret = 0;
count += next - start;
}
free_count += next - start;
@@ -5226,25 +6394,65 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
}
if (need_resched()) {
- ext4_unlock_group(sb, group);
+ ext4_unlock_group(sb, e4b->bd_group);
cond_resched();
- ext4_lock_group(sb, group);
+ ext4_lock_group(sb, e4b->bd_group);
}
- if ((e4b.bd_info->bb_free - free_count) < minblocks)
+ if ((e4b->bd_info->bb_free - free_count) < minblocks)
break;
}
- if (!ret) {
- ret = count;
- EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+ return count;
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb: super block for file system
+ * @group: group to be trimmed
+ * @start: first group block to examine
+ * @max: last group block to examine
+ * @minblocks: minimum extent block count
+ * @set_trimmed: set the trimmed flag if at least one block is trimmed
+ *
+ * ext4_trim_all_free walks through group's block bitmap searching for free
+ * extents. When the free extent is found, mark it as used in group buddy
+ * bitmap. Then issue a TRIM command on this extent and free the extent in
+ * the group buddy bitmap.
+ */
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+ ext4_grpblk_t start, ext4_grpblk_t max,
+ ext4_grpblk_t minblocks, bool set_trimmed)
+{
+ struct ext4_buddy e4b;
+ int ret;
+
+ trace_ext4_trim_all_free(sb, group, start, max);
+
+ ret = ext4_mb_load_buddy(sb, group, &e4b);
+ if (ret) {
+ ext4_warning(sb, "Error %d loading buddy information for %u",
+ ret, group);
+ return ret;
+ }
+
+ ext4_lock_group(sb, group);
+
+ if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
+ minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
+ ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
+ if (ret >= 0 && set_trimmed)
+ EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+ } else {
+ ret = 0;
}
-out:
+
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
ext4_debug("trimmed %d blocks in the group %d\n",
- count, group);
+ ret, group);
return ret;
}
@@ -5263,6 +6471,7 @@ out:
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
+ unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
struct ext4_group_info *grp;
ext4_group_t group, first_group, last_group;
ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
@@ -5270,6 +6479,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
+ bool whole_group, eof = false;
int ret = 0;
start = range->start >> sb->s_blocksize_bits;
@@ -5281,8 +6491,17 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
start >= max_blks ||
range->len < sb->s_blocksize)
return -EINVAL;
- if (end >= max_blks)
+ /* No point to try to trim less than discard granularity */
+ if (range->minlen < discard_granularity) {
+ minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+ discard_granularity >> sb->s_blocksize_bits);
+ if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
+ goto out;
+ }
+ if (end >= max_blks - 1) {
end = max_blks - 1;
+ eof = true;
+ }
if (end <= first_data_blk)
goto out;
if (start < first_data_blk)
@@ -5296,6 +6515,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
/* end now represents the last cluster to discard in this group */
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ whole_group = true;
for (group = first_group; group <= last_group; group++) {
grp = ext4_get_group_info(sb, group);
@@ -5312,12 +6532,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
* change it for the last group, note that last_cluster is
* already computed earlier by ext4_get_group_no_and_offset()
*/
- if (group == last_group)
+ if (group == last_group) {
end = last_cluster;
-
+ whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ }
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen);
+ end, minlen, whole_group);
if (cnt < 0) {
ret = cnt;
break;
@@ -5333,7 +6554,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
if (!ret)
- atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+ EXT4_SB(sb)->s_last_trim_minblks = minlen;
out:
range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 88c98f17e3d9..dcda2a943cee 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -24,19 +24,15 @@
#include "ext4.h"
/*
+ * mb_debug() dynamic printk msgs could be used to debug mballoc code.
*/
#ifdef CONFIG_EXT4_DEBUG
-extern ushort ext4_mballoc_debug;
-
-#define mb_debug(n, fmt, ...) \
-do { \
- if ((n) <= ext4_mballoc_debug) { \
- printk(KERN_DEBUG "(%s, %d): %s: " fmt, \
- __FILE__, __LINE__, __func__, ##__VA_ARGS__); \
- } \
-} while (0)
+#define mb_debug(sb, fmt, ...) \
+ pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt, \
+ current->comm, task_pid_nr(current), sb->s_id, \
+ __FILE__, __LINE__, __func__, ##__VA_ARGS__)
#else
-#define mb_debug(n, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#define mb_debug(sb, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
@@ -63,7 +59,7 @@ do { \
* by the stream allocator, which purpose is to pack requests
* as close each to other as possible to produce smooth I/O traffic
* We use locality group prealloc space for stream request.
- * We can tune the same via /proc/fs/ext4/<parition>/stream_req
+ * We can tune the same via /proc/fs/ext4/<partition>/stream_req
*/
#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */
@@ -77,6 +73,27 @@ do { \
*/
#define MB_DEFAULT_GROUP_PREALLOC 512
+/*
+ * maximum length of inode prealloc list
+ */
+#define MB_DEFAULT_MAX_INODE_PREALLOC 512
+
+/*
+ * Number of groups to search linearly before performing group scanning
+ * optimization.
+ */
+#define MB_DEFAULT_LINEAR_LIMIT 4
+
+/*
+ * Minimum number of groups that should be present in the file system to perform
+ * group scanning optimizations.
+ */
+#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16
+
+/*
+ * Number of valid buddy orders
+ */
+#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2)
struct ext4_free_data {
/* this links the free block information from sb_info */
@@ -161,11 +178,13 @@ struct ext4_allocation_context {
/* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
+ __u32 ac_groups_considered;
+ __u32 ac_flags; /* allocation hints */
__u16 ac_groups_scanned;
+ __u16 ac_groups_linear_remaining;
__u16 ac_found;
__u16 ac_tail;
__u16 ac_buddy;
- __u16 ac_flags; /* allocation hints */
__u8 ac_status;
__u8 ac_criteria;
__u8 ac_2order; /* if request is to allocate 2^N blocks and
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index fb6520f37135..a19a9661646e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -32,7 +32,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
newext.ee_block = cpu_to_le32(lb->first_block);
newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
ext4_ext_store_pblock(&newext, lb->first_pblock);
- /* Locking only for convinience since we are operating on temp inode */
+ /* Locking only for convenience since we are operating on temp inode */
down_write(&EXT4_I(inode)->i_data_sem);
path = ext4_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
@@ -43,8 +43,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
/*
* Calculate the credit needed to inserting this extent
- * Since we are doing this in loop we may accumalate extra
- * credit. But below we try to not accumalate too much
+ * Since we are doing this in loop we may accumulate extra
+ * credit. But below we try to not accumulate too much
* of them by restarting the journal.
*/
needed = ext4_ext_calc_credits_for_single_extent(inode,
@@ -56,8 +56,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
lb->first_pblock = 0;
return retval;
}
@@ -287,7 +286,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
struct inode *tmp_inode)
{
- int retval;
+ int retval, retval2 = 0;
__le32 i_data[3];
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
@@ -342,7 +341,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* i_blocks when freeing the indirect meta-data blocks
*/
retval = free_ind_block(handle, inode, i_data);
- ext4_mark_inode_dirty(handle, inode);
+ retval2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(retval2 && !retval))
+ retval = retval2;
err_out:
return retval;
@@ -415,7 +416,7 @@ int ext4_ext_migrate(struct inode *inode)
struct inode *tmp_inode = NULL;
struct migrate_struct lb;
unsigned long max_entries;
- __u32 goal;
+ __u32 goal, tmp_csum_seed;
uid_t owner[2];
/*
@@ -423,7 +424,8 @@ int ext4_ext_migrate(struct inode *inode)
* already is extent-based, error out.
*/
if (!ext4_has_feature_extents(inode->i_sb) ||
- (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ ext4_has_inline_data(inode))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
@@ -435,12 +437,12 @@ int ext4_ext_migrate(struct inode *inode)
percpu_down_write(&sbi->s_writepages_rwsem);
/*
- * Worst case we can touch the allocation bitmaps, a bgd
- * block, and a block to link in the orphan list. We do need
- * need to worry about credits for modifying the quota inode.
+ * Worst case we can touch the allocation bitmaps and a block
+ * group descriptor block. We do need to worry about
+ * credits for modifying the quota inode.
*/
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
- 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+ 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
@@ -457,6 +459,14 @@ int ext4_ext_migrate(struct inode *inode)
ext4_journal_stop(handle);
goto out_unlock;
}
+ /*
+ * Use the correct seed for checksum (i.e. the seed from 'inode'). This
+ * is so that the metadata blocks will have the correct checksum after
+ * the migration.
+ */
+ ei = EXT4_I(inode);
+ tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed;
+ EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed;
i_size_write(tmp_inode, i_size_read(inode));
/*
* Set the i_nlink to zero so it will be deleted later
@@ -465,7 +475,6 @@ int ext4_ext_migrate(struct inode *inode)
clear_nlink(tmp_inode);
ext4_ext_tree_init(handle, tmp_inode);
- ext4_orphan_add(handle, tmp_inode);
ext4_journal_stop(handle);
/*
@@ -477,7 +486,7 @@ int ext4_ext_migrate(struct inode *inode)
* when we add extents we extent the journal
*/
/*
- * Even though we take i_mutex we can still cause block
+ * Even though we take i_rwsem we can still cause block
* allocation via mmap write to holes. If we have allocated
* new blocks we fail migrate. New block allocation will
* clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
@@ -490,17 +499,10 @@ int ext4_ext_migrate(struct inode *inode)
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle)) {
- /*
- * It is impossible to update on-disk structures without
- * a handle, so just rollback in-core changes and live other
- * work to orphan_list_cleanup()
- */
- ext4_orphan_del(NULL, tmp_inode);
retval = PTR_ERR(handle);
goto out_tmp_inode;
}
- ei = EXT4_I(inode);
i_data = ei->i_data;
memset(&lb, 0, sizeof(lb));
@@ -574,6 +576,7 @@ err_out:
* the inode is not visible to user space.
*/
tmp_inode->i_blocks = 0;
+ EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed;
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
@@ -601,7 +604,7 @@ int ext4_ind_migrate(struct inode *inode)
ext4_lblk_t start, end;
ext4_fsblk_t blk;
handle_t *handle;
- int ret;
+ int ret, ret2 = 0;
if (!ext4_has_feature_extents(inode->i_sb) ||
(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
@@ -655,7 +658,9 @@ int ext4_ind_migrate(struct inode *inode)
memset(ei->i_data, 0, sizeof(ei->i_data));
for (i = start; i <= end; i++)
ei->i_data[i] = cpu_to_le32(blk++);
- ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret2 && !ret))
+ ret = ret2;
errout:
ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 87f7551c5132..588cb09c5291 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -52,11 +52,11 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
- return 1;
+ return -EIO;
return 0;
}
@@ -85,15 +85,11 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
}
}
- get_bh(*bh);
lock_buffer(*bh);
- (*bh)->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh);
- wait_on_buffer(*bh);
- if (!buffer_uptodate(*bh)) {
- ret = -EIO;
+ ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL);
+ if (ret)
goto warn_exit;
- }
+
mmp = (struct mmp_struct *)((*bh)->b_data);
if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
ret = -EFSCORRUPTED;
@@ -131,9 +127,9 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
*/
static int kmmpd(void *data)
{
- struct super_block *sb = ((struct mmpd_data *) data)->sb;
- struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+ struct super_block *sb = data;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh;
struct mmp_struct *mmp;
ext4_fsblk_t mmp_block;
u32 seq = 0;
@@ -142,7 +138,7 @@ static int kmmpd(void *data)
unsigned mmp_check_interval;
unsigned long last_update_time;
unsigned long diff;
- int retval;
+ int retval = 0;
mmp_block = le64_to_cpu(es->s_mmp_block);
mmp = (struct mmp_struct *)(bh->b_data);
@@ -154,13 +150,16 @@ static int kmmpd(void *data)
mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
- BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
- bdevname(bh->b_bdev, mmp->mmp_bdevname);
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
- while (!kthread_should_stop()) {
+ while (!kthread_should_stop() && !sb_rdonly(sb)) {
+ if (!ext4_has_feature_mmp(sb)) {
+ ext4_warning(sb, "kmmpd being stopped since MMP feature"
+ " has been disabled.");
+ goto wait_to_exit;
+ }
if (++seq > EXT4_MMP_SEQ_MAX)
seq = 1;
@@ -175,22 +174,12 @@ static int kmmpd(void *data)
*/
if (retval) {
if ((failed_writes % 60) == 0) {
- ext4_set_errno(sb, -retval);
- ext4_error(sb, "Error writing to MMP block");
+ ext4_error_err(sb, -retval,
+ "Error writing to MMP block");
}
failed_writes++;
}
- if (!(le32_to_cpu(es->s_feature_incompat) &
- EXT4_FEATURE_INCOMPAT_MMP)) {
- ext4_warning(sb, "kmmpd being stopped since MMP feature"
- " has been disabled.");
- goto exit_thread;
- }
-
- if (sb_rdonly(sb))
- break;
-
diff = jiffies - last_update_time;
if (diff < mmp_update_interval * HZ)
schedule_timeout_interruptible(mmp_update_interval *
@@ -208,10 +197,10 @@ static int kmmpd(void *data)
retval = read_mmp_block(sb, &bh_check, mmp_block);
if (retval) {
- ext4_set_errno(sb, -retval);
- ext4_error(sb, "error reading MMP data: %d",
- retval);
- goto exit_thread;
+ ext4_error_err(sb, -retval,
+ "error reading MMP data: %d",
+ retval);
+ goto wait_to_exit;
}
mmp_check = (struct mmp_struct *)(bh_check->b_data);
@@ -222,11 +211,10 @@ static int kmmpd(void *data)
"Error while updating MMP info. "
"The filesystem seems to have been"
" multiply mounted.");
- ext4_set_errno(sb, EBUSY);
- ext4_error(sb, "abort");
+ ext4_error_err(sb, EBUSY, "abort");
put_bh(bh_check);
retval = -EBUSY;
- goto exit_thread;
+ goto wait_to_exit;
}
put_bh(bh_check);
}
@@ -249,13 +237,25 @@ static int kmmpd(void *data)
retval = write_mmp_block(sb, bh);
-exit_thread:
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- kfree(data);
- brelse(bh);
+wait_to_exit:
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!kthread_should_stop())
+ schedule();
+ }
+ set_current_state(TASK_RUNNING);
return retval;
}
+void ext4_stop_mmpd(struct ext4_sb_info *sbi)
+{
+ if (sbi->s_mmp_tsk) {
+ kthread_stop(sbi->s_mmp_tsk);
+ brelse(sbi->s_mmp_bh);
+ sbi->s_mmp_tsk = NULL;
+ }
+}
+
/*
* Get a random new sequence number but make sure it is not greater than
* EXT4_MMP_SEQ_MAX.
@@ -265,7 +265,7 @@ static unsigned int mmp_new_seq(void)
u32 new_seq;
do {
- new_seq = prandom_u32();
+ new_seq = get_random_u32();
} while (new_seq > EXT4_MMP_SEQ_MAX);
return new_seq;
@@ -280,7 +280,6 @@ int ext4_multi_mount_protect(struct super_block *sb,
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct buffer_head *bh = NULL;
struct mmp_struct *mmp = NULL;
- struct mmpd_data *mmpd_data;
u32 seq;
unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
unsigned int wait_time = 0;
@@ -369,24 +368,20 @@ skip:
goto failed;
}
- mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL);
- if (!mmpd_data) {
- ext4_warning(sb, "not enough memory for mmpd_data");
- goto failed;
- }
- mmpd_data->sb = sb;
- mmpd_data->bh = bh;
+ EXT4_SB(sb)->s_mmp_bh = bh;
+
+ BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
+ snprintf(mmp->mmp_bdevname, sizeof(mmp->mmp_bdevname),
+ "%pg", bh->b_bdev);
/*
* Start a kernel thread to update the MMP block periodically.
*/
- EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s",
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s",
(int)sizeof(mmp->mmp_bdevname),
- bdevname(bh->b_bdev,
- mmp->mmp_bdevname));
+ mmp->mmp_bdevname);
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
EXT4_SB(sb)->s_mmp_tsk = NULL;
- kfree(mmpd_data);
ext4_warning(sb, "Unable to create kmmpd thread for %s.",
sb->s_id);
goto failed;
@@ -398,5 +393,3 @@ failed:
brelse(bh);
return 1;
}
-
-
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 30ce3dc69378..044e34cd835c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "ext4_extents.h"
@@ -31,8 +32,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
if (IS_ERR(path))
return PTR_ERR(path);
if (path[ext_depth(inode)].p_ext == NULL) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
*ppath = NULL;
return -ENODATA;
}
@@ -102,12 +102,10 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
if (unwritten != ext4_ext_is_unwritten(ext))
goto out;
from += ext4_ext_get_actual_len(ext);
- ext4_ext_drop_refs(path);
}
ret = 1;
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -127,7 +125,7 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
pgoff_t index1, pgoff_t index2, struct page *page[2])
{
struct address_space *mapping[2];
- unsigned fl = AOP_FLAG_NOFS;
+ unsigned int flags;
BUG_ON(!inode1 || !inode2);
if (inode1 < inode2) {
@@ -139,11 +137,15 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
mapping[1] = inode1->i_mapping;
}
- page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
- if (!page[0])
+ flags = memalloc_nofs_save();
+ page[0] = grab_cache_page_write_begin(mapping[0], index1);
+ if (!page[0]) {
+ memalloc_nofs_restore(flags);
return -ENOMEM;
+ }
- page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
+ page[1] = grab_cache_page_write_begin(mapping[1], index2);
+ memalloc_nofs_restore(flags);
if (!page[1]) {
unlock_page(page[0]);
put_page(page[0]);
@@ -215,7 +217,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
for (i = 0; i < nr; i++) {
bh = arr[i];
if (!bh_uptodate_or_lock(bh)) {
- err = bh_submit_read(bh);
+ err = ext4_read_bh(bh, 0, NULL);
if (err)
return err;
}
@@ -422,8 +424,8 @@ repair_branches:
block_len_in_page, 0, &err2);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
if (replaced_count != block_len_in_page) {
- EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
- "Unable to copy data block,"
+ ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset),
+ EIO, "Unable to copy data block,"
" data will be lost.");
*err = -EIO;
}
@@ -467,19 +469,17 @@ mext_check_arguments(struct inode *orig_inode,
if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
return -EPERM;
- /* Ext4 move extent does not support swapfile */
+ /* Ext4 move extent does not support swap files */
if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
- ext4_debug("ext4 move extent: The argument files should "
- "not be swapfile [ino:orig %lu, donor %lu]\n",
+ ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
- return -EBUSY;
+ return -ETXTBSY;
}
if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) {
- ext4_debug("ext4 move extent: The argument files should "
- "not be quota files [ino:orig %lu, donor %lu]\n",
+ ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
- return -EBUSY;
+ return -EOPNOTSUPP;
}
/* Ext4 move extent supports only extent based file */
@@ -626,13 +626,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
if (ret)
goto out;
ex = path[path->p_depth].p_ext;
- next_blk = ext4_ext_next_allocated_block(path);
cur_blk = le32_to_cpu(ex->ee_block);
cur_len = ext4_ext_get_actual_len(ex);
/* Check hole before the start pos */
if (cur_blk + cur_len - 1 < o_start) {
+ next_blk = ext4_ext_next_allocated_block(path);
if (next_blk == EXT_MAX_BLOCKS) {
- o_start = o_end;
ret = -ENODATA;
goto out;
}
@@ -659,14 +658,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
donor_page_index = d_start >> (PAGE_SHIFT -
donor_inode->i_blkbits);
offset_in_page = o_start % blocks_per_page;
- if (cur_len > blocks_per_page- offset_in_page)
+ if (cur_len > blocks_per_page - offset_in_page)
cur_len = blocks_per_page - offset_in_page;
/*
* Up semaphore to avoid following problems:
* a. transaction deadlock among ext4_journal_start,
* ->write_begin via pagefault, and jbd2_journal_commit
- * b. racing with ->readpage, ->write_begin, and ext4_get_block
- * in move_extent_per_page
+ * b. racing with ->read_folio, ->write_begin, and
+ * ext4_get_block in move_extent_per_page
*/
ext4_double_up_write_data_sem(orig_inode, donor_inode);
/* Swap original branches with new branches */
@@ -686,12 +685,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
out:
if (*moved_len) {
- ext4_discard_preallocations(orig_inode);
- ext4_discard_preallocations(donor_inode);
+ ext4_discard_preallocations(orig_inode, 0);
+ ext4_discard_preallocations(donor_inode, 0);
}
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
unlock_two_nondirectories(orig_inode, donor_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b05ea72f38fd..c08c0aba1883 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -54,6 +54,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
struct inode *inode,
ext4_lblk_t *block)
{
+ struct ext4_map_blocks map;
struct buffer_head *bh;
int err;
@@ -63,20 +64,41 @@ static struct buffer_head *ext4_append(handle_t *handle,
return ERR_PTR(-ENOSPC);
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+ map.m_lblk = *block;
+ map.m_len = 1;
+
+ /*
+ * We're appending new directory block. Make sure the block is not
+ * allocated yet, otherwise we will end up corrupting the
+ * directory.
+ */
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ if (err) {
+ EXT4_ERROR_INODE(inode, "Logical block already allocated");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
if (IS_ERR(bh))
return bh;
inode->i_size += inode->i_sb->s_blocksize;
EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err)
+ goto out;
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
- if (err) {
- brelse(bh);
- ext4_std_error(inode->i_sb, err);
- return ERR_PTR(err);
- }
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out;
return bh;
+
+out:
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+ return ERR_PTR(err);
}
static int ext4_dx_csum_verify(struct inode *inode,
@@ -109,6 +131,13 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
struct ext4_dir_entry *dirent;
int is_dx_block = 0;
+ if (block >= inode->i_size >> inode->i_blkbits) {
+ ext4_error_inode(inode, func, line, block,
+ "Attempting to read directory block (%u) that is past i_size (%llu)",
+ block, inode->i_size);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
bh = ERR_PTR(-EIO);
else
@@ -160,9 +189,9 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
!ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
set_buffer_verified(bh);
else {
- ext4_set_errno(inode->i_sb, EFSBADCRC);
- ext4_error_inode(inode, func, line, block,
- "Directory index failed checksum");
+ ext4_error_inode_err(inode, func, line, block,
+ EFSBADCRC,
+ "Directory index failed checksum");
brelse(bh);
return ERR_PTR(-EFSBADCRC);
}
@@ -172,9 +201,9 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
!ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
set_buffer_verified(bh);
else {
- ext4_set_errno(inode->i_sb, EFSBADCRC);
- ext4_error_inode(inode, func, line, block,
- "Directory block failed checksum");
+ ext4_error_inode_err(inode, func, line, block,
+ EFSBADCRC,
+ "Directory block failed checksum");
brelse(bh);
return ERR_PTR(-EFSBADCRC);
}
@@ -182,10 +211,6 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
return bh;
}
-#ifndef assert
-#define assert(test) J_ASSERT(test)
-#endif
-
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
@@ -233,13 +258,13 @@ struct dx_root
u8 unused_flags;
}
info;
- struct dx_entry entries[0];
+ struct dx_entry entries[];
};
struct dx_node
{
struct fake_dirent fake;
- struct dx_entry entries[0];
+ struct dx_entry entries[];
};
@@ -280,13 +305,15 @@ static struct dx_frame *dx_probe(struct ext4_filename *fname,
struct dx_hash_info *hinfo,
struct dx_frame *frame);
static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
- unsigned blocksize, struct dx_hash_info *hinfo,
- struct dx_map_entry map[]);
+static int dx_make_map(struct inode *dir, struct buffer_head *bh,
+ struct dx_hash_info *hinfo,
+ struct dx_map_entry *map_tail);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
- struct dx_map_entry *offsets, int count, unsigned blocksize);
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
+static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from,
+ char *to, struct dx_map_entry *offsets,
+ int count, unsigned int blocksize);
+static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
+ unsigned int blocksize);
static void dx_insert_block(struct dx_frame *frame,
u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -578,8 +605,9 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
{
- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
- EXT4_DIR_REC_LEN(2) - infosize;
+ unsigned int entry_space = dir->i_sb->s_blocksize -
+ ext4_dir_rec_len(1, NULL) -
+ ext4_dir_rec_len(2, NULL) - infosize;
if (ext4_has_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
@@ -588,7 +616,8 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
static inline unsigned dx_node_limit(struct inode *dir)
{
- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
+ unsigned int entry_space = dir->i_sb->s_blocksize -
+ ext4_dir_rec_len(0, dir);
if (ext4_has_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
@@ -643,13 +672,7 @@ static struct stats dx_show_leaf(struct inode *dir,
name = de->name;
len = de->name_len;
- if (IS_ENCRYPTED(dir))
- res = fscrypt_get_encryption_info(dir);
- if (res) {
- printk(KERN_WARNING "Error setting up"
- " fname crypto: %d\n", res);
- }
- if (!fscrypt_has_encryption_key(dir)) {
+ if (!IS_ENCRYPTED(dir)) {
/* Directory is not encrypted */
ext4fs_dirhash(dir, de->name,
de->name_len, &h);
@@ -663,8 +686,7 @@ static struct stats dx_show_leaf(struct inode *dir,
/* Directory is encrypted */
res = fscrypt_fname_alloc_buffer(
- dir, len,
- &fname_crypto_str);
+ len, &fname_crypto_str);
if (res)
printk(KERN_WARNING "Error "
"allocating crypto "
@@ -684,7 +706,10 @@ static struct stats dx_show_leaf(struct inode *dir,
name = fname_crypto_str.name;
len = fname_crypto_str.len;
}
- ext4fs_dirhash(dir, de->name,
+ if (IS_CASEFOLDED(dir))
+ h.hash = EXT4_DIRENT_HASH(de);
+ else
+ ext4fs_dirhash(dir, de->name,
de->name_len, &h);
printk("%*.s:(E)%x.%u ", len, name,
h.hash, (unsigned) ((char *) de
@@ -700,7 +725,7 @@ static struct stats dx_show_leaf(struct inode *dir,
(unsigned) ((char *) de - base));
#endif
}
- space += EXT4_DIR_REC_LEN(de->name_len);
+ space += ext4_dir_rec_len(de->name_len, dir);
names++;
}
de = ext4_next_entry(de, size);
@@ -742,6 +767,29 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
(space/bcount)*100/blocksize);
return (struct stats) { names, space, bcount};
}
+
+/*
+ * Linear search cross check
+ */
+static inline void htree_rep_invariant_check(struct dx_entry *at,
+ struct dx_entry *target,
+ u32 hash, unsigned int n)
+{
+ while (n--) {
+ dxtrace(printk(KERN_CONT ","));
+ if (dx_get_hash(++at) > hash) {
+ at--;
+ break;
+ }
+ }
+ ASSERT(at == target - 1);
+}
+#else /* DX_DEBUG */
+static inline void htree_rep_invariant_check(struct dx_entry *at,
+ struct dx_entry *target,
+ u32 hash, unsigned int n)
+{
+}
#endif /* DX_DEBUG */
/*
@@ -757,12 +805,14 @@ static struct dx_frame *
dx_probe(struct ext4_filename *fname, struct inode *dir,
struct dx_hash_info *hinfo, struct dx_frame *frame_in)
{
- unsigned count, indirect;
+ unsigned count, indirect, level, i;
struct dx_entry *at, *entries, *p, *q, *m;
struct dx_root *root;
struct dx_frame *frame = frame_in;
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;
+ ext4_lblk_t block;
+ ext4_lblk_t blocks[EXT4_HTREE_LEVEL];
memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
frame->bh = ext4_read_dirblock(dir, 0, INDEX);
@@ -772,18 +822,34 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
root = (struct dx_root *) frame->bh->b_data;
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
- root->info.hash_version != DX_HASH_LEGACY) {
+ root->info.hash_version != DX_HASH_LEGACY &&
+ root->info.hash_version != DX_HASH_SIPHASH) {
ext4_warning_inode(dir, "Unrecognised inode hash code %u",
root->info.hash_version);
goto fail;
}
+ if (ext4_hash_in_dirent(dir)) {
+ if (root->info.hash_version != DX_HASH_SIPHASH) {
+ ext4_warning_inode(dir,
+ "Hash in dirent, but hash is not SIPHASH");
+ goto fail;
+ }
+ } else {
+ if (root->info.hash_version == DX_HASH_SIPHASH) {
+ ext4_warning_inode(dir,
+ "Hash code is SIPHASH, but hash not in dirent");
+ goto fail;
+ }
+ }
if (fname)
hinfo = &fname->hinfo;
hinfo->hash_version = root->info.hash_version;
if (hinfo->hash_version <= DX_HASH_TEA)
hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- if (fname && fname_name(fname))
+ /* hash is already computed for encrypted casefolded directory */
+ if (fname && fname_name(fname) &&
+ !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir)))
ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo);
hash = hinfo->hash;
@@ -818,6 +884,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
}
dxtrace(printk("Look up %x", hash));
+ level = 0;
+ blocks[0] = 0;
while (1) {
count = dx_get_count(entries);
if (!count || count > dx_get_limit(entries)) {
@@ -838,20 +906,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
p = m + 1;
}
- if (0) { // linear search cross check
- unsigned n = count - 1;
- at = entries;
- while (n--)
- {
- dxtrace(printk(KERN_CONT ","));
- if (dx_get_hash(++at) > hash)
- {
- at--;
- break;
- }
- }
- assert (at == p - 1);
- }
+ htree_rep_invariant_check(entries, p, hash, count - 1);
at = p - 1;
dxtrace(printk(KERN_CONT " %x->%u\n",
@@ -859,15 +914,27 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
dx_get_block(at)));
frame->entries = entries;
frame->at = at;
- if (!indirect--)
+
+ block = dx_get_block(at);
+ for (i = 0; i <= level; i++) {
+ if (blocks[i] == block) {
+ ext4_warning_inode(dir,
+ "dx entry: tree cycle block %u points back to block %u",
+ blocks[level], block);
+ goto fail;
+ }
+ }
+ if (++level > indirect)
return frame;
+ blocks[level] = block;
frame++;
- frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+ frame->bh = ext4_read_dirblock(dir, block, INDEX);
if (IS_ERR(frame->bh)) {
ret_err = (struct dx_frame *) frame->bh;
frame->bh = NULL;
goto fail;
}
+
entries = ((struct dx_node *) frame->bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit(dir)) {
@@ -957,7 +1024,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
* If the hash is 1, then continue only if the next page has a
* continuation hash of any value. This is used for readdir
* handling. Otherwise, check to see if the hash matches the
- * desired contiuation hash. If it doesn't, return since
+ * desired continuation hash. If it doesn't, return since
* there's no point to read in the successive index pages.
*/
bhash = dx_get_hash(p->at);
@@ -998,6 +1065,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
struct ext4_dir_entry_2 *de, *top;
int err = 0, count = 0;
struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str;
+ int csum = ext4_has_metadata_csum(dir->i_sb);
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
@@ -1006,18 +1074,20 @@ static int htree_dirblock_to_tree(struct file *dir_file,
return PTR_ERR(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
+ /* csum entries are not larger in the casefolded encrypted case */
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
- EXT4_DIR_REC_LEN(0));
+ ext4_dir_rec_len(0,
+ csum ? NULL : dir));
/* Check if the directory is encrypted */
if (IS_ENCRYPTED(dir)) {
- err = fscrypt_get_encryption_info(dir);
+ err = fscrypt_prepare_readdir(dir);
if (err < 0) {
brelse(bh);
return err;
}
- err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN,
- &fname_crypto_str);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN,
+ &fname_crypto_str);
if (err < 0) {
brelse(bh);
return err;
@@ -1032,7 +1102,17 @@ static int htree_dirblock_to_tree(struct file *dir_file,
/* silently ignore the rest of the block */
break;
}
- ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ if (ext4_hash_in_dirent(dir)) {
+ if (de->name_len && de->inode) {
+ hinfo->hash = EXT4_DIRENT_HASH(de);
+ hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
+ } else {
+ hinfo->hash = 0;
+ hinfo->minor_hash = 0;
+ }
+ } else {
+ ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ }
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
@@ -1101,7 +1181,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
start_hash, start_minor_hash));
dir = file_inode(dir_file);
if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
- hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (ext4_hash_in_dirent(dir))
+ hinfo.hash_version = DX_HASH_SIPHASH;
+ else
+ hinfo.hash_version =
+ EXT4_SB(dir->i_sb)->s_def_hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
hinfo.hash_version +=
EXT4_SB(dir->i_sb)->s_hash_unsigned;
@@ -1209,17 +1293,28 @@ static inline int search_dirblock(struct buffer_head *bh,
* Create map of hash values, offsets, and sizes, stored at end of block.
* Returns number of entries mapped.
*/
-static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
- unsigned blocksize, struct dx_hash_info *hinfo,
+static int dx_make_map(struct inode *dir, struct buffer_head *bh,
+ struct dx_hash_info *hinfo,
struct dx_map_entry *map_tail)
{
int count = 0;
- char *base = (char *) de;
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data;
+ unsigned int buflen = bh->b_size;
+ char *base = bh->b_data;
struct dx_hash_info h = *hinfo;
- while ((char *) de < base + blocksize) {
+ if (ext4_has_metadata_csum(dir->i_sb))
+ buflen -= sizeof(struct ext4_dir_entry_tail);
+
+ while ((char *) de < base + buflen) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen,
+ ((char *)de) - base))
+ return -EFSCORRUPTED;
if (de->name_len && de->inode) {
- ext4fs_dirhash(dir, de->name, de->name_len, &h);
+ if (ext4_hash_in_dirent(dir))
+ h.hash = EXT4_DIRENT_HASH(de);
+ else
+ ext4fs_dirhash(dir, de->name, de->name_len, &h);
map_tail--;
map_tail->hash = h.hash;
map_tail->offs = ((char *) de - base)>>2;
@@ -1227,8 +1322,7 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
count++;
cond_resched();
}
- /* XXX: do we need to check rec_len == 0 case? -Chris */
- de = ext4_next_entry(de, blocksize);
+ de = ext4_next_entry(de, dir->i_sb->s_blocksize);
}
return count;
}
@@ -1266,15 +1360,15 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
struct dx_entry *old = frame->at, *new = old + 1;
int count = dx_get_count(entries);
- assert(count < dx_get_limit(entries));
- assert(old < entries + count);
+ ASSERT(count < dx_get_limit(entries));
+ ASSERT(old < entries + count);
memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
dx_set_hash(new, hash);
dx_set_block(new, block);
dx_set_count(entries, count + 1);
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/*
* Test whether a case-insensitive directory entry matches the filename
* being searched for. If quick is set, assume the name being looked up
@@ -1283,58 +1377,85 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
* Returns: 0 if the directory entry matches, more than 0 if it
* doesn't match or less than zero on error.
*/
-int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
- const struct qstr *entry, bool quick)
+static int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
+ u8 *de_name, size_t de_name_len, bool quick)
{
- const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb);
- const struct unicode_map *um = sbi->s_encoding;
+ const struct super_block *sb = parent->i_sb;
+ const struct unicode_map *um = sb->s_encoding;
+ struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len);
+ struct qstr entry = QSTR_INIT(de_name, de_name_len);
int ret;
+ if (IS_ENCRYPTED(parent)) {
+ const struct fscrypt_str encrypted_name =
+ FSTR_INIT(de_name, de_name_len);
+
+ decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
+ if (!decrypted_name.name)
+ return -ENOMEM;
+ ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name,
+ &decrypted_name);
+ if (ret < 0)
+ goto out;
+ entry.name = decrypted_name.name;
+ entry.len = decrypted_name.len;
+ }
+
if (quick)
- ret = utf8_strncasecmp_folded(um, name, entry);
+ ret = utf8_strncasecmp_folded(um, name, &entry);
else
- ret = utf8_strncasecmp(um, name, entry);
-
+ ret = utf8_strncasecmp(um, name, &entry);
if (ret < 0) {
/* Handle invalid character sequence as either an error
* or as an opaque byte sequence.
*/
- if (ext4_has_strict_mode(sbi))
- return -EINVAL;
-
- if (name->len != entry->len)
- return 1;
-
- return !!memcmp(name->name, entry->name, name->len);
+ if (sb_has_strict_encoding(sb))
+ ret = -EINVAL;
+ else if (name->len != entry.len)
+ ret = 1;
+ else
+ ret = !!memcmp(name->name, entry.name, entry.len);
}
-
+out:
+ kfree(decrypted_name.name);
return ret;
}
-void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
- struct fscrypt_str *cf_name)
+int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
+ struct ext4_filename *name)
{
+ struct fscrypt_str *cf_name = &name->cf_name;
+ struct dx_hash_info *hinfo = &name->hinfo;
int len;
- if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) {
+ if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding ||
+ (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) {
cf_name->name = NULL;
- return;
+ return 0;
}
cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
if (!cf_name->name)
- return;
+ return -ENOMEM;
- len = utf8_casefold(EXT4_SB(dir->i_sb)->s_encoding,
+ len = utf8_casefold(dir->i_sb->s_encoding,
iname, cf_name->name,
EXT4_NAME_LEN);
if (len <= 0) {
kfree(cf_name->name);
cf_name->name = NULL;
- return;
}
cf_name->len = (unsigned) len;
+ if (!IS_ENCRYPTED(dir))
+ return 0;
+ hinfo->hash_version = DX_HASH_SIPHASH;
+ hinfo->seed = NULL;
+ if (cf_name->name)
+ ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo);
+ else
+ ext4fs_dirhash(dir, iname->name, iname->len, hinfo);
+ return 0;
}
#endif
@@ -1343,14 +1464,11 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
*
* Return: %true if the directory entry matches, otherwise %false.
*/
-static inline bool ext4_match(const struct inode *parent,
+static bool ext4_match(struct inode *parent,
const struct ext4_filename *fname,
- const struct ext4_dir_entry_2 *de)
+ struct ext4_dir_entry_2 *de)
{
struct fscrypt_name f;
-#ifdef CONFIG_UNICODE
- const struct qstr entry = {.name = de->name, .len = de->name_len};
-#endif
if (!de->inode)
return false;
@@ -1361,15 +1479,25 @@ static inline bool ext4_match(const struct inode *parent,
f.crypto_buf = fname->crypto_buf;
#endif
-#ifdef CONFIG_UNICODE
- if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent)) {
+#if IS_ENABLED(CONFIG_UNICODE)
+ if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) &&
+ (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) {
if (fname->cf_name.name) {
struct qstr cf = {.name = fname->cf_name.name,
.len = fname->cf_name.len};
- return !ext4_ci_compare(parent, &cf, &entry, true);
+ if (IS_ENCRYPTED(parent)) {
+ if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
+ fname->hinfo.minor_hash !=
+ EXT4_DIRENT_MINOR_HASH(de)) {
+
+ return false;
+ }
+ }
+ return !ext4_ci_compare(parent, &cf, de->name,
+ de->name_len, true);
}
- return !ext4_ci_compare(parent, fname->usr_fname, &entry,
- false);
+ return !ext4_ci_compare(parent, fname->usr_fname, de->name,
+ de->name_len, false);
}
#endif
@@ -1389,15 +1517,15 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
- while ((char *) de < dlimit) {
+ while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
- if ((char *) de + de->name_len <= dlimit &&
+ if (de->name + de->name_len <= dlimit &&
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
- bh->b_size, offset))
+ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf,
+ buf_size, offset))
return -1;
*res_dir = de;
return 1;
@@ -1532,9 +1660,9 @@ restart:
goto next;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- ext4_set_errno(sb, EIO);
- EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
- (unsigned long) block);
+ EXT4_ERROR_INODE_ERR(dir, EIO,
+ "reading directory lblock %lu",
+ (unsigned long) block);
brelse(bh);
ret = ERR_PTR(-EIO);
goto cleanup_and_exit;
@@ -1543,9 +1671,9 @@ restart:
!is_dx_internal_node(dir, block,
(struct ext4_dir_entry *)bh->b_data) &&
!ext4_dirblock_csum_verify(dir, bh)) {
- ext4_set_errno(sb, EFSBADCRC);
- EXT4_ERROR_INODE(dir, "checksumming directory "
- "block %lu", (unsigned long)block);
+ EXT4_ERROR_INODE_ERR(dir, EFSBADCRC,
+ "checksumming directory "
+ "block %lu", (unsigned long)block);
brelse(bh);
ret = ERR_PTR(-EFSBADCRC);
goto cleanup_and_exit;
@@ -1615,6 +1743,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
struct buffer_head *bh;
err = ext4_fname_prepare_lookup(dir, dentry, &fname);
+ generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
return NULL;
if (err)
@@ -1722,7 +1851,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
}
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (!inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
@@ -1739,11 +1868,10 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
struct dentry *ext4_get_parent(struct dentry *child)
{
__u32 ino;
- static const struct qstr dotdot = QSTR_INIT("..", 2);
struct ext4_dir_entry_2 * de;
struct buffer_head *bh;
- bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
+ bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL);
if (IS_ERR(bh))
return ERR_CAST(bh);
if (!bh)
@@ -1765,7 +1893,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
* Returns pointer to last entry moved.
*/
static struct ext4_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+dx_move_dirents(struct inode *dir, char *from, char *to,
+ struct dx_map_entry *map, int count,
unsigned blocksize)
{
unsigned rec_len = 0;
@@ -1773,11 +1902,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
while (count--) {
struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
(from + (map->offs<<2));
- rec_len = EXT4_DIR_REC_LEN(de->name_len);
+ rec_len = ext4_dir_rec_len(de->name_len, dir);
+
memcpy (to, de, rec_len);
((struct ext4_dir_entry_2 *) to)->rec_len =
ext4_rec_len_to_disk(rec_len, blocksize);
+
+ /* wipe dir_entry excluding the rec_len field */
de->inode = 0;
+ memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len,
+ blocksize) -
+ offsetof(struct ext4_dir_entry_2,
+ name_len));
+
map++;
to += rec_len;
}
@@ -1788,7 +1925,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
* Compact each dir entry in the range to the minimal rec_len.
* Returns pointer to last entry in range.
*/
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
+static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
+ unsigned int blocksize)
{
struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
unsigned rec_len = 0;
@@ -1797,7 +1935,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
while ((char*)de < base + blocksize) {
next = ext4_next_entry(de, blocksize);
if (de->inode && de->name_len) {
- rec_len = EXT4_DIR_REC_LEN(de->name_len);
+ rec_len = ext4_dir_rec_len(de->name_len, dir);
if (de > to)
memmove(to, de, rec_len);
to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
@@ -1819,7 +1957,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
struct dx_hash_info *hinfo)
{
unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count, continued;
+ unsigned continued;
+ int count;
struct buffer_head *bh2;
ext4_lblk_t newblock;
u32 hash2;
@@ -1841,12 +1980,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
}
BUFFER_TRACE(*bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, *bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, *bh,
+ EXT4_JTR_NONE);
if (err)
goto journal_error;
BUFFER_TRACE(frame->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, frame->bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh,
+ EXT4_JTR_NONE);
if (err)
goto journal_error;
@@ -1854,11 +1995,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* create map in the end of data2 block */
map = (struct dx_map_entry *) (data2 + blocksize);
- count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1,
- blocksize, hinfo, map);
+ count = dx_make_map(dir, *bh, hinfo, map);
+ if (count < 0) {
+ err = count;
+ goto journal_error;
+ }
map -= count;
dx_sort_map(map, count);
- /* Split the existing block in the middle, size-wise */
+ /* Ensure that neither split block is over half full */
size = 0;
move = 0;
for (i = count-1; i >= 0; i--) {
@@ -1868,8 +2012,18 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
size += map[i].size;
move++;
}
- /* map index at which we will split */
- split = count - move;
+ /*
+ * map index at which we will split
+ *
+ * If the sum of active entries didn't exceed half the block size, just
+ * split it in half by count; each resulting block will have at least
+ * half the space free.
+ */
+ if (i > 0)
+ split = count - move;
+ else
+ split = count/2;
+
hash2 = map[split].hash;
continued = hash2 == map[split - 1].hash;
dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
@@ -1877,9 +2031,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
- de2 = dx_move_dirents(data1, data2, map + split, count - split,
+ de2 = dx_move_dirents(dir, data1, data2, map + split, count - split,
blocksize);
- de = dx_pack_dirents(data1, blocksize);
+ de = dx_pack_dirents(dir, data1, blocksize);
de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
(char *) de,
blocksize);
@@ -1927,12 +2081,12 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
struct ext4_dir_entry_2 **dest_de)
{
struct ext4_dir_entry_2 *de;
- unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname));
+ unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir);
int nlen, rlen;
unsigned int offset = 0;
char *top;
- de = (struct ext4_dir_entry_2 *)buf;
+ de = buf;
top = buf + buf_size - reclen;
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
@@ -1940,7 +2094,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
return -EFSCORRUPTED;
if (ext4_match(dir, fname, de))
return -EEXIST;
- nlen = EXT4_DIR_REC_LEN(de->name_len);
+ nlen = ext4_dir_rec_len(de->name_len, dir);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if ((de->inode ? rlen - nlen : rlen) >= reclen)
break;
@@ -1954,7 +2108,8 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
return 0;
}
-void ext4_insert_dentry(struct inode *inode,
+void ext4_insert_dentry(struct inode *dir,
+ struct inode *inode,
struct ext4_dir_entry_2 *de,
int buf_size,
struct ext4_filename *fname)
@@ -1962,7 +2117,7 @@ void ext4_insert_dentry(struct inode *inode,
int nlen, rlen;
- nlen = EXT4_DIR_REC_LEN(de->name_len);
+ nlen = ext4_dir_rec_len(de->name_len, dir);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if (de->inode) {
struct ext4_dir_entry_2 *de1 =
@@ -1976,6 +2131,13 @@ void ext4_insert_dentry(struct inode *inode,
ext4_set_de_type(inode->i_sb, de, inode->i_mode);
de->name_len = fname_len(fname);
memcpy(de->name, fname_name(fname), fname_len(fname));
+ if (ext4_hash_in_dirent(dir)) {
+ struct dx_hash_info *hinfo = &fname->hinfo;
+
+ EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash);
+ EXT4_DIRENT_HASHES(de)->minor_hash =
+ cpu_to_le32(hinfo->minor_hash);
+ }
}
/*
@@ -1993,7 +2155,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
{
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
- int err;
+ int err, err2;
if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
@@ -2005,14 +2167,15 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
return err;
}
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (err) {
ext4_std_error(dir->i_sb, err);
return err;
}
/* By now the buffer is marked for journaling */
- ext4_insert_dentry(inode, de, blocksize, fname);
+ ext4_insert_dentry(dir, inode, de, blocksize, fname);
/*
* XXX shouldn't update any times until successful
@@ -2028,12 +2191,12 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
- ext4_mark_inode_dirty(handle, dir);
+ err2 = ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_dirblock(handle, dir, bh);
if (err)
ext4_std_error(dir->i_sb, err);
- return 0;
+ return err ? err : err2;
}
/*
@@ -2063,7 +2226,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
blocksize = dir->i_sb->s_blocksize;
dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
BUFFER_TRACE(bh, "get_write_access");
- retval = ext4_journal_get_write_access(handle, bh);
+ retval = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (retval) {
ext4_std_error(dir->i_sb, retval);
brelse(bh);
@@ -2092,10 +2256,19 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
data2 = bh2->b_data;
memcpy(data2, de, len);
+ memset(de, 0, len); /* wipe old data */
de = (struct ext4_dir_entry_2 *) data2;
top = data2 + len;
- while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
+ while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
+ (data2 + (blocksize - csum_size) -
+ (char *) de))) {
+ brelse(bh2);
+ brelse(bh);
+ return -EFSCORRUPTED;
+ }
de = de2;
+ }
de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
(char *) de, blocksize);
@@ -2104,11 +2277,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
/* Initialize the root; the dot dirents already exist */
de = (struct ext4_dir_entry_2 *) (&root->dotdot);
- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
- blocksize);
+ de->rec_len = ext4_rec_len_to_disk(
+ blocksize - ext4_dir_rec_len(2, NULL), blocksize);
memset (&root->info, 0, sizeof(root->info));
root->info.info_length = sizeof(root->info);
- root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (ext4_hash_in_dirent(dir))
+ root->info.hash_version = DX_HASH_SIPHASH;
+ else
+ root->info.hash_version =
+ EXT4_SB(dir->i_sb)->s_def_hash_version;
+
entries = root->entries;
dx_set_block(entries, 1);
dx_set_count(entries, 1);
@@ -2119,7 +2297,11 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
if (fname->hinfo.hash_version <= DX_HASH_TEA)
fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo);
+
+ /* casefolded encrypted hashes are computed on fname setup */
+ if (!ext4_hash_in_dirent(dir))
+ ext4fs_dirhash(dir, fname_name(fname),
+ fname_len(fname), &fname->hinfo);
memset(frames, 0, sizeof(frames));
frame = frames;
@@ -2129,10 +2311,10 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (retval)
- goto out_frames;
+ goto out_frames;
retval = ext4_handle_dirty_dirblock(handle, dir, bh2);
if (retval)
- goto out_frames;
+ goto out_frames;
de = do_split(handle,dir, &bh2, frame, &fname->hinfo);
if (IS_ERR(de)) {
@@ -2171,9 +2353,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
struct buffer_head *bh = NULL;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
-#ifdef CONFIG_UNICODE
- struct ext4_sb_info *sbi;
-#endif
struct ext4_filename fname;
int retval;
int dx_fallback=0;
@@ -2189,10 +2368,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (!dentry->d_name.len)
return -EINVAL;
-#ifdef CONFIG_UNICODE
- sbi = EXT4_SB(sb);
- if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
- sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name))
+ if (fscrypt_is_nokey_name(dentry))
+ return -ENOKEY;
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
+ sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))
return -EINVAL;
#endif
@@ -2223,7 +2404,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
}
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
- ext4_mark_inode_dirty(handle, dir);
+ retval = ext4_mark_inode_dirty(handle, dir);
+ if (unlikely(retval))
+ goto out;
}
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
@@ -2304,7 +2487,7 @@ again:
}
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
if (err)
goto journal_error;
@@ -2361,7 +2544,8 @@ again:
node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
sb->s_blocksize);
BUFFER_TRACE(frame->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, frame->bh);
+ err = ext4_journal_get_write_access(handle, sb, frame->bh,
+ EXT4_JTR_NONE);
if (err)
goto journal_error;
if (!add_level) {
@@ -2371,8 +2555,9 @@ again:
icount1, icount2));
BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
- err = ext4_journal_get_write_access(handle,
- (frame - 1)->bh);
+ err = ext4_journal_get_write_access(handle, sb,
+ (frame - 1)->bh,
+ EXT4_JTR_NONE);
if (err)
goto journal_error;
@@ -2384,7 +2569,7 @@ again:
/* Which index block gets the new entry? */
if (at - entries >= icount1) {
- frame->at = at = at - entries - icount1 + entries2;
+ frame->at = at - entries - icount1 + entries2;
frame->entries = entries = entries2;
swap(frame->bh, bh2);
}
@@ -2400,11 +2585,10 @@ again:
(frame - 1)->bh);
if (err)
goto journal_error;
- if (restart) {
- err = ext4_handle_dirty_dx_node(handle, dir,
- frame->bh);
+ err = ext4_handle_dirty_dx_node(handle, dir,
+ frame->bh);
+ if (restart || err)
goto journal_error;
- }
} else {
struct dx_root *dxroot;
memcpy((char *) entries2, (char *) entries,
@@ -2453,8 +2637,7 @@ cleanup:
* ext4_generic_delete_entry deletes a directory entry by merging it
* with the previous entry
*/
-int ext4_generic_delete_entry(handle_t *handle,
- struct inode *dir,
+int ext4_generic_delete_entry(struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh,
void *entry_buf,
@@ -2467,21 +2650,33 @@ int ext4_generic_delete_entry(handle_t *handle,
i = 0;
pde = NULL;
- de = (struct ext4_dir_entry_2 *)entry_buf;
+ de = entry_buf;
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
- bh->b_data, bh->b_size, i))
+ entry_buf, buf_size, i))
return -EFSCORRUPTED;
if (de == de_del) {
- if (pde)
+ if (pde) {
pde->rec_len = ext4_rec_len_to_disk(
ext4_rec_len_from_disk(pde->rec_len,
blocksize) +
ext4_rec_len_from_disk(de->rec_len,
blocksize),
blocksize);
- else
+
+ /* wipe entire dir_entry */
+ memset(de, 0, ext4_rec_len_from_disk(de->rec_len,
+ blocksize));
+ } else {
+ /* wipe dir_entry excluding the rec_len field */
de->inode = 0;
+ memset(&de->name_len, 0,
+ ext4_rec_len_from_disk(de->rec_len,
+ blocksize) -
+ offsetof(struct ext4_dir_entry_2,
+ name_len));
+ }
+
inode_inc_iversion(dir);
return 0;
}
@@ -2511,12 +2706,12 @@ static int ext4_delete_entry(handle_t *handle,
csum_size = sizeof(struct ext4_dir_entry_tail);
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (unlikely(err))
goto out;
- err = ext4_generic_delete_entry(handle, dir, de_del,
- bh, bh->b_data,
+ err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data,
dir->i_sb->s_blocksize, csum_size);
if (err)
goto out;
@@ -2544,7 +2739,7 @@ out:
* for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set
* on regular files) and to avoid creating huge/slow non-HTREE directories.
*/
-static void ext4_inc_count(handle_t *handle, struct inode *inode)
+static void ext4_inc_count(struct inode *inode)
{
inc_nlink(inode);
if (is_dx(inode) &&
@@ -2556,7 +2751,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
* If a directory had nlink == 1, then we should let it be 1. This indicates
* directory has >EXT4_LINK_MAX subdirs.
*/
-static void ext4_dec_count(handle_t *handle, struct inode *inode)
+static void ext4_dec_count(struct inode *inode)
{
if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
drop_nlink(inode);
@@ -2576,12 +2771,12 @@ static int ext4_add_nondir(handle_t *handle,
struct inode *inode = *inodep;
int err = ext4_add_entry(handle, dentry, inode);
if (!err) {
- ext4_mark_inode_dirty(handle, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
d_instantiate_new(dentry, inode);
*inodep = NULL;
- return 0;
+ return err;
}
drop_nlink(inode);
ext4_orphan_add(handle, inode);
@@ -2597,8 +2792,8 @@ static int ext4_add_nondir(handle_t *handle,
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
handle_t *handle;
struct inode *inode;
@@ -2611,8 +2806,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
- inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
- NULL, EXT4_HT_DIR, credits);
+ inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name,
+ 0, NULL, EXT4_HT_DIR, credits);
handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
@@ -2620,6 +2815,8 @@ retry:
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
err = ext4_add_nondir(handle, dentry, &inode);
+ if (!err)
+ ext4_fc_track_create(handle, dentry);
}
if (handle)
ext4_journal_stop(handle);
@@ -2630,8 +2827,8 @@ retry:
return err;
}
-static int ext4_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
+static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
{
handle_t *handle;
struct inode *inode;
@@ -2644,14 +2841,16 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
- inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
- NULL, EXT4_HT_DIR, credits);
+ inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name,
+ 0, NULL, EXT4_HT_DIR, credits);
handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ext4_special_inode_operations;
err = ext4_add_nondir(handle, dentry, &inode);
+ if (!err)
+ ext4_fc_track_create(handle, dentry);
}
if (handle)
ext4_journal_stop(handle);
@@ -2662,7 +2861,8 @@ retry:
return err;
}
-static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct file *file, umode_t mode)
{
handle_t *handle;
struct inode *inode;
@@ -2673,7 +2873,7 @@ static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
return err;
retry:
- inode = ext4_new_inode_start_handle(dir, mode,
+ inode = ext4_new_inode_start_handle(mnt_userns, dir, mode,
NULL, 0, NULL,
EXT4_HT_DIR,
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
@@ -2684,7 +2884,7 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
err = ext4_orphan_add(handle, inode);
if (err)
goto err_unlock_inode;
@@ -2695,7 +2895,7 @@ retry:
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
- return err;
+ return finish_open_simple(file, err);
err_unlock_inode:
ext4_journal_stop(handle);
unlock_new_inode(inode);
@@ -2709,7 +2909,7 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
{
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+ de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL),
blocksize);
strcpy(de->name, ".");
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
@@ -2719,18 +2919,19 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
de->name_len = 2;
if (!dotdot_real_len)
de->rec_len = ext4_rec_len_to_disk(blocksize -
- (csum_size + EXT4_DIR_REC_LEN(1)),
+ (csum_size + ext4_dir_rec_len(1, NULL)),
blocksize);
else
de->rec_len = ext4_rec_len_to_disk(
- EXT4_DIR_REC_LEN(de->name_len), blocksize);
+ ext4_dir_rec_len(de->name_len, NULL),
+ blocksize);
strcpy(de->name, "..");
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
return ext4_next_entry(de, blocksize);
}
-static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct inode *inode)
{
struct buffer_head *dir_block = NULL;
@@ -2771,11 +2972,12 @@ out:
return err;
}
-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
handle_t *handle;
struct inode *inode;
- int err, credits, retries = 0;
+ int err, err2 = 0, credits, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
return -EMLINK;
@@ -2787,7 +2989,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
- inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
+ inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFDIR | mode,
&dentry->d_name,
0, NULL, EXT4_HT_DIR, credits);
handle = ext4_journal_current_handle();
@@ -2808,17 +3010,21 @@ out_clear_inode:
clear_nlink(inode);
ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
- ext4_mark_inode_dirty(handle, inode);
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err2))
+ err = err2;
ext4_journal_stop(handle);
iput(inode);
goto out_retry;
}
- ext4_inc_count(handle, dir);
+ ext4_inc_count(dir);
+
ext4_update_dx_flag(dir);
err = ext4_mark_inode_dirty(handle, dir);
if (err)
goto out_clear_inode;
d_instantiate_new(dentry, inode);
+ ext4_fc_track_create(handle, dentry);
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
@@ -2851,16 +3057,17 @@ bool ext4_empty_dir(struct inode *inode)
}
sb = inode->i_sb;
- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
+ if (inode->i_size < ext4_dir_rec_len(1, NULL) +
+ ext4_dir_rec_len(2, NULL)) {
EXT4_ERROR_INODE(inode, "invalid size");
- return true;
+ return false;
}
/* The first directory block must not be a hole,
* so treat it as DIRENT_HTREE
*/
bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
if (IS_ERR(bh))
- return true;
+ return false;
de = (struct ext4_dir_entry_2 *) bh->b_data;
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
@@ -2868,7 +3075,7 @@ bool ext4_empty_dir(struct inode *inode)
le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
ext4_warning_inode(inode, "directory missing '.'");
brelse(bh);
- return true;
+ return false;
}
offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
de = ext4_next_entry(de, sb->s_blocksize);
@@ -2877,7 +3084,7 @@ bool ext4_empty_dir(struct inode *inode)
le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
- return true;
+ return false;
}
offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
while (offset < inode->i_size) {
@@ -2891,16 +3098,13 @@ bool ext4_empty_dir(struct inode *inode)
continue;
}
if (IS_ERR(bh))
- return true;
+ return false;
}
de = (struct ext4_dir_entry_2 *) (bh->b_data +
(offset & (sb->s_blocksize - 1)));
if (ext4_check_dir_entry(inode, NULL, de, bh,
- bh->b_data, bh->b_size, offset)) {
- offset = (offset | (sb->s_blocksize - 1)) + 1;
- continue;
- }
- if (le32_to_cpu(de->inode)) {
+ bh->b_data, bh->b_size, offset) ||
+ le32_to_cpu(de->inode)) {
brelse(bh);
return false;
}
@@ -2910,180 +3114,6 @@ bool ext4_empty_dir(struct inode *inode)
return true;
}
-/*
- * ext4_orphan_add() links an unlinked or truncated inode into a list of
- * such inodes, starting at the superblock, in case we crash before the
- * file is closed/deleted, or in case the inode truncate spans multiple
- * transactions and the last transaction is not recovered after a crash.
- *
- * At filesystem recovery time, we walk this list deleting unlinked
- * inodes and truncating linked inodes in ext4_orphan_cleanup().
- *
- * Orphan list manipulation functions must be called under i_mutex unless
- * we are just creating the inode or deleting it.
- */
-int ext4_orphan_add(handle_t *handle, struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_iloc iloc;
- int err = 0, rc;
- bool dirty = false;
-
- if (!sbi->s_journal || is_bad_inode(inode))
- return 0;
-
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
- !inode_is_locked(inode));
- /*
- * Exit early if inode already is on orphan list. This is a big speedup
- * since we don't have to contend on the global s_orphan_lock.
- */
- if (!list_empty(&EXT4_I(inode)->i_orphan))
- return 0;
-
- /*
- * Orphan handling is only valid for files with data blocks
- * being truncated, or files being unlinked. Note that we either
- * hold i_mutex, or the inode can not be referenced from outside,
- * so i_nlink should not be bumped due to race
- */
- J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
-
- BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err)
- goto out;
-
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
- goto out;
-
- mutex_lock(&sbi->s_orphan_lock);
- /*
- * Due to previous errors inode may be already a part of on-disk
- * orphan list. If so skip on-disk list modification.
- */
- if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
- (le32_to_cpu(sbi->s_es->s_inodes_count))) {
- /* Insert this inode at the head of the on-disk orphan list */
- NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
- sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- dirty = true;
- }
- list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
- mutex_unlock(&sbi->s_orphan_lock);
-
- if (dirty) {
- err = ext4_handle_dirty_super(handle, sb);
- rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
- if (!err)
- err = rc;
- if (err) {
- /*
- * We have to remove inode from in-memory list if
- * addition to on disk orphan list failed. Stray orphan
- * list entries can cause panics at unmount time.
- */
- mutex_lock(&sbi->s_orphan_lock);
- list_del_init(&EXT4_I(inode)->i_orphan);
- mutex_unlock(&sbi->s_orphan_lock);
- }
- } else
- brelse(iloc.bh);
-
- jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
- jbd_debug(4, "orphan inode %lu will point to %d\n",
- inode->i_ino, NEXT_ORPHAN(inode));
-out:
- ext4_std_error(sb, err);
- return err;
-}
-
-/*
- * ext4_orphan_del() removes an unlinked or truncated inode from the list
- * of such inodes stored on disk, because it is finally being cleaned up.
- */
-int ext4_orphan_del(handle_t *handle, struct inode *inode)
-{
- struct list_head *prev;
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- __u32 ino_next;
- struct ext4_iloc iloc;
- int err = 0;
-
- if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
- return 0;
-
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
- !inode_is_locked(inode));
- /* Do this quick check before taking global s_orphan_lock. */
- if (list_empty(&ei->i_orphan))
- return 0;
-
- if (handle) {
- /* Grab inode buffer early before taking global s_orphan_lock */
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- }
-
- mutex_lock(&sbi->s_orphan_lock);
- jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
-
- prev = ei->i_orphan.prev;
- list_del_init(&ei->i_orphan);
-
- /* If we're on an error path, we may not have a valid
- * transaction handle with which to update the orphan list on
- * disk, but we still need to remove the inode from the linked
- * list in memory. */
- if (!handle || err) {
- mutex_unlock(&sbi->s_orphan_lock);
- goto out_err;
- }
-
- ino_next = NEXT_ORPHAN(inode);
- if (prev == &sbi->s_orphan) {
- jbd_debug(4, "superblock will point to %u\n", ino_next);
- BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err) {
- mutex_unlock(&sbi->s_orphan_lock);
- goto out_brelse;
- }
- sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- mutex_unlock(&sbi->s_orphan_lock);
- err = ext4_handle_dirty_super(handle, inode->i_sb);
- } else {
- struct ext4_iloc iloc2;
- struct inode *i_prev =
- &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
-
- jbd_debug(4, "orphan inode %lu will point to %u\n",
- i_prev->i_ino, ino_next);
- err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
- if (err) {
- mutex_unlock(&sbi->s_orphan_lock);
- goto out_brelse;
- }
- NEXT_ORPHAN(i_prev) = ino_next;
- err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
- mutex_unlock(&sbi->s_orphan_lock);
- }
- if (err)
- goto out_brelse;
- NEXT_ORPHAN(inode) = 0;
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-out_err:
- ext4_std_error(inode->i_sb, err);
- return err;
-
-out_brelse:
- brelse(iloc.bh);
- goto out_err;
-}
-
static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
{
int retval;
@@ -3148,12 +3178,15 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_size = 0;
ext4_orphan_add(handle, inode);
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- ext4_dec_count(handle, dir);
+ retval = ext4_mark_inode_dirty(handle, inode);
+ if (retval)
+ goto end_rmdir;
+ ext4_dec_count(dir);
ext4_update_dx_flag(dir);
- ext4_mark_inode_dirty(handle, dir);
+ ext4_fc_track_unlink(handle, dentry);
+ retval = ext4_mark_inode_dirty(handle, dir);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
@@ -3171,68 +3204,94 @@ end_rmdir:
return retval;
}
-static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
+ struct inode *inode)
{
- int retval;
- struct inode *inode;
+ int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
- handle_t *handle = NULL;
-
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
- return -EIO;
+ int skip_remove_dentry = 0;
- trace_ext4_unlink_enter(dir, dentry);
- /* Initialize quotas before so that eventual writes go
- * in separate transaction */
- retval = dquot_initialize(dir);
- if (retval)
- return retval;
- retval = dquot_initialize(d_inode(dentry));
- if (retval)
- return retval;
-
- retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
- if (!bh)
- goto end_unlink;
- inode = d_inode(dentry);
-
- retval = -EFSCORRUPTED;
- if (le32_to_cpu(de->inode) != inode->i_ino)
- goto end_unlink;
+ if (!bh)
+ return -ENOENT;
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle)) {
- retval = PTR_ERR(handle);
- handle = NULL;
- goto end_unlink;
+ if (le32_to_cpu(de->inode) != inode->i_ino) {
+ /*
+ * It's okay if we find dont find dentry which matches
+ * the inode. That's because it might have gotten
+ * renamed to a different inode number
+ */
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ skip_remove_dentry = 1;
+ else
+ goto out;
}
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- retval = ext4_delete_entry(handle, dir, de, bh);
- if (retval)
- goto end_unlink;
- dir->i_ctime = dir->i_mtime = current_time(dir);
- ext4_update_dx_flag(dir);
- ext4_mark_inode_dirty(handle, dir);
+ if (!skip_remove_dentry) {
+ retval = ext4_delete_entry(handle, dir, de, bh);
+ if (retval)
+ goto out;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ ext4_update_dx_flag(dir);
+ retval = ext4_mark_inode_dirty(handle, dir);
+ if (retval)
+ goto out;
+ } else {
+ retval = 0;
+ }
if (inode->i_nlink == 0)
ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
- dentry->d_name.len, dentry->d_name.name);
+ d_name->len, d_name->name);
else
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ retval = ext4_mark_inode_dirty(handle, inode);
+
+out:
+ brelse(bh);
+ return retval;
+}
-#ifdef CONFIG_UNICODE
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+{
+ handle_t *handle;
+ int retval;
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+ return -EIO;
+
+ trace_ext4_unlink_enter(dir, dentry);
+ /*
+ * Initialize quotas before so that eventual writes go
+ * in separate transaction
+ */
+ retval = dquot_initialize(dir);
+ if (retval)
+ goto out_trace;
+ retval = dquot_initialize(d_inode(dentry));
+ if (retval)
+ goto out_trace;
+
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ goto out_trace;
+ }
+
+ retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry));
+ if (!retval)
+ ext4_fc_track_unlink(handle, dentry);
+#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
@@ -3242,16 +3301,41 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
-
-end_unlink:
- brelse(bh);
if (handle)
ext4_journal_stop(handle);
+
+out_trace:
trace_ext4_unlink_exit(dentry, retval);
return retval;
}
-static int ext4_symlink(struct inode *dir,
+static int ext4_init_symlink_block(handle_t *handle, struct inode *inode,
+ struct fscrypt_str *disk_link)
+{
+ struct buffer_head *bh;
+ char *kaddr;
+ int err = 0;
+
+ bh = ext4_bread(handle, inode, 0, EXT4_GET_BLOCKS_CREATE);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE);
+ if (err)
+ goto out;
+
+ kaddr = (char *)bh->b_data;
+ memcpy(kaddr, disk_link->name, disk_link->len);
+ inode->i_size = disk_link->len - 1;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+out:
+ brelse(bh);
+ return err;
+}
+
+static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, const char *symname)
{
handle_t *handle;
@@ -3259,6 +3343,7 @@ static int ext4_symlink(struct inode *dir,
int err, len = strlen(symname);
int credits;
struct fscrypt_str disk_link;
+ int retries = 0;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
return -EIO;
@@ -3272,34 +3357,24 @@ static int ext4_symlink(struct inode *dir,
if (err)
return err;
- if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
- /*
- * For non-fast symlinks, we just allocate inode and put it on
- * orphan list in the first transaction => we need bitmap,
- * group descriptor, sb, inode block, quota blocks, and
- * possibly selinux xattr blocks.
- */
- credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
- EXT4_XATTR_TRANS_BLOCKS;
- } else {
- /*
- * Fast symlink. We have to add entry to directory
- * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
- * allocate new inode (bitmap, group descriptor, inode block,
- * quota blocks, sb is already counted in previous macros).
- */
- credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
- }
-
- inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
+ /*
+ * EXT4_INDEX_EXTRA_TRANS_BLOCKS for addition of entry into the
+ * directory. +3 for inode, inode bitmap, group descriptor allocation.
+ * EXT4_DATA_TRANS_BLOCKS for the data block allocation and
+ * modification.
+ */
+ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
+retry:
+ inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFLNK|S_IRWXUGO,
&dentry->d_name, 0, NULL,
EXT4_HT_DIR, credits);
handle = ext4_journal_current_handle();
if (IS_ERR(inode)) {
if (handle)
ext4_journal_stop(handle);
- return PTR_ERR(inode);
+ err = PTR_ERR(inode);
+ goto out_retry;
}
if (IS_ENCRYPTED(inode)) {
@@ -3307,102 +3382,53 @@ static int ext4_symlink(struct inode *dir,
if (err)
goto err_drop_inode;
inode->i_op = &ext4_encrypted_symlink_inode_operations;
+ } else {
+ if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
+ inode->i_op = &ext4_symlink_inode_operations;
+ } else {
+ inode->i_op = &ext4_fast_symlink_inode_operations;
+ inode->i_link = (char *)&EXT4_I(inode)->i_data;
+ }
}
if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
- if (!IS_ENCRYPTED(inode))
- inode->i_op = &ext4_symlink_inode_operations;
- inode_nohighmem(inode);
- ext4_set_aops(inode);
- /*
- * We cannot call page_symlink() with transaction started
- * because it calls into ext4_write_begin() which can wait
- * for transaction commit if we are running out of space
- * and thus we deadlock. So we have to stop transaction now
- * and restart it when symlink contents is written.
- *
- * To keep fs consistent in case of crash, we have to put inode
- * to orphan list in the mean time.
- */
- drop_nlink(inode);
- err = ext4_orphan_add(handle, inode);
- ext4_journal_stop(handle);
- handle = NULL;
- if (err)
- goto err_drop_inode;
- err = __page_symlink(inode, disk_link.name, disk_link.len, 1);
- if (err)
- goto err_drop_inode;
- /*
- * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
- * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
- */
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- handle = NULL;
- goto err_drop_inode;
- }
- set_nlink(inode, 1);
- err = ext4_orphan_del(handle, inode);
+ /* alloc symlink block and fill it */
+ err = ext4_init_symlink_block(handle, inode, &disk_link);
if (err)
goto err_drop_inode;
} else {
/* clear the extent format for fast symlink */
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
- if (!IS_ENCRYPTED(inode)) {
- inode->i_op = &ext4_fast_symlink_inode_operations;
- inode->i_link = (char *)&EXT4_I(inode)->i_data;
- }
memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
disk_link.len);
inode->i_size = disk_link.len - 1;
+ EXT4_I(inode)->i_disksize = inode->i_size;
}
- EXT4_I(inode)->i_disksize = inode->i_size;
err = ext4_add_nondir(handle, dentry, &inode);
if (handle)
ext4_journal_stop(handle);
- if (inode)
- iput(inode);
- goto out_free_encrypted_link;
+ iput(inode);
+ goto out_retry;
err_drop_inode:
- if (handle)
- ext4_journal_stop(handle);
clear_nlink(inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
+ if (handle)
+ ext4_journal_stop(handle);
iput(inode);
-out_free_encrypted_link:
+out_retry:
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
if (disk_link.name != (unsigned char *)symname)
kfree(disk_link.name);
return err;
}
-static int ext4_link(struct dentry *old_dentry,
- struct inode *dir, struct dentry *dentry)
+int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry)
{
handle_t *handle;
- struct inode *inode = d_inode(old_dentry);
int err, retries = 0;
-
- if (inode->i_nlink >= EXT4_LINK_MAX)
- return -EMLINK;
-
- err = fscrypt_prepare_link(old_dentry, dir, dentry);
- if (err)
- return err;
-
- if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
- (!projid_eq(EXT4_I(dir)->i_projid,
- EXT4_I(old_dentry->d_inode)->i_projid)))
- return -EXDEV;
-
- err = dquot_initialize(dir);
- if (err)
- return err;
-
retry:
handle = ext4_journal_start(dir, EXT4_HT_DIR,
(EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
@@ -3414,18 +3440,19 @@ retry:
ext4_handle_sync(handle);
inode->i_ctime = current_time(inode);
- ext4_inc_count(handle, inode);
+ ext4_inc_count(inode);
ihold(inode);
err = ext4_add_entry(handle, dentry, inode);
if (!err) {
- ext4_mark_inode_dirty(handle, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
/* this can happen only for tmpfile being
* linked the first time
*/
if (inode->i_nlink == 1)
ext4_orphan_del(handle, inode);
d_instantiate(dentry, inode);
+ ext4_fc_track_link(handle, dentry);
} else {
drop_nlink(inode);
iput(inode);
@@ -3436,6 +3463,29 @@ retry:
return err;
}
+static int ext4_link(struct dentry *old_dentry,
+ struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(old_dentry);
+ int err;
+
+ if (inode->i_nlink >= EXT4_LINK_MAX)
+ return -EMLINK;
+
+ err = fscrypt_prepare_link(old_dentry, dir, dentry);
+ if (err)
+ return err;
+
+ if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
+ (!projid_eq(EXT4_I(dir)->i_projid,
+ EXT4_I(old_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
+ return __ext4_link(dir, inode, dentry);
+}
/*
* Try to find buffer head where contains the parent block.
@@ -3451,6 +3501,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
struct buffer_head *bh;
if (!ext4_has_inline_data(inode)) {
+ struct ext4_dir_entry_2 *de;
+ unsigned int offset;
+
/* The first directory block must not be a hole, so
* treat it as DIRENT_HTREE
*/
@@ -3459,9 +3512,30 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
*retval = PTR_ERR(bh);
return NULL;
}
- *parent_de = ext4_next_entry(
- (struct ext4_dir_entry_2 *)bh->b_data,
- inode->i_sb->s_blocksize);
+
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
+ bh->b_size, 0) ||
+ le32_to_cpu(de->inode) != inode->i_ino ||
+ strcmp(".", de->name)) {
+ EXT4_ERROR_INODE(inode, "directory missing '.'");
+ brelse(bh);
+ *retval = -EFSCORRUPTED;
+ return NULL;
+ }
+ offset = ext4_rec_len_from_disk(de->rec_len,
+ inode->i_sb->s_blocksize);
+ de = ext4_next_entry(de, inode->i_sb->s_blocksize);
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
+ bh->b_size, offset) ||
+ le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ EXT4_ERROR_INODE(inode, "directory missing '..'");
+ brelse(bh);
+ *retval = -EFSCORRUPTED;
+ return NULL;
+ }
+ *parent_de = de;
+
return bh;
}
@@ -3499,7 +3573,8 @@ static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
return -EFSCORRUPTED;
BUFFER_TRACE(ent->dir_bh, "get_write_access");
- return ext4_journal_get_write_access(handle, ent->dir_bh);
+ return ext4_journal_get_write_access(handle, ent->dir->i_sb,
+ ent->dir_bh, EXT4_JTR_NONE);
}
static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
@@ -3531,10 +3606,11 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
unsigned ino, unsigned file_type)
{
- int retval;
+ int retval, retval2;
BUFFER_TRACE(ent->bh, "get write access");
- retval = ext4_journal_get_write_access(handle, ent->bh);
+ retval = ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->bh,
+ EXT4_JTR_NONE);
if (retval)
return retval;
ent->de->inode = cpu_to_le32(ino);
@@ -3543,19 +3619,41 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
inode_inc_iversion(ent->dir);
ent->dir->i_ctime = ent->dir->i_mtime =
current_time(ent->dir);
- ext4_mark_inode_dirty(handle, ent->dir);
+ retval = ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
- retval = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh);
- if (unlikely(retval)) {
- ext4_std_error(ent->dir->i_sb, retval);
- return retval;
+ retval2 = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh);
+ if (unlikely(retval2)) {
+ ext4_std_error(ent->dir->i_sb, retval2);
+ return retval2;
}
}
- brelse(ent->bh);
- ent->bh = NULL;
+ return retval;
+}
- return 0;
+static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
+ unsigned ino, unsigned file_type)
+{
+ struct ext4_renament old = *ent;
+ int retval = 0;
+
+ /*
+ * old->de could have moved from under us during make indexed dir,
+ * so the old->de may no longer valid and need to find it again
+ * before reset old inode info.
+ */
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+ if (IS_ERR(old.bh))
+ retval = PTR_ERR(old.bh);
+ if (!old.bh)
+ retval = -ENOENT;
+ if (retval) {
+ ext4_std_error(old.dir->i_sb, retval);
+ return;
+ }
+
+ ext4_setent(handle, &old, ino, file_type);
+ brelse(old.bh);
}
static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
@@ -3611,14 +3709,15 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
{
if (ent->dir_nlink_delta) {
if (ent->dir_nlink_delta == -1)
- ext4_dec_count(handle, ent->dir);
+ ext4_dec_count(ent->dir);
else
- ext4_inc_count(handle, ent->dir);
+ ext4_inc_count(ent->dir);
ext4_mark_inode_dirty(handle, ent->dir);
}
}
-static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent,
+static struct inode *ext4_whiteout_for_rename(struct user_namespace *mnt_userns,
+ struct ext4_renament *ent,
int credits, handle_t **h)
{
struct inode *wh;
@@ -3632,7 +3731,8 @@ static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent,
credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) +
EXT4_XATTR_TRANS_BLOCKS + 4);
retry:
- wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE,
+ wh = ext4_new_inode_start_handle(mnt_userns, ent->dir,
+ S_IFCHR | WHITEOUT_MODE,
&ent->dentry->d_name, 0, NULL,
EXT4_HT_DIR, credits);
@@ -3659,9 +3759,9 @@ retry:
* while new_{dentry,inode) refers to the destination dentry/inode
* This comes from rename(const char *oldpath, const char *newpath)
*/
-static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
handle_t *handle = NULL;
struct ext4_renament old = {
@@ -3717,14 +3817,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
retval = -ENOENT;
if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
- goto end_rename;
+ goto release_bh;
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
new.bh = NULL;
- goto end_rename;
+ goto release_bh;
}
if (new.bh) {
if (!new.inode) {
@@ -3741,18 +3841,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
- handle = NULL;
- goto end_rename;
+ goto release_bh;
}
} else {
- whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
+ whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle);
if (IS_ERR(whiteout)) {
retval = PTR_ERR(whiteout);
- whiteout = NULL;
- goto end_rename;
+ goto release_bh;
}
}
+ old_file_type = old.de->file_type;
if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
ext4_handle_sync(handle);
@@ -3780,7 +3879,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
force_reread = (new.dir->i_ino == old.dir->i_ino &&
ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
- old_file_type = old.de->file_type;
if (whiteout) {
/*
* Do this before adding a new entry, so the old entry is sure
@@ -3790,7 +3888,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
EXT4_FT_CHRDEV);
if (retval)
goto end_rename;
- ext4_mark_inode_dirty(handle, whiteout);
+ retval = ext4_mark_inode_dirty(handle, whiteout);
+ if (unlikely(retval))
+ goto end_rename;
+
}
if (!new.bh) {
retval = ext4_add_entry(handle, new.dentry, old.inode);
@@ -3811,7 +3912,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
* rename.
*/
old.inode->i_ctime = current_time(old.inode);
- ext4_mark_inode_dirty(handle, old.inode);
+ retval = ext4_mark_inode_dirty(handle, old.inode);
+ if (unlikely(retval))
+ goto end_rename;
if (!whiteout) {
/*
@@ -3821,7 +3924,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (new.inode) {
- ext4_dec_count(handle, new.inode);
+ ext4_dec_count(new.inode);
new.inode->i_ctime = current_time(new.inode);
}
old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir);
@@ -3831,38 +3934,75 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (retval)
goto end_rename;
- ext4_dec_count(handle, old.dir);
+ ext4_dec_count(old.dir);
if (new.inode) {
/* checked ext4_empty_dir above, can't have another
* parent, ext4_dec_count() won't work for many-linked
* dirs */
clear_nlink(new.inode);
} else {
- ext4_inc_count(handle, new.dir);
+ ext4_inc_count(new.dir);
ext4_update_dx_flag(new.dir);
- ext4_mark_inode_dirty(handle, new.dir);
+ retval = ext4_mark_inode_dirty(handle, new.dir);
+ if (unlikely(retval))
+ goto end_rename;
+ }
+ }
+ retval = ext4_mark_inode_dirty(handle, old.dir);
+ if (unlikely(retval))
+ goto end_rename;
+
+ if (S_ISDIR(old.inode->i_mode)) {
+ /*
+ * We disable fast commits here that's because the
+ * replay code is not yet capable of changing dot dot
+ * dirents in directories.
+ */
+ ext4_fc_mark_ineligible(old.inode->i_sb,
+ EXT4_FC_REASON_RENAME_DIR, handle);
+ } else {
+ struct super_block *sb = old.inode->i_sb;
+
+ if (new.inode)
+ ext4_fc_track_unlink(handle, new.dentry);
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) {
+ __ext4_fc_track_link(handle, old.inode, new.dentry);
+ __ext4_fc_track_unlink(handle, old.inode, old.dentry);
+ if (whiteout)
+ __ext4_fc_track_create(handle, whiteout,
+ old.dentry);
}
}
- ext4_mark_inode_dirty(handle, old.dir);
+
if (new.inode) {
- ext4_mark_inode_dirty(handle, new.inode);
+ retval = ext4_mark_inode_dirty(handle, new.inode);
+ if (unlikely(retval))
+ goto end_rename;
if (!new.inode->i_nlink)
ext4_orphan_add(handle, new.inode);
}
retval = 0;
end_rename:
- brelse(old.dir_bh);
- brelse(old.bh);
- brelse(new.bh);
if (whiteout) {
- if (retval)
+ if (retval) {
+ ext4_resetent(handle, &old,
+ old.inode->i_ino, old_file_type);
drop_nlink(whiteout);
+ ext4_orphan_add(handle, whiteout);
+ }
unlock_new_inode(whiteout);
+ ext4_journal_stop(handle);
iput(whiteout);
- }
- if (handle)
+ } else {
ext4_journal_stop(handle);
+ }
+release_bh:
+ brelse(old.dir_bh);
+ brelse(old.bh);
+ brelse(new.bh);
return retval;
}
@@ -3979,9 +4119,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
ctime = current_time(old.inode);
old.inode->i_ctime = ctime;
new.inode->i_ctime = ctime;
- ext4_mark_inode_dirty(handle, old.inode);
- ext4_mark_inode_dirty(handle, new.inode);
-
+ retval = ext4_mark_inode_dirty(handle, old.inode);
+ if (unlikely(retval))
+ goto end_rename;
+ retval = ext4_mark_inode_dirty(handle, new.inode);
+ if (unlikely(retval))
+ goto end_rename;
+ ext4_fc_mark_ineligible(new.inode->i_sb,
+ EXT4_FC_REASON_CROSS_RENAME, handle);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
if (retval)
@@ -4006,7 +4151,8 @@ end_rename:
return retval;
}
-static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
+static int ext4_rename2(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
@@ -4028,7 +4174,7 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
new_dir, new_dentry);
}
- return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return ext4_rename(mnt_userns, old_dir, old_dentry, new_dir, new_dentry, flags);
}
/*
@@ -4051,6 +4197,8 @@ const struct inode_operations ext4_dir_inode_operations = {
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
+ .fileattr_get = ext4_fileattr_get,
+ .fileattr_set = ext4_fileattr_set,
};
const struct inode_operations ext4_special_inode_operations = {
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
new file mode 100644
index 000000000000..69a9cf9137a6
--- /dev/null
+++ b/fs/ext4/orphan.c
@@ -0,0 +1,652 @@
+/*
+ * Ext4 orphan inode handling
+ */
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
+{
+ int i, j, start;
+ struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
+ int ret = 0;
+ bool found = false;
+ __le32 *bdata;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
+ int looped = 0;
+
+ /*
+ * Find block with free orphan entry. Use CPU number for a naive hash
+ * for a search start in the orphan file
+ */
+ start = raw_smp_processor_id()*13 % oi->of_blocks;
+ i = start;
+ do {
+ if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries)
+ >= 0) {
+ found = true;
+ break;
+ }
+ if (++i >= oi->of_blocks)
+ i = 0;
+ } while (i != start);
+
+ if (!found) {
+ /*
+ * For now we don't grow or shrink orphan file. We just use
+ * whatever was allocated at mke2fs time. The additional
+ * credits we would have to reserve for each orphan inode
+ * operation just don't seem worth it.
+ */
+ return -ENOSPC;
+ }
+
+ ret = ext4_journal_get_write_access(handle, inode->i_sb,
+ oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE);
+ if (ret) {
+ atomic_inc(&oi->of_binfo[i].ob_free_entries);
+ return ret;
+ }
+
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ /* Find empty slot in a block */
+ j = 0;
+ do {
+ if (looped) {
+ /*
+ * Did we walk through the block several times without
+ * finding free entry? It is theoretically possible
+ * if entries get constantly allocated and freed or
+ * if the block is corrupted. Avoid indefinite looping
+ * and bail. We'll use orphan list instead.
+ */
+ if (looped > 3) {
+ atomic_inc(&oi->of_binfo[i].ob_free_entries);
+ return -ENOSPC;
+ }
+ cond_resched();
+ }
+ while (bdata[j]) {
+ if (++j >= inodes_per_ob) {
+ j = 0;
+ looped++;
+ }
+ }
+ } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) !=
+ (__le32)0);
+
+ EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
+ ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+
+ return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh);
+}
+
+/*
+ * ext4_orphan_add() links an unlinked or truncated inode into a list of
+ * such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted, or in case the inode truncate spans multiple
+ * transactions and the last transaction is not recovered after a crash.
+ *
+ * At filesystem recovery time, we walk this list deleting unlinked
+ * inodes and truncating linked inodes in ext4_orphan_cleanup().
+ *
+ * Orphan list manipulation functions must be called under i_rwsem unless
+ * we are just creating the inode or deleting it.
+ */
+int ext4_orphan_add(handle_t *handle, struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_iloc iloc;
+ int err = 0, rc;
+ bool dirty = false;
+
+ if (!sbi->s_journal || is_bad_inode(inode))
+ return 0;
+
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !inode_is_locked(inode));
+ /*
+ * Inode orphaned in orphan file or in orphan list?
+ */
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
+ !list_empty(&EXT4_I(inode)->i_orphan))
+ return 0;
+
+ /*
+ * Orphan handling is only valid for files with data blocks
+ * being truncated, or files being unlinked. Note that we either
+ * hold i_rwsem, or the inode can not be referenced from outside,
+ * so i_nlink should not be bumped due to race
+ */
+ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+ if (sbi->s_orphan_info.of_blocks) {
+ err = ext4_orphan_file_add(handle, inode);
+ /*
+ * Fallback to normal orphan list of orphan file is
+ * out of space
+ */
+ if (err != -ENOSPC)
+ return err;
+ }
+
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out;
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out;
+
+ mutex_lock(&sbi->s_orphan_lock);
+ /*
+ * Due to previous errors inode may be already a part of on-disk
+ * orphan list. If so skip on-disk list modification.
+ */
+ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
+ (le32_to_cpu(sbi->s_es->s_inodes_count))) {
+ /* Insert this inode at the head of the on-disk orphan list */
+ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
+ lock_buffer(sbi->s_sbh);
+ sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
+ dirty = true;
+ }
+ list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+
+ if (dirty) {
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+ if (err) {
+ /*
+ * We have to remove inode from in-memory list if
+ * addition to on disk orphan list failed. Stray orphan
+ * list entries can cause panics at unmount time.
+ */
+ mutex_lock(&sbi->s_orphan_lock);
+ list_del_init(&EXT4_I(inode)->i_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ } else
+ brelse(iloc.bh);
+
+ ext4_debug("superblock will point to %lu\n", inode->i_ino);
+ ext4_debug("orphan inode %lu will point to %d\n",
+ inode->i_ino, NEXT_ORPHAN(inode));
+out:
+ ext4_std_error(sb, err);
+ return err;
+}
+
+static int ext4_orphan_file_del(handle_t *handle, struct inode *inode)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
+ __le32 *bdata;
+ int blk, off;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
+ int ret = 0;
+
+ if (!handle)
+ goto out;
+ blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob;
+ off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob;
+ if (WARN_ON_ONCE(blk >= oi->of_blocks))
+ goto out;
+
+ ret = ext4_journal_get_write_access(handle, inode->i_sb,
+ oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE);
+ if (ret)
+ goto out;
+
+ bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data);
+ bdata[off] = 0;
+ atomic_inc(&oi->of_binfo[blk].ob_free_entries);
+ ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh);
+out:
+ ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+ INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan);
+
+ return ret;
+}
+
+/*
+ * ext4_orphan_del() removes an unlinked or truncated inode from the list
+ * of such inodes stored on disk, because it is finally being cleaned up.
+ */
+int ext4_orphan_del(handle_t *handle, struct inode *inode)
+{
+ struct list_head *prev;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 ino_next;
+ struct ext4_iloc iloc;
+ int err = 0;
+
+ if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
+ return 0;
+
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !inode_is_locked(inode));
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
+ return ext4_orphan_file_del(handle, inode);
+
+ /* Do this quick check before taking global s_orphan_lock. */
+ if (list_empty(&ei->i_orphan))
+ return 0;
+
+ if (handle) {
+ /* Grab inode buffer early before taking global s_orphan_lock */
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ }
+
+ mutex_lock(&sbi->s_orphan_lock);
+ ext4_debug("remove inode %lu from orphan list\n", inode->i_ino);
+
+ prev = ei->i_orphan.prev;
+ list_del_init(&ei->i_orphan);
+
+ /* If we're on an error path, we may not have a valid
+ * transaction handle with which to update the orphan list on
+ * disk, but we still need to remove the inode from the linked
+ * list in memory. */
+ if (!handle || err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_err;
+ }
+
+ ino_next = NEXT_ORPHAN(inode);
+ if (prev == &sbi->s_orphan) {
+ ext4_debug("superblock will point to %u\n", ino_next);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ sbi->s_sbh, EXT4_JTR_NONE);
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_brelse;
+ }
+ lock_buffer(sbi->s_sbh);
+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+ ext4_superblock_csum_set(inode->i_sb);
+ unlock_buffer(sbi->s_sbh);
+ mutex_unlock(&sbi->s_orphan_lock);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ } else {
+ struct ext4_iloc iloc2;
+ struct inode *i_prev =
+ &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
+
+ ext4_debug("orphan inode %lu will point to %u\n",
+ i_prev->i_ino, ino_next);
+ err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_brelse;
+ }
+ NEXT_ORPHAN(i_prev) = ino_next;
+ err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ if (err)
+ goto out_brelse;
+ NEXT_ORPHAN(inode) = 0;
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+out_err:
+ ext4_std_error(inode->i_sb, err);
+ return err;
+
+out_brelse:
+ brelse(iloc.bh);
+ goto out_err;
+}
+
+#ifdef CONFIG_QUOTA
+static int ext4_quota_on_mount(struct super_block *sb, int type)
+{
+ return dquot_quota_on_mount(sb,
+ rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type],
+ lockdep_is_held(&sb->s_umount)),
+ EXT4_SB(sb)->s_jquota_fmt, type);
+}
+#endif
+
+static void ext4_process_orphan(struct inode *inode,
+ int *nr_truncates, int *nr_orphans)
+{
+ struct super_block *sb = inode->i_sb;
+ int ret;
+
+ dquot_initialize(inode);
+ if (inode->i_nlink) {
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
+ __func__, inode->i_ino, inode->i_size);
+ ext4_debug("truncating inode %lu to %lld bytes\n",
+ inode->i_ino, inode->i_size);
+ inode_lock(inode);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ret = ext4_truncate(inode);
+ if (ret) {
+ /*
+ * We need to clean up the in-core orphan list
+ * manually if ext4_truncate() failed to get a
+ * transaction handle.
+ */
+ ext4_orphan_del(NULL, inode);
+ ext4_std_error(inode->i_sb, ret);
+ }
+ inode_unlock(inode);
+ (*nr_truncates)++;
+ } else {
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
+ __func__, inode->i_ino);
+ ext4_debug("deleting unreferenced inode %lu\n",
+ inode->i_ino);
+ (*nr_orphans)++;
+ }
+ iput(inode); /* The delete magic happens here! */
+}
+
+/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were deleted from all directories, but held open by
+ * a process at the time of a crash. We walk the list and try to delete these
+ * inodes at recovery time (only with a read-write filesystem).
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode. The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext4_free_inode(). The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ */
+void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
+{
+ unsigned int s_flags = sb->s_flags;
+ int nr_orphans = 0, nr_truncates = 0;
+ struct inode *inode;
+ int i, j;
+#ifdef CONFIG_QUOTA
+ int quota_update = 0;
+#endif
+ __le32 *bdata;
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+
+ if (!es->s_last_orphan && !oi->of_blocks) {
+ ext4_debug("no orphan inodes to clean up\n");
+ return;
+ }
+
+ if (bdev_read_only(sb->s_bdev)) {
+ ext4_msg(sb, KERN_ERR, "write access "
+ "unavailable, skipping orphan cleanup");
+ return;
+ }
+
+ /* Check if feature set would not allow a r/w mount */
+ if (!ext4_feature_set_ok(sb, 0)) {
+ ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+ "unknown ROCOMPAT features");
+ return;
+ }
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ /* don't clear list on RO mount w/ errors */
+ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
+ ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
+ "clearing orphan list.\n");
+ es->s_last_orphan = 0;
+ }
+ ext4_debug("Skipping orphan recovery on fs with errors.\n");
+ return;
+ }
+
+ if (s_flags & SB_RDONLY) {
+ ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
+ sb->s_flags &= ~SB_RDONLY;
+ }
+#ifdef CONFIG_QUOTA
+ /*
+ * Turn on quotas which were not enabled for read-only mounts if
+ * filesystem has quota feature, so that they are updated correctly.
+ */
+ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
+ int ret = ext4_enable_quotas(sb);
+
+ if (!ret)
+ quota_update = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on quotas: error %d", ret);
+ }
+
+ /* Turn on journaled quotas used for old sytle */
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (EXT4_SB(sb)->s_qf_names[i]) {
+ int ret = ext4_quota_on_mount(sb, i);
+
+ if (!ret)
+ quota_update = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on journaled "
+ "quota: type %d: error %d", i, ret);
+ }
+ }
+#endif
+
+ while (es->s_last_orphan) {
+ /*
+ * We may have encountered an error during cleanup; if
+ * so, skip the rest.
+ */
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ ext4_debug("Skipping orphan recovery on fs with errors.\n");
+ es->s_last_orphan = 0;
+ break;
+ }
+
+ inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
+ if (IS_ERR(inode)) {
+ es->s_last_orphan = 0;
+ break;
+ }
+
+ list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+ ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
+ }
+
+ for (i = 0; i < oi->of_blocks; i++) {
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ for (j = 0; j < inodes_per_ob; j++) {
+ if (!bdata[j])
+ continue;
+ inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j]));
+ if (IS_ERR(inode))
+ continue;
+ ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+ EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
+ ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
+ }
+ }
+
+#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
+
+ if (nr_orphans)
+ ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+ PLURAL(nr_orphans));
+ if (nr_truncates)
+ ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+ PLURAL(nr_truncates));
+#ifdef CONFIG_QUOTA
+ /* Turn off quotas if they were enabled for orphan cleanup */
+ if (quota_update) {
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (sb_dqopt(sb)->files[i])
+ dquot_quota_off(sb, i);
+ }
+ }
+#endif
+ sb->s_flags = s_flags; /* Restore SB_RDONLY status */
+}
+
+void ext4_release_orphan_info(struct super_block *sb)
+{
+ int i;
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+
+ if (!oi->of_blocks)
+ return;
+ for (i = 0; i < oi->of_blocks; i++)
+ brelse(oi->of_binfo[i].ob_bh);
+ kfree(oi->of_binfo);
+}
+
+static struct ext4_orphan_block_tail *ext4_orphan_block_tail(
+ struct super_block *sb,
+ struct buffer_head *bh)
+{
+ return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize -
+ sizeof(struct ext4_orphan_block_tail));
+}
+
+static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ __u32 calculated;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct ext4_orphan_block_tail *ot;
+ __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
+
+ if (!ext4_has_metadata_csum(sb))
+ return 1;
+
+ ot = ext4_orphan_block_tail(sb, bh);
+ calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
+ (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
+ calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data,
+ inodes_per_ob * sizeof(__u32));
+ return le32_to_cpu(ot->ob_checksum) == calculated;
+}
+
+/* This gets called only when checksumming is enabled */
+void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct super_block *sb = EXT4_TRIGGER(triggers)->sb;
+ __u32 csum;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct ext4_orphan_block_tail *ot;
+ __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
+
+ csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
+ (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
+ csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data,
+ inodes_per_ob * sizeof(__u32));
+ ot = ext4_orphan_block_tail(sb, bh);
+ ot->ob_checksum = cpu_to_le32(csum);
+}
+
+int ext4_init_orphan_info(struct super_block *sb)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct inode *inode;
+ int i, j;
+ int ret;
+ int free;
+ __le32 *bdata;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_block_tail *ot;
+ ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum);
+
+ if (!ext4_has_feature_orphan_file(sb))
+ return 0;
+
+ inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL);
+ if (IS_ERR(inode)) {
+ ext4_msg(sb, KERN_ERR, "get orphan inode failed");
+ return PTR_ERR(inode);
+ }
+ oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
+ oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
+ oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block),
+ GFP_KERNEL);
+ if (!oi->of_binfo) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+ for (i = 0; i < oi->of_blocks; i++) {
+ oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0);
+ if (IS_ERR(oi->of_binfo[i].ob_bh)) {
+ ret = PTR_ERR(oi->of_binfo[i].ob_bh);
+ goto out_free;
+ }
+ if (!oi->of_binfo[i].ob_bh) {
+ ret = -EIO;
+ goto out_free;
+ }
+ ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh);
+ if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) {
+ ext4_error(sb, "orphan file block %d: bad magic", i);
+ ret = -EIO;
+ goto out_free;
+ }
+ if (!ext4_orphan_file_block_csum_verify(sb,
+ oi->of_binfo[i].ob_bh)) {
+ ext4_error(sb, "orphan file block %d: bad checksum", i);
+ ret = -EIO;
+ goto out_free;
+ }
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ free = 0;
+ for (j = 0; j < inodes_per_ob; j++)
+ if (bdata[j] == 0)
+ free++;
+ atomic_set(&oi->of_binfo[i].ob_free_entries, free);
+ }
+ iput(inode);
+ return 0;
+out_free:
+ for (i--; i >= 0; i--)
+ brelse(oi->of_binfo[i].ob_bh);
+ kfree(oi->of_binfo);
+out_put:
+ iput(inode);
+ return ret;
+}
+
+int ext4_orphan_file_empty(struct super_block *sb)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ int i;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+
+ if (!ext4_has_feature_orphan_file(sb))
+ return 1;
+ for (i = 0; i < oi->of_blocks; i++)
+ if (atomic_read(&oi->of_binfo[i].ob_free_entries) !=
+ inodes_per_ob)
+ return 0;
+ return 1;
+}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 68b39e75446a..97fa7b4c645f 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -24,7 +24,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/backing-dev.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -111,9 +111,6 @@ static void ext4_finish_bio(struct bio *bio)
unsigned under_io = 0;
unsigned long flags;
- if (!page)
- continue;
-
if (fscrypt_is_bounce_page(page)) {
bounce_page = page;
page = fscrypt_pagecache_page(bounce_page);
@@ -125,11 +122,10 @@ static void ext4_finish_bio(struct bio *bio)
}
bh = head = page_buffers(page);
/*
- * We check all buffers in the page under BH_Uptodate_Lock
+ * We check all buffers in the page under b_uptodate_lock
* to avoid races with other end io clearing async_write flags
*/
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ spin_lock_irqsave(&head->b_uptodate_lock, flags);
do {
if (bh_offset(bh) < bio_start ||
bh_offset(bh) + bh->b_size > bio_end) {
@@ -138,11 +134,12 @@ static void ext4_finish_bio(struct bio *bio)
continue;
}
clear_buffer_async_write(bh);
- if (bio->bi_status)
+ if (bio->bi_status) {
+ set_buffer_write_io_error(bh);
buffer_io_error(bh);
+ }
} while ((bh = bh->b_this_page) != head);
- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
if (!under_io) {
fscrypt_free_bounce_page(bounce_page);
end_page_writeback(page);
@@ -284,14 +281,14 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
io_end->inode = inode;
INIT_LIST_HEAD(&io_end->list);
INIT_LIST_HEAD(&io_end->list_vec);
- atomic_set(&io_end->count, 1);
+ refcount_set(&io_end->count, 1);
}
return io_end;
}
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
- if (atomic_dec_and_test(&io_end->count)) {
+ if (refcount_dec_and_test(&io_end->count)) {
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
list_empty(&io_end->list_vec)) {
ext4_release_io_end(io_end);
@@ -305,7 +302,7 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
{
int err = 0;
- if (atomic_dec_and_test(&io_end->count)) {
+ if (refcount_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
err = ext4_convert_unwritten_io_end_vec(io_end->handle,
io_end);
@@ -319,7 +316,7 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
- atomic_inc(&io_end->count);
+ refcount_inc(&io_end->count);
return io_end;
}
@@ -328,10 +325,9 @@ static void ext4_end_bio(struct bio *bio)
{
ext4_io_end_t *io_end = bio->bi_private;
sector_t bi_sector = bio->bi_iter.bi_sector;
- char b[BDEVNAME_SIZE];
- if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n",
- bio_devname(bio, b),
+ if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n",
+ bio->bi_bdev,
(long long) bio->bi_iter.bi_sector,
(unsigned) bio_sectors(bio),
bio->bi_status)) {
@@ -377,10 +373,8 @@ void ext4_io_submit(struct ext4_io_submit *io)
struct bio *bio = io->io_bio;
if (bio) {
- int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
- REQ_SYNC : 0;
- io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint;
- bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
+ if (io->io_wbc->sync_mode == WB_SYNC_ALL)
+ io->io_bio->bi_opf |= REQ_SYNC;
submit_bio(io->io_bio);
}
io->io_bio = NULL;
@@ -403,9 +397,9 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+ bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO);
+ fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
@@ -420,14 +414,13 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
{
int ret;
- if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+ if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
+ !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
submit_and_retry:
ext4_io_submit(io);
}
- if (io->io_bio == NULL) {
+ if (io->io_bio == NULL)
io_submit_init_bio(io, bh);
- io->io_bio->bi_write_hint = inode->i_write_hint;
- }
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
@@ -438,7 +431,6 @@ submit_and_retry:
int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
int len,
- struct writeback_control *wbc,
bool keep_towrite)
{
struct page *bounce_page = NULL;
@@ -448,6 +440,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
int ret = 0;
int nr_submitted = 0;
int nr_to_submit = 0;
+ struct writeback_control *wbc = io->io_wbc;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
@@ -472,7 +465,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
/*
* In the first loop we prepare and mark buffers to submit. We have to
* mark all buffers in the page before submitting so that
- * end_page_writeback() cannot be called from ext4_bio_end_io() when IO
+ * end_page_writeback() cannot be called from ext4_end_bio() when IO
* on the first buffer finishes and we are still working on submitting
* the second buffer.
*/
@@ -508,7 +501,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* (e.g. holes) to be unnecessarily encrypted, but this is rare and
* can't happen in the common case of blocksize == PAGE_SIZE.
*/
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) {
gfp_t gfp_flags = GFP_NOFS;
unsigned int enc_bytes = round_up(len, i_blocksize(inode));
@@ -526,12 +519,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
ret = PTR_ERR(bounce_page);
if (ret == -ENOMEM &&
(io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
- gfp_flags = GFP_NOFS;
+ gfp_t new_gfp_flags = GFP_NOFS;
if (io->io_bio)
ext4_io_submit(io);
else
- gfp_flags |= __GFP_NOFAIL;
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ new_gfp_flags |= __GFP_NOFAIL;
+ memalloc_retry_wait(gfp_flags);
+ gfp_flags = new_gfp_flags;
goto retry_encrypt;
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index c1769afbf799..3d21eae267fc 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -7,8 +7,8 @@
*
* This was originally taken from fs/mpage.c
*
- * The intent is the ext4_mpage_readpages() function here is intended
- * to replace mpage_readpages() in the general case, not just for
+ * The ext4_mpage_readpages() function here is intended to
+ * replace mpage_readahead() in the general case, not just for
* encrypted files. It has some limitations (see below), where it
* will fall back to read_block_full_page(), but these limitations
* should only be hit when page_size != block_size.
@@ -43,7 +43,6 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
-#include <linux/cleancache.h>
#include "ext4.h"
@@ -76,7 +75,7 @@ static void __read_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, iter_all) {
page = bv->bv_page;
- /* PG_error was set if any post_read step failed */
+ /* PG_error was set if verity failed. */
if (bio->bi_status || PageError(page)) {
ClearPageUptodate(page);
/* will re-read again later */
@@ -97,10 +96,12 @@ static void decrypt_work(struct work_struct *work)
{
struct bio_post_read_ctx *ctx =
container_of(work, struct bio_post_read_ctx, work);
+ struct bio *bio = ctx->bio;
- fscrypt_decrypt_bio(ctx->bio);
-
- bio_post_read_processing(ctx);
+ if (fscrypt_decrypt_bio(bio))
+ bio_post_read_processing(ctx);
+ else
+ __read_end_io(bio);
}
static void verity_work(struct work_struct *work)
@@ -110,7 +111,7 @@ static void verity_work(struct work_struct *work)
struct bio *bio = ctx->bio;
/*
- * fsverity_verify_bio() may call readpages() again, and although verity
+ * fsverity_verify_bio() may call readahead() again, and although verity
* will be disabled for that, decryption may still be needed, causing
* another bio_post_read_ctx to be allocated. So to guarantee that
* mempool_alloc() never deadlocks we must free the current ctx first.
@@ -140,7 +141,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
return;
}
ctx->cur_step++;
- /* fall-through */
+ fallthrough;
case STEP_VERITY:
if (ctx->enabled_steps & (1 << STEP_VERITY)) {
INIT_WORK(&ctx->work, verity_work);
@@ -148,7 +149,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
return;
}
ctx->cur_step++;
- /* fall-through */
+ fallthrough;
default:
__read_end_io(ctx->bio);
}
@@ -164,7 +165,7 @@ static bool bio_post_read_required(struct bio *bio)
*
* The mpage code never puts partial pages into a BIO (except for end-of-file).
* If a page does not map to a contiguous run of blocks then it simply falls
- * back to block_read_full_page().
+ * back to block_read_full_folio().
*
* Why is this? If a page's completion depends on a number of different BIOs
* which can complete in any order (or at the same time) then determining the
@@ -195,7 +196,7 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio,
{
unsigned int post_read_steps = 0;
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+ if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= 1 << STEP_DECRYPT;
if (ext4_need_verity(inode, first_idx))
@@ -221,17 +222,16 @@ static inline loff_t ext4_readpage_limit(struct inode *inode)
return i_size_read(inode);
}
-int ext4_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead)
+int ext4_mpage_readpages(struct inode *inode,
+ struct readahead_control *rac, struct page *page)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
- struct inode *inode = mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
+ sector_t next_block;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
@@ -241,6 +241,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
int length;
unsigned relative_block = 0;
struct ext4_map_blocks map;
+ unsigned int nr_pages = rac ? readahead_count(rac) : 1;
map.m_pblk = 0;
map.m_lblk = 0;
@@ -251,20 +252,16 @@ int ext4_mpage_readpages(struct address_space *mapping,
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
- if (pages) {
- page = lru_to_page(pages);
-
+ if (rac) {
+ page = readahead_page(rac);
prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping)))
- goto next_page;
}
if (page_has_buffers(page))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
+ block_in_file = next_block =
+ (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (ext4_readpage_limit(inode) +
blocksize - 1) >> blkbits;
@@ -354,17 +351,13 @@ int ext4_mpage_readpages(struct address_space *mapping,
} else if (fully_mapped) {
SetPageMappedToDisk(page);
}
- if (fully_mapped && blocks_per_page == 1 &&
- !PageUptodate(page) && cleancache_get_page(page) == 0) {
- SetPageUptodate(page);
- goto confused;
- }
/*
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && (last_block_in_bio != blocks[0] - 1)) {
+ if (bio && (last_block_in_bio != blocks[0] - 1 ||
+ !fscrypt_mergeable_bio(bio, inode, next_block))) {
submit_and_realloc:
submit_bio(bio);
bio = NULL;
@@ -374,14 +367,15 @@ int ext4_mpage_readpages(struct address_space *mapping,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_KERNEL,
- min_t(int, nr_pages, BIO_MAX_PAGES));
+ bio = bio_alloc(bdev, bio_max_segs(nr_pages),
+ REQ_OP_READ, GFP_KERNEL);
+ fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
+ GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, page->index);
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
- bio_set_op_attrs(bio, REQ_OP_READ,
- is_readahead ? REQ_RAHEAD : 0);
+ if (rac)
+ bio->bi_opf |= REQ_RAHEAD;
}
length = first_hole << blkbits;
@@ -402,14 +396,13 @@ int ext4_mpage_readpages(struct address_space *mapping,
bio = NULL;
}
if (!PageUptodate(page))
- block_read_full_page(page, ext4_get_block);
+ block_read_full_folio(page_folio(page), ext4_get_block);
else
unlock_page(page);
next_page:
- if (pages)
+ if (rac)
put_page(page);
}
- BUG_ON(pages && !list_empty(pages));
if (bio)
submit_bio(bio);
return 0;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a50b51270ea9..46b87ffeb304 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -14,6 +14,7 @@
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/jiffies.h>
#include "ext4_jbd2.h"
@@ -53,6 +54,16 @@ int ext4_resize_begin(struct super_block *sb)
return -EPERM;
/*
+ * If the reserved GDT blocks is non-zero, the resize_inode feature
+ * should always be set.
+ */
+ if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks &&
+ !ext4_has_feature_resize_inode(sb)) {
+ ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero");
+ return -EFSCORRUPTED;
+ }
+
+ /*
* If we are not using the primary superblock/GDT copy don't resize,
* because the user tools have no way of handling this. Probably a
* bad time to do it anyways.
@@ -74,6 +85,11 @@ int ext4_resize_begin(struct super_block *sb)
return -EPERM;
}
+ if (ext4_has_feature_sparse_super2(sb)) {
+ ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2");
+ return -EOPNOTSUPP;
+ }
+
if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
&EXT4_SB(sb)->s_ext4_flags))
ret = -EBUSY;
@@ -81,10 +97,13 @@ int ext4_resize_begin(struct super_block *sb)
return ret;
}
-void ext4_resize_end(struct super_block *sb)
+int ext4_resize_end(struct super_block *sb, bool update_backups)
{
clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags);
smp_mb__after_atomic();
+ if (update_backups)
+ return ext4_update_overhead(sb, true);
+ return 0;
}
static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
@@ -404,7 +423,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
BUFFER_TRACE(bh, "get_write_access");
- if ((err = ext4_journal_get_write_access(handle, bh))) {
+ err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
+ if (err) {
brelse(bh);
bh = ERR_PTR(err);
} else {
@@ -469,14 +489,15 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
return -ENOMEM;
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, sb, bh,
+ EXT4_JTR_NONE);
if (err) {
brelse(bh);
return err;
}
ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n",
first_cluster, first_cluster - start, count2);
- ext4_set_bits(bh->b_data, first_cluster - start, count2);
+ mb_set_bits(bh->b_data, first_cluster - start, count2);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
@@ -564,7 +585,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
}
BUFFER_TRACE(gdb, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb);
+ err = ext4_journal_get_write_access(handle, sb, gdb,
+ EXT4_JTR_NONE);
if (err) {
brelse(gdb);
goto out;
@@ -624,7 +646,7 @@ handle_bb:
if (overhead != 0) {
ext4_debug("mark backup superblock %#04llx (+0)\n",
start);
- ext4_set_bits(bh->b_data, 0,
+ mb_set_bits(bh->b_data, 0,
EXT4_NUM_B2C(sbi, overhead));
}
ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count),
@@ -709,12 +731,23 @@ out:
* sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
* For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
*/
-static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
- unsigned *five, unsigned *seven)
+unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three,
+ unsigned int *five, unsigned int *seven)
{
- unsigned *min = three;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ unsigned int *min = three;
int mult = 3;
- unsigned ret;
+ unsigned int ret;
+
+ if (ext4_has_feature_sparse_super2(sb)) {
+ do {
+ if (*min > 2)
+ return UINT_MAX;
+ ret = le32_to_cpu(es->s_backup_bgs[*min - 1]);
+ *min += 1;
+ } while (!ret);
+ return ret;
+ }
if (!ext4_has_feature_sparse_super(sb)) {
ret = *min;
@@ -832,19 +865,22 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
}
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE);
if (unlikely(err))
goto errout;
BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
+ err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE);
if (unlikely(err))
goto errout;
BUFFER_TRACE(dind, "get_write_access");
- err = ext4_journal_get_write_access(handle, dind);
- if (unlikely(err))
+ err = ext4_journal_get_write_access(handle, sb, dind, EXT4_JTR_NONE);
+ if (unlikely(err)) {
ext4_std_error(sb, err);
+ goto errout;
+ }
/* ext4_reserve_inode_write() gets a reference on the iloc */
err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -897,8 +933,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
EXT4_SB(sb)->s_gdb_count++;
ext4_kvfree_array_rcu(o_group_desc);
+ lock_buffer(EXT4_SB(sb)->s_sbh);
le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
- err = ext4_handle_dirty_super(handle, sb);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(EXT4_SB(sb)->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
if (err)
ext4_std_error(sb, err);
return err;
@@ -946,7 +985,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
n_group_desc[gdb_num] = gdb_bh;
BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
+ err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE);
if (err) {
kvfree(n_group_desc);
brelse(gdb_bh);
@@ -1032,7 +1071,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
for (i = 0; i < reserved_gdb; i++) {
BUFFER_TRACE(primary[i], "get_write_access");
- if ((err = ext4_journal_get_write_access(handle, primary[i])))
+ if ((err = ext4_journal_get_write_access(handle, sb, primary[i],
+ EXT4_JTR_NONE)))
goto exit_bh;
}
@@ -1118,6 +1158,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
while (group < sbi->s_groups_count) {
struct buffer_head *bh;
ext4_fsblk_t backup_block;
+ struct ext4_super_block *es;
/* Out of journal space, and can't get more - abort - so sad */
err = ext4_resize_ensure_credits_batch(handle, 1);
@@ -1139,14 +1180,17 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
backup_block, backup_block -
ext4_group_first_block_no(sb, group));
BUFFER_TRACE(bh, "get_write_access");
- if ((err = ext4_journal_get_write_access(handle, bh))) {
- brelse(bh);
+ if ((err = ext4_journal_get_write_access(handle, sb, bh,
+ EXT4_JTR_NONE)))
break;
- }
lock_buffer(bh);
memcpy(bh->b_data, data, size);
if (rest)
memset(bh->b_data + size, 0, rest);
+ es = (struct ext4_super_block *) bh->b_data;
+ es->s_block_group_nr = cpu_to_le16(group);
+ if (ext4_has_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(sb, es);
set_buffer_uptodate(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -1222,7 +1266,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
gdb_num);
BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
+ err = ext4_journal_get_write_access(handle, sb, gdb_bh,
+ EXT4_JTR_NONE);
if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
err = reserve_backup_gdb(handle, resize_inode, group);
@@ -1243,7 +1288,7 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
if (unlikely(!bh))
return NULL;
if (!bh_uptodate_or_lock(bh)) {
- if (bh_submit_read(bh) < 0) {
+ if (ext4_read_bh(bh, 0, NULL) < 0) {
brelse(bh);
return NULL;
}
@@ -1343,6 +1388,17 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
return err;
}
+static void ext4_add_overhead(struct super_block *sb,
+ const ext4_fsblk_t overhead)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+
+ sbi->s_overhead += overhead;
+ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
+ smp_wmb();
+}
+
/*
* ext4_update_super() updates the super block so that the newly added
* groups can be seen by the filesystem.
@@ -1382,6 +1438,7 @@ static void ext4_update_super(struct super_block *sb,
reserved_blocks *= blocks_count;
do_div(reserved_blocks, 100);
+ lock_buffer(sbi->s_sbh);
ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
@@ -1419,6 +1476,8 @@ static void ext4_update_super(struct super_block *sb,
* active. */
ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
reserved_blocks);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
/* Update the free space counts */
percpu_counter_add(&sbi->s_freeclusters_counter,
@@ -1441,9 +1500,18 @@ static void ext4_update_super(struct super_block *sb,
}
/*
- * Update the fs overhead information
+ * Update the fs overhead information.
+ *
+ * For bigalloc, if the superblock already has a properly calculated
+ * overhead, update it with a value based on numbers already computed
+ * above for the newly allocated capacity.
*/
- ext4_calculate_overhead(sb);
+ if (ext4_has_feature_bigalloc(sb) && (sbi->s_overhead != 0))
+ ext4_add_overhead(sb,
+ EXT4_NUM_B2C(sbi, blocks_count - free_blocks));
+ else
+ ext4_calculate_overhead(sb);
+ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: added group %u:"
@@ -1496,7 +1564,8 @@ static int ext4_flex_group_add(struct super_block *sb,
}
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
if (err)
goto exit_journal;
@@ -1513,7 +1582,7 @@ static int ext4_flex_group_add(struct super_block *sb,
ext4_update_super(sb, flex_gd);
- err = ext4_handle_dirty_super(handle, sb);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
exit_journal:
err2 = ext4_journal_stop(handle);
@@ -1709,21 +1778,25 @@ static int ext4_group_extend_no_check(struct super_block *sb,
}
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE);
if (err) {
ext4_warning(sb, "error %d on journal write access", err);
goto errout;
}
+ lock_buffer(EXT4_SB(sb)->s_sbh);
ext4_blocks_count_set(es, o_blocks_count + add);
ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(EXT4_SB(sb)->s_sbh);
ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
o_blocks_count + add);
/* We add the blocks to the bitmap and set the group need init bit */
err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
if (err)
goto errout;
- ext4_handle_dirty_super(handle, sb);
+ ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
o_blocks_count + add);
errout:
@@ -1806,8 +1879,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add, add);
/* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, o_blocks_count + add - 1);
- if (!bh) {
+ bh = ext4_sb_bread(sb, o_blocks_count + add - 1, 0);
+ if (IS_ERR(bh)) {
ext4_warning(sb, "can't read last block, resize aborted");
return -ENOSPC;
}
@@ -1868,16 +1941,20 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
return PTR_ERR(handle);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
if (err)
goto errout;
+ lock_buffer(sbi->s_sbh);
ext4_clear_feature_resize_inode(sb);
ext4_set_feature_meta_bg(sb);
sbi->s_es->s_first_meta_bg =
cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
- err = ext4_handle_dirty_super(handle, sb);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
if (err) {
ext4_std_error(sb, err);
goto errout;
@@ -1932,13 +2009,23 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
int meta_bg;
/* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, n_blocks_count - 1);
- if (!bh) {
+ bh = ext4_sb_bread(sb, n_blocks_count - 1, 0);
+ if (IS_ERR(bh)) {
ext4_warning(sb, "can't read last block, resize aborted");
return -ENOSPC;
}
brelse(bh);
+ /*
+ * For bigalloc, trim the requested size to the nearest cluster
+ * boundary to avoid creating an unusable filesystem. We do this
+ * silently, instead of returning an error, to avoid breaking
+ * callers that blindly resize the filesystem to the full size of
+ * the underlying block device.
+ */
+ if (ext4_has_feature_bigalloc(sb))
+ n_blocks_count &= ~((1 << EXT4_CLUSTER_BITS(sb)) - 1);
+
retry:
o_blocks_count = ext4_blocks_count(es);
@@ -2040,7 +2127,7 @@ retry:
goto out;
}
- if (ext4_blocks_count(es) == n_blocks_count)
+ if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0)
goto out;
err = ext4_alloc_flex_bg_array(sb, n_group + 1);
@@ -2062,7 +2149,7 @@ retry:
*/
while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
flexbg_size)) {
- if (jiffies - last_update_time > HZ * 10) {
+ if (time_is_before_jiffies(last_update_time + HZ * 10)) {
if (last_update_time)
ext4_msg(sb, KERN_INFO,
"resized to %llu blocks",
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0c7c4adb664e..7cdd2138c897 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -39,13 +39,15 @@
#include <linux/log2.h>
#include <linux/crc16.h>
#include <linux/dax.h>
-#include <linux/cleancache.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
-
+#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/fsnotify.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "ext4.h"
#include "ext4_extents.h" /* Needed for trace points definition */
@@ -59,68 +61,82 @@
#include <trace/events/ext4.h>
static struct ext4_lazy_init *ext4_li_info;
-static struct mutex ext4_li_mtx;
+static DEFINE_MUTEX(ext4_li_mtx);
static struct ratelimit_state ext4_mount_msg_ratelimit;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
static int ext4_show_options(struct seq_file *seq, struct dentry *root);
-static int ext4_commit_super(struct super_block *sb, int sync);
-static void ext4_mark_recovery_complete(struct super_block *sb,
+static void ext4_update_super(struct super_block *sb);
+static int ext4_commit_super(struct super_block *sb);
+static int ext4_mark_recovery_complete(struct super_block *sb,
struct ext4_super_block *es);
-static void ext4_clear_journal_err(struct super_block *sb,
- struct ext4_super_block *es);
+static int ext4_clear_journal_err(struct super_block *sb,
+ struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
-static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
-static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
-static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
unsigned int journal_inum);
+static int ext4_validate_options(struct fs_context *fc);
+static int ext4_check_opt_consistency(struct fs_context *fc,
+ struct super_block *sb);
+static void ext4_apply_options(struct fs_context *fc, struct super_block *sb);
+static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);
+static int ext4_get_tree(struct fs_context *fc);
+static int ext4_reconfigure(struct fs_context *fc);
+static void ext4_fc_free(struct fs_context *fc);
+static int ext4_init_fs_context(struct fs_context *fc);
+static const struct fs_parameter_spec ext4_param_specs[];
/*
* Lock ordering
*
- * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
- * i_mmap_rwsem (inode->i_mmap_rwsem)!
- *
* page fault path:
- * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
- * page lock -> i_data_sem (rw)
+ * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
+ * -> page lock -> i_data_sem (rw)
*
* buffered write path:
- * sb_start_write -> i_mutex -> mmap_sem
+ * sb_start_write -> i_mutex -> mmap_lock
* sb_start_write -> i_mutex -> transaction start -> page lock ->
* i_data_sem (rw)
*
* truncate:
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
+ * page lock
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
* i_data_sem (rw)
*
* direct IO:
- * sb_start_write -> i_mutex -> mmap_sem
+ * sb_start_write -> i_mutex -> mmap_lock
* sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
*
* writepages:
* transaction start -> page lock(s) -> i_data_sem (rw)
*/
+static const struct fs_context_operations ext4_context_ops = {
+ .parse_param = ext4_parse_param,
+ .get_tree = ext4_get_tree,
+ .reconfigure = ext4_reconfigure,
+ .free = ext4_fc_free,
+};
+
+
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext2",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "ext2",
+ .init_fs_context = ext4_init_fs_context,
+ .parameters = ext4_param_specs,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
@@ -131,37 +147,120 @@ MODULE_ALIAS("ext2");
static struct file_system_type ext3_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext3",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "ext3",
+ .init_fs_context = ext4_init_fs_context,
+ .parameters = ext4_param_specs,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+
+static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io)
+{
+ /*
+ * buffer's verified bit is no longer valid after reading from
+ * disk again due to write out error, clear it to make sure we
+ * recheck the buffer contents.
+ */
+ clear_buffer_verified(bh);
+
+ bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
+ get_bh(bh);
+ submit_bh(REQ_OP_READ | op_flags, bh);
+}
+
+void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io)
+{
+ BUG_ON(!buffer_locked(bh));
+
+ if (ext4_buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return;
+ }
+ __ext4_read_bh(bh, op_flags, end_io);
+}
+
+int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io)
+{
+ BUG_ON(!buffer_locked(bh));
+
+ if (ext4_buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return 0;
+ }
+
+ __ext4_read_bh(bh, op_flags, end_io);
+
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ return 0;
+ return -EIO;
+}
+
+int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
+{
+ lock_buffer(bh);
+ if (!wait) {
+ ext4_read_bh_nowait(bh, op_flags, NULL);
+ return 0;
+ }
+ return ext4_read_bh(bh, op_flags, NULL);
+}
+
/*
- * This works like sb_bread() except it uses ERR_PTR for error
+ * This works like __bread_gfp() except it uses ERR_PTR for error
* returns. Currently with sb_bread it's impossible to distinguish
* between ENOMEM and EIO situations (since both result in a NULL
* return.
*/
-struct buffer_head *
-ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
+static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
+ sector_t block,
+ blk_opf_t op_flags, gfp_t gfp)
{
- struct buffer_head *bh = sb_getblk(sb, block);
+ struct buffer_head *bh;
+ int ret;
+ bh = sb_getblk_gfp(sb, block, gfp);
if (bh == NULL)
return ERR_PTR(-ENOMEM);
if (ext4_buffer_uptodate(bh))
return bh;
- ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return bh;
- put_bh(bh);
- return ERR_PTR(-EIO);
+
+ ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
+ if (ret) {
+ put_bh(bh);
+ return ERR_PTR(ret);
+ }
+ return bh;
+}
+
+struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
+ blk_opf_t op_flags)
+{
+ return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
+}
+
+struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
+ sector_t block)
+{
+ return __ext4_sb_bread_gfp(sb, block, 0, 0);
+}
+
+void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
+{
+ struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
+
+ if (likely(bh)) {
+ if (trylock_buffer(bh))
+ ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
+ brelse(bh);
+ }
}
static int ext4_verify_csum_type(struct super_block *sb,
@@ -173,8 +272,8 @@ static int ext4_verify_csum_type(struct super_block *sb,
return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
}
-static __le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es)
+__le32 ext4_superblock_csum(struct super_block *sb,
+ struct ext4_super_block *es)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct ext4_super_block, s_checksum);
@@ -316,10 +415,8 @@ void ext4_itable_unused_set(struct super_block *sb,
bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}
-static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
+static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
{
- time64_t now = ktime_get_real_seconds();
-
now = clamp_val(now, 0, (1ull << 40) - 1);
*lo = cpu_to_le32(lower_32_bits(now));
@@ -331,50 +428,11 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
}
#define ext4_update_tstamp(es, tstamp) \
- __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
+ __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \
+ ktime_get_real_seconds())
#define ext4_get_tstamp(es, tstamp) \
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
-static void __save_error_info(struct super_block *sb, const char *func,
- unsigned int line)
-{
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-
- EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
- if (bdev_read_only(sb->s_bdev))
- return;
- es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_update_tstamp(es, s_last_error_time);
- strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
- es->s_last_error_line = cpu_to_le32(line);
- if (es->s_last_error_errcode == 0)
- es->s_last_error_errcode = EXT4_ERR_EFSCORRUPTED;
- if (!es->s_first_error_time) {
- es->s_first_error_time = es->s_last_error_time;
- es->s_first_error_time_hi = es->s_last_error_time_hi;
- strncpy(es->s_first_error_func, func,
- sizeof(es->s_first_error_func));
- es->s_first_error_line = cpu_to_le32(line);
- es->s_first_error_ino = es->s_last_error_ino;
- es->s_first_error_block = es->s_last_error_block;
- es->s_first_error_errcode = es->s_last_error_errcode;
- }
- /*
- * Start the daily error reporting function if it hasn't been
- * started already
- */
- if (!es->s_error_count)
- mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
- le32_add_cpu(&es->s_error_count, 1);
-}
-
-static void save_error_info(struct super_block *sb, const char *func,
- unsigned int line)
-{
- __save_error_info(sb, func, line);
- ext4_commit_super(sb, 1);
-}
-
/*
* The del_gendisk() function uninitializes the disk-specific data
* structures, including the bdi structure, without telling anyone
@@ -414,12 +472,160 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
spin_unlock(&sbi->s_md_lock);
}
+/*
+ * This writepage callback for write_cache_pages()
+ * takes care of a few cases after page cleaning.
+ *
+ * write_cache_pages() already checks for dirty pages
+ * and calls clear_page_dirty_for_io(), which we want,
+ * to write protect the pages.
+ *
+ * However, we may have to redirty a page (see below.)
+ */
+static int ext4_journalled_writepage_callback(struct page *page,
+ struct writeback_control *wbc,
+ void *data)
+{
+ transaction_t *transaction = (transaction_t *) data;
+ struct buffer_head *bh, *head;
+ struct journal_head *jh;
+
+ bh = head = page_buffers(page);
+ do {
+ /*
+ * We have to redirty a page in these cases:
+ * 1) If buffer is dirty, it means the page was dirty because it
+ * contains a buffer that needs checkpointing. So the dirty bit
+ * needs to be preserved so that checkpointing writes the buffer
+ * properly.
+ * 2) If buffer is not part of the committing transaction
+ * (we may have just accidentally come across this buffer because
+ * inode range tracking is not exact) or if the currently running
+ * transaction already contains this buffer as well, dirty bit
+ * needs to be preserved so that the buffer gets writeprotected
+ * properly on running transaction's commit.
+ */
+ jh = bh2jh(bh);
+ if (buffer_dirty(bh) ||
+ (jh && (jh->b_transaction != transaction ||
+ jh->b_next_transaction))) {
+ redirty_page_for_writepage(wbc, page);
+ goto out;
+ }
+ } while ((bh = bh->b_this_page) != head);
+
+out:
+ return AOP_WRITEPAGE_ACTIVATE;
+}
+
+static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
+ };
+
+ return write_cache_pages(mapping, &wbc,
+ ext4_journalled_writepage_callback,
+ jinode->i_transaction);
+}
+
+static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ int ret;
+
+ if (ext4_should_journal_data(jinode->i_vfs_inode))
+ ret = ext4_journalled_submit_inode_data_buffers(jinode);
+ else
+ ret = jbd2_journal_submit_inode_data_buffers(jinode);
+
+ return ret;
+}
+
+static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ int ret = 0;
+
+ if (!ext4_should_journal_data(jinode->i_vfs_inode))
+ ret = jbd2_journal_finish_inode_data_buffers(jinode);
+
+ return ret;
+}
+
static bool system_going_down(void)
{
return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
|| system_state == SYSTEM_RESTART;
}
+struct ext4_err_translation {
+ int code;
+ int errno;
+};
+
+#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }
+
+static struct ext4_err_translation err_translation[] = {
+ EXT4_ERR_TRANSLATE(EIO),
+ EXT4_ERR_TRANSLATE(ENOMEM),
+ EXT4_ERR_TRANSLATE(EFSBADCRC),
+ EXT4_ERR_TRANSLATE(EFSCORRUPTED),
+ EXT4_ERR_TRANSLATE(ENOSPC),
+ EXT4_ERR_TRANSLATE(ENOKEY),
+ EXT4_ERR_TRANSLATE(EROFS),
+ EXT4_ERR_TRANSLATE(EFBIG),
+ EXT4_ERR_TRANSLATE(EEXIST),
+ EXT4_ERR_TRANSLATE(ERANGE),
+ EXT4_ERR_TRANSLATE(EOVERFLOW),
+ EXT4_ERR_TRANSLATE(EBUSY),
+ EXT4_ERR_TRANSLATE(ENOTDIR),
+ EXT4_ERR_TRANSLATE(ENOTEMPTY),
+ EXT4_ERR_TRANSLATE(ESHUTDOWN),
+ EXT4_ERR_TRANSLATE(EFAULT),
+};
+
+static int ext4_errno_to_code(int errno)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(err_translation); i++)
+ if (err_translation[i].errno == errno)
+ return err_translation[i].code;
+ return EXT4_ERR_UNKNOWN;
+}
+
+static void save_error_info(struct super_block *sb, int error,
+ __u32 ino, __u64 block,
+ const char *func, unsigned int line)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* We default to EFSCORRUPTED error... */
+ if (error == 0)
+ error = EFSCORRUPTED;
+
+ spin_lock(&sbi->s_error_lock);
+ sbi->s_add_error_count++;
+ sbi->s_last_error_code = error;
+ sbi->s_last_error_line = line;
+ sbi->s_last_error_ino = ino;
+ sbi->s_last_error_block = block;
+ sbi->s_last_error_func = func;
+ sbi->s_last_error_time = ktime_get_real_seconds();
+ if (!sbi->s_first_error_time) {
+ sbi->s_first_error_code = error;
+ sbi->s_first_error_line = line;
+ sbi->s_first_error_ino = ino;
+ sbi->s_first_error_block = block;
+ sbi->s_first_error_func = func;
+ sbi->s_first_error_time = sbi->s_last_error_time;
+ }
+ spin_unlock(&sbi->s_error_lock);
+}
+
/* Deal with the reporting of failure conditions on a filesystem such as
* inconsistencies detected or read IO failures.
*
@@ -433,43 +639,113 @@ static bool system_going_down(void)
* We'll just use the jbd2_journal_abort() error code to record an error in
* the journal instead. On recovery, the journal will complain about
* that error until we've noted it down and cleared it.
+ *
+ * If force_ro is set, we unconditionally force the filesystem into an
+ * ABORT|READONLY state, unless the error response on the fs has been set to
+ * panic in which case we take the easy way out and panic immediately. This is
+ * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
+ * at a critical moment in log management.
*/
-
-static void ext4_handle_error(struct super_block *sb)
+static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
+ __u32 ino, __u64 block,
+ const char *func, unsigned int line)
{
+ journal_t *journal = EXT4_SB(sb)->s_journal;
+ bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT);
+
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
if (test_opt(sb, WARN_ON_ERROR))
WARN_ON_ONCE(1);
- if (sb_rdonly(sb))
- return;
-
- if (!test_opt(sb, ERRORS_CONT)) {
- journal_t *journal = EXT4_SB(sb)->s_journal;
-
- EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ if (!continue_fs && !sb_rdonly(sb)) {
+ ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
if (journal)
jbd2_journal_abort(journal, -EIO);
}
+
+ if (!bdev_read_only(sb->s_bdev)) {
+ save_error_info(sb, error, ino, block, func, line);
+ /*
+ * In case the fs should keep running, we need to writeout
+ * superblock through the journal. Due to lock ordering
+ * constraints, it may not be safe to do it right here so we
+ * defer superblock flushing to a workqueue.
+ */
+ if (continue_fs && journal)
+ schedule_work(&EXT4_SB(sb)->s_error_work);
+ else
+ ext4_commit_super(sb);
+ }
+
/*
* We force ERRORS_RO behavior when system is rebooting. Otherwise we
* could panic during 'reboot -f' as the underlying device got already
* disabled.
*/
- if (test_opt(sb, ERRORS_RO) || system_going_down()) {
- ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
- /*
- * Make sure updated value of ->s_mount_flags will be visible
- * before ->s_flags update
- */
- smp_wmb();
- sb->s_flags |= SB_RDONLY;
- } else if (test_opt(sb, ERRORS_PANIC)) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
+ if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
}
+
+ if (sb_rdonly(sb) || continue_fs)
+ return;
+
+ ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible before
+ * ->s_flags update
+ */
+ smp_wmb();
+ sb->s_flags |= SB_RDONLY;
+}
+
+static void flush_stashed_error_work(struct work_struct *work)
+{
+ struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
+ s_error_work);
+ journal_t *journal = sbi->s_journal;
+ handle_t *handle;
+
+ /*
+ * If the journal is still running, we have to write out superblock
+ * through the journal to avoid collisions of other journalled sb
+ * updates.
+ *
+ * We use directly jbd2 functions here to avoid recursing back into
+ * ext4 error handling code during handling of previous errors.
+ */
+ if (!sb_rdonly(sbi->s_sb) && journal) {
+ struct buffer_head *sbh = sbi->s_sbh;
+ handle = jbd2_journal_start(journal, 1);
+ if (IS_ERR(handle))
+ goto write_directly;
+ if (jbd2_journal_get_write_access(handle, sbh)) {
+ jbd2_journal_stop(handle);
+ goto write_directly;
+ }
+ ext4_update_super(sbi->s_sb);
+ if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
+ ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
+ "superblock detected");
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
+
+ if (jbd2_journal_dirty_metadata(handle, sbh)) {
+ jbd2_journal_stop(handle);
+ goto write_directly;
+ }
+ jbd2_journal_stop(handle);
+ ext4_notify_error_sysfs(sbi);
+ return;
+ }
+write_directly:
+ /*
+ * Write through journal failed. Write sb directly to get error info
+ * out and hope for the best.
+ */
+ ext4_commit_super(sbi->s_sb);
+ ext4_notify_error_sysfs(sbi);
}
#define ext4_error_ratelimit(sb) \
@@ -477,7 +753,8 @@ static void ext4_handle_error(struct super_block *sb)
"EXT4-fs error")
void __ext4_error(struct super_block *sb, const char *function,
- unsigned int line, const char *fmt, ...)
+ unsigned int line, bool force_ro, int error, __u64 block,
+ const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -495,24 +772,22 @@ void __ext4_error(struct super_block *sb, const char *function,
sb->s_id, function, line, current->comm, &vaf);
va_end(args);
}
- save_error_info(sb, function, line);
- ext4_handle_error(sb);
+ fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED);
+
+ ext4_handle_error(sb, force_ro, error, 0, block, function, line);
}
void __ext4_error_inode(struct inode *inode, const char *function,
- unsigned int line, ext4_fsblk_t block,
+ unsigned int line, ext4_fsblk_t block, int error,
const char *fmt, ...)
{
va_list args;
struct va_format vaf;
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return;
trace_ext4_error(inode->i_sb, function, line);
- es->s_last_error_ino = cpu_to_le32(inode->i_ino);
- es->s_last_error_block = cpu_to_le64(block);
if (ext4_error_ratelimit(inode->i_sb)) {
va_start(args, fmt);
vaf.fmt = fmt;
@@ -529,8 +804,10 @@ void __ext4_error_inode(struct inode *inode, const char *function,
current->comm, &vaf);
va_end(args);
}
- save_error_info(inode->i_sb, function, line);
- ext4_handle_error(inode->i_sb);
+ fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED);
+
+ ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
+ function, line);
}
void __ext4_error_file(struct file *file, const char *function,
@@ -539,7 +816,6 @@ void __ext4_error_file(struct file *file, const char *function,
{
va_list args;
struct va_format vaf;
- struct ext4_super_block *es;
struct inode *inode = file_inode(file);
char pathname[80], *path;
@@ -547,8 +823,6 @@ void __ext4_error_file(struct file *file, const char *function,
return;
trace_ext4_error(inode->i_sb, function, line);
- es = EXT4_SB(inode->i_sb)->s_es;
- es->s_last_error_ino = cpu_to_le32(inode->i_ino);
if (ext4_error_ratelimit(inode->i_sb)) {
path = file_path(file, pathname, sizeof(pathname));
if (IS_ERR(path))
@@ -570,8 +844,10 @@ void __ext4_error_file(struct file *file, const char *function,
current->comm, path, &vaf);
va_end(args);
}
- save_error_info(inode->i_sb, function, line);
- ext4_handle_error(inode->i_sb);
+ fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED);
+
+ ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block,
+ function, line);
}
const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -614,66 +890,6 @@ const char *ext4_decode_error(struct super_block *sb, int errno,
return errstr;
}
-void ext4_set_errno(struct super_block *sb, int err)
-{
- if (err < 0)
- err = -err;
-
- switch (err) {
- case EIO:
- err = EXT4_ERR_EIO;
- break;
- case ENOMEM:
- err = EXT4_ERR_ENOMEM;
- break;
- case EFSBADCRC:
- err = EXT4_ERR_EFSBADCRC;
- break;
- case EFSCORRUPTED:
- err = EXT4_ERR_EFSCORRUPTED;
- break;
- case ENOSPC:
- err = EXT4_ERR_ENOSPC;
- break;
- case ENOKEY:
- err = EXT4_ERR_ENOKEY;
- break;
- case EROFS:
- err = EXT4_ERR_EROFS;
- break;
- case EFBIG:
- err = EXT4_ERR_EFBIG;
- break;
- case EEXIST:
- err = EXT4_ERR_EEXIST;
- break;
- case ERANGE:
- err = EXT4_ERR_ERANGE;
- break;
- case EOVERFLOW:
- err = EXT4_ERR_EOVERFLOW;
- break;
- case EBUSY:
- err = EXT4_ERR_EBUSY;
- break;
- case ENOTDIR:
- err = EXT4_ERR_ENOTDIR;
- break;
- case ENOTEMPTY:
- err = EXT4_ERR_ENOTEMPTY;
- break;
- case ESHUTDOWN:
- err = EXT4_ERR_ESHUTDOWN;
- break;
- case EFAULT:
- err = EXT4_ERR_EFAULT;
- break;
- default:
- err = EXT4_ERR_UNKNOWN;
- }
- EXT4_SB(sb)->s_es->s_last_error_errcode = err;
-}
-
/* __ext4_std_error decodes expected errors from journaling functions
* automatically and invokes the appropriate error response. */
@@ -697,58 +913,9 @@ void __ext4_std_error(struct super_block *sb, const char *function,
printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
sb->s_id, function, line, errstr);
}
+ fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED);
- ext4_set_errno(sb, -errno);
- save_error_info(sb, function, line);
- ext4_handle_error(sb);
-}
-
-/*
- * ext4_abort is a much stronger failure handler than ext4_error. The
- * abort function may be used to deal with unrecoverable failures such
- * as journal IO errors or ENOMEM at a critical moment in log management.
- *
- * We unconditionally force the filesystem into an ABORT|READONLY state,
- * unless the error response on the fs has been set to panic in which
- * case we take the easy way out and panic immediately.
- */
-
-void __ext4_abort(struct super_block *sb, const char *function,
- unsigned int line, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
- return;
-
- save_error_info(sb, function, line);
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n",
- sb->s_id, function, line, &vaf);
- va_end(args);
-
- if (sb_rdonly(sb) == 0) {
- ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
- EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
- /*
- * Make sure updated value of ->s_mount_flags will be visible
- * before ->s_flags update
- */
- smp_wmb();
- sb->s_flags |= SB_RDONLY;
- if (EXT4_SB(sb)->s_journal)
- jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
- save_error_info(sb, function, line);
- }
- if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
- panic("EXT4-fs panic from previous error\n");
- }
+ ext4_handle_error(sb, false, -errno, 0, 0, function, line);
}
void __ext4_msg(struct super_block *sb,
@@ -757,19 +924,29 @@ void __ext4_msg(struct super_block *sb,
struct va_format vaf;
va_list args;
- if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
- return;
+ if (sb) {
+ atomic_inc(&EXT4_SB(sb)->s_msg_count);
+ if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state),
+ "EXT4-fs"))
+ return;
+ }
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+ if (sb)
+ printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+ else
+ printk("%sEXT4-fs: %pV\n", prefix, &vaf);
va_end(args);
}
-#define ext4_warning_ratelimit(sb) \
- ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
- "EXT4-fs warning")
+static int ext4_warning_ratelimit(struct super_block *sb)
+{
+ atomic_inc(&EXT4_SB(sb)->s_warning_count);
+ return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
+ "EXT4-fs warning");
+}
void __ext4_warning(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
@@ -815,16 +992,11 @@ __acquires(bitlock)
{
struct va_format vaf;
va_list args;
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
return;
trace_ext4_error(sb, function, line);
- es->s_last_error_ino = cpu_to_le32(ino);
- es->s_last_error_block = cpu_to_le64(block);
- __save_error_info(sb, function, line);
-
if (ext4_error_ratelimit(sb)) {
va_start(args, fmt);
vaf.fmt = fmt;
@@ -840,17 +1012,19 @@ __acquires(bitlock)
va_end(args);
}
- if (test_opt(sb, WARN_ON_ERROR))
- WARN_ON_ONCE(1);
-
if (test_opt(sb, ERRORS_CONT)) {
- ext4_commit_super(sb, 0);
+ if (test_opt(sb, WARN_ON_ERROR))
+ WARN_ON_ONCE(1);
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ if (!bdev_read_only(sb->s_bdev)) {
+ save_error_info(sb, EFSCORRUPTED, ino, block, function,
+ line);
+ schedule_work(&EXT4_SB(sb)->s_error_work);
+ }
return;
}
-
ext4_unlock_group(sb, grp);
- ext4_commit_super(sb, 1);
- ext4_handle_error(sb);
+ ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line);
/*
* We only get here in the ERRORS_RO case; relocking the group
* may be dangerous, but nothing bad will happen since the
@@ -927,7 +1101,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
{
struct block_device *bdev;
- char b[BDEVNAME_SIZE];
bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
if (IS_ERR(bdev))
@@ -935,8 +1108,9 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
return bdev;
fail:
- ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
- __bdevname(dev, b), PTR_ERR(bdev));
+ ext4_msg(sb, KERN_ERR,
+ "failed to open journal device unknown-block(%u,%u) %ld",
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
return NULL;
}
@@ -951,10 +1125,10 @@ static void ext4_blkdev_put(struct block_device *bdev)
static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
{
struct block_device *bdev;
- bdev = sbi->journal_bdev;
+ bdev = sbi->s_journal_bdev;
if (bdev) {
ext4_blkdev_put(bdev);
- sbi->journal_bdev = NULL;
+ sbi->s_journal_bdev = NULL;
}
}
@@ -1019,22 +1193,37 @@ static void ext4_put_super(struct super_block *sb)
int aborted = 0;
int i, err;
+ /*
+ * Unregister sysfs before destroying jbd2 journal.
+ * Since we could still access attr_journal_task attribute via sysfs
+ * path which could have sbi->s_journal->j_task as NULL
+ * Unregister sysfs before flush sbi->s_error_work.
+ * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If
+ * read metadata verify failed then will queue error work.
+ * flush_stashed_error_work will call start_this_handle may trigger
+ * BUG_ON.
+ */
+ ext4_unregister_sysfs(sb);
+
+ if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
+ ext4_msg(sb, KERN_INFO, "unmounting filesystem.");
+
ext4_unregister_li_request(sb);
ext4_quota_off_umount(sb);
+ flush_work(&sbi->s_error_work);
destroy_workqueue(sbi->rsv_conversion_wq);
+ ext4_release_orphan_info(sb);
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
if ((err < 0) && !aborted) {
- ext4_set_errno(sb, -err);
- ext4_abort(sb, "Couldn't clean up the journal");
+ ext4_abort(sb, -err, "Couldn't clean up the journal");
}
}
- ext4_unregister_sysfs(sb);
ext4_es_unregister_shrinker(sbi);
del_timer_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
@@ -1043,10 +1232,11 @@ static void ext4_put_super(struct super_block *sb)
if (!sb_rdonly(sb) && !aborted) {
ext4_clear_feature_journal_needs_recovery(sb);
+ ext4_clear_feature_orphan_present(sb);
es->s_state = cpu_to_le16(sbi->s_mount_state);
}
if (!sb_rdonly(sb))
- ext4_commit_super(sb, 1);
+ ext4_commit_super(sb);
rcu_read_lock();
group_desc = rcu_dereference(sbi->s_group_desc);
@@ -1064,6 +1254,7 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
percpu_free_rwsem(&sbi->s_writepages_rwsem);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -1076,18 +1267,18 @@ static void ext4_put_super(struct super_block *sb)
* in-memory list had better be clean by this point. */
if (!list_empty(&sbi->s_orphan))
dump_orphan_list(sb, sbi);
- J_ASSERT(list_empty(&sbi->s_orphan));
+ ASSERT(list_empty(&sbi->s_orphan));
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
+ if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
/*
* Invalidate the journal device's buffers. We don't want them
* floating about in memory - the physical journal device may
* hotswapped, and it breaks the `ro-after' testing code.
*/
- sync_blockdev(sbi->journal_bdev);
- invalidate_bdev(sbi->journal_bdev);
+ sync_blockdev(sbi->s_journal_bdev);
+ invalidate_bdev(sbi->s_journal_bdev);
ext4_blkdev_remove(sbi);
}
@@ -1097,8 +1288,8 @@ static void ext4_put_super(struct super_block *sb)
ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
sbi->s_ea_block_cache = NULL;
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
+ ext4_stop_mmpd(sbi);
+
brelse(sbi->s_sbh);
sb->s_fs_info = NULL;
/*
@@ -1110,9 +1301,10 @@ static void ext4_put_super(struct super_block *sb)
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
-#ifdef CONFIG_UNICODE
- utf8_unload(sbi->s_encoding);
+ fs_put_dax(sbi->s_daxdev, NULL);
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
+#if IS_ENABLED(CONFIG_UNICODE)
+ utf8_unload(sb->s_encoding);
#endif
kfree(sbi);
}
@@ -1126,13 +1318,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
{
struct ext4_inode_info *ei;
- ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
inode_set_iversion(&ei->vfs_inode, 1);
spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
+ atomic_set(&ei->i_prealloc_active, 0);
spin_lock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
@@ -1154,6 +1347,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_datasync_tid = 0;
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ ext4_fc_init_inode(&ei->vfs_inode);
+ mutex_init(&ei->i_fc_lock);
return &ei->vfs_inode;
}
@@ -1171,6 +1366,10 @@ static int ext4_drop_inode(struct inode *inode)
static void ext4_free_in_core_inode(struct inode *inode)
{
fscrypt_free_inode(inode);
+ if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
+ pr_warn("%s: inode %ld still in fc list",
+ __func__, inode->i_ino);
+ }
kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
}
@@ -1185,17 +1384,23 @@ static void ext4_destroy_inode(struct inode *inode)
true);
dump_stack();
}
+
+ if (EXT4_I(inode)->i_reserved_data_blocks)
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
+ inode->i_ino, EXT4_I(inode),
+ EXT4_I(inode)->i_reserved_data_blocks);
}
static void init_once(void *foo)
{
- struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
+ struct ext4_inode_info *ei = foo;
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
init_rwsem(&ei->i_data_sem);
- init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
+ ext4_fc_init_inode(&ei->vfs_inode);
}
static int __init init_inodecache(void)
@@ -1224,9 +1429,10 @@ static void destroy_inodecache(void)
void ext4_clear_inode(struct inode *inode)
{
+ ext4_fc_del(inode);
invalidate_inode_buffers(inode);
clear_inode(inode);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
@@ -1283,146 +1489,6 @@ static int ext4_nfs_commit_metadata(struct inode *inode)
return ext4_write_inode(inode, &wbc);
}
-/*
- * Try to release metadata pages (indirect blocks, directories) which are
- * mapped via the block device. Since these pages could have journal heads
- * which would prevent try_to_free_buffers() from freeing them, we must use
- * jbd2 layer's try_to_free_buffers() function to release them.
- */
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
- gfp_t wait)
-{
- journal_t *journal = EXT4_SB(sb)->s_journal;
-
- WARN_ON(PageChecked(page));
- if (!page_has_buffers(page))
- return 0;
- if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page,
- wait & ~__GFP_DIRECT_RECLAIM);
- return try_to_free_buffers(page);
-}
-
-#ifdef CONFIG_FS_ENCRYPTION
-static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
-{
- return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
-}
-
-static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
- void *fs_data)
-{
- handle_t *handle = fs_data;
- int res, res2, credits, retries = 0;
-
- /*
- * Encrypting the root directory is not allowed because e2fsck expects
- * lost+found to exist and be unencrypted, and encrypting the root
- * directory would imply encrypting the lost+found directory as well as
- * the filename "lost+found" itself.
- */
- if (inode->i_ino == EXT4_ROOT_INO)
- return -EPERM;
-
- if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
- return -EINVAL;
-
- res = ext4_convert_inline_data(inode);
- if (res)
- return res;
-
- /*
- * If a journal handle was specified, then the encryption context is
- * being set on a new inode via inheritance and is part of a larger
- * transaction to create the inode. Otherwise the encryption context is
- * being set on an existing inode in its own transaction. Only in the
- * latter case should the "retry on ENOSPC" logic be used.
- */
-
- if (handle) {
- res = ext4_xattr_set_handle(handle, inode,
- EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
- ctx, len, 0);
- if (!res) {
- ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
- ext4_clear_inode_state(inode,
- EXT4_STATE_MAY_INLINE_DATA);
- /*
- * Update inode->i_flags - S_ENCRYPTED will be enabled,
- * S_DAX may be disabled
- */
- ext4_set_inode_flags(inode);
- }
- return res;
- }
-
- res = dquot_initialize(inode);
- if (res)
- return res;
-retry:
- res = ext4_xattr_set_credits(inode, len, false /* is_create */,
- &credits);
- if (res)
- return res;
-
- handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
- ctx, len, 0);
- if (!res) {
- ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
- /*
- * Update inode->i_flags - S_ENCRYPTED will be enabled,
- * S_DAX may be disabled
- */
- ext4_set_inode_flags(inode);
- res = ext4_mark_inode_dirty(handle, inode);
- if (res)
- EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
- }
- res2 = ext4_journal_stop(handle);
-
- if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- if (!res)
- res = res2;
- return res;
-}
-
-static bool ext4_dummy_context(struct inode *inode)
-{
- return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
-}
-
-static bool ext4_has_stable_inodes(struct super_block *sb)
-{
- return ext4_has_feature_stable_inodes(sb);
-}
-
-static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
- int *ino_bits_ret, int *lblk_bits_ret)
-{
- *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
- *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
-}
-
-static const struct fscrypt_operations ext4_cryptops = {
- .key_prefix = "ext4:",
- .get_context = ext4_get_context,
- .set_context = ext4_set_context,
- .dummy_context = ext4_dummy_context,
- .empty_dir = ext4_empty_dir,
- .max_namelen = EXT4_NAME_LEN,
- .has_stable_inodes = ext4_has_stable_inodes,
- .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
-};
-#endif
-
#ifdef CONFIG_QUOTA
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])
@@ -1434,14 +1500,12 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
const struct path *path);
-static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
-static int ext4_enable_quotas(struct super_block *sb);
static struct dquot **ext4_get_dquots(struct inode *inode)
{
@@ -1487,14 +1551,12 @@ static const struct super_operations ext4_sops = {
.freeze_fs = ext4_freeze,
.unfreeze_fs = ext4_unfreeze,
.statfs = ext4_statfs,
- .remount_fs = ext4_remount,
.show_options = ext4_show_options,
#ifdef CONFIG_QUOTA
.quota_read = ext4_quota_read,
.quota_write = ext4_quota_write,
.get_dquots = ext4_get_dquots,
#endif
- .bdev_try_to_free_page = bdev_try_to_free_page,
};
static const struct export_operations ext4_export_ops = {
@@ -1506,231 +1568,195 @@ static const struct export_operations ext4_export_ops = {
enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
- Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
+ Opt_resgid, Opt_resuid, Opt_sb,
Opt_nouid32, Opt_debug, Opt_removed,
- Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+ Opt_user_xattr, Opt_acl,
Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
- Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+ Opt_inlinecrypt,
+ Opt_usrjquota, Opt_grpjquota, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota,
+ Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
- Opt_nowarn_on_error, Opt_mblk_io_submit,
- Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
+ Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
+ Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
+ Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
+#ifdef CONFIG_EXT4_DEBUG
+ Opt_fc_debug_max_replay, Opt_fc_debug_force
+#endif
};
-static const match_table_t tokens = {
- {Opt_bsd_df, "bsddf"},
- {Opt_minix_df, "minixdf"},
- {Opt_grpid, "grpid"},
- {Opt_grpid, "bsdgroups"},
- {Opt_nogrpid, "nogrpid"},
- {Opt_nogrpid, "sysvgroups"},
- {Opt_resgid, "resgid=%u"},
- {Opt_resuid, "resuid=%u"},
- {Opt_sb, "sb=%u"},
- {Opt_err_cont, "errors=continue"},
- {Opt_err_panic, "errors=panic"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_nouid32, "nouid32"},
- {Opt_debug, "debug"},
- {Opt_removed, "oldalloc"},
- {Opt_removed, "orlov"},
- {Opt_user_xattr, "user_xattr"},
- {Opt_nouser_xattr, "nouser_xattr"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_noload, "norecovery"},
- {Opt_noload, "noload"},
- {Opt_removed, "nobh"},
- {Opt_removed, "bh"},
- {Opt_commit, "commit=%u"},
- {Opt_min_batch_time, "min_batch_time=%u"},
- {Opt_max_batch_time, "max_batch_time=%u"},
- {Opt_journal_dev, "journal_dev=%u"},
- {Opt_journal_path, "journal_path=%s"},
- {Opt_journal_checksum, "journal_checksum"},
- {Opt_nojournal_checksum, "nojournal_checksum"},
- {Opt_journal_async_commit, "journal_async_commit"},
- {Opt_abort, "abort"},
- {Opt_data_journal, "data=journal"},
- {Opt_data_ordered, "data=ordered"},
- {Opt_data_writeback, "data=writeback"},
- {Opt_data_err_abort, "data_err=abort"},
- {Opt_data_err_ignore, "data_err=ignore"},
- {Opt_offusrjquota, "usrjquota="},
- {Opt_usrjquota, "usrjquota=%s"},
- {Opt_offgrpjquota, "grpjquota="},
- {Opt_grpjquota, "grpjquota=%s"},
- {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
- {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
- {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
- {Opt_grpquota, "grpquota"},
- {Opt_noquota, "noquota"},
- {Opt_quota, "quota"},
- {Opt_usrquota, "usrquota"},
- {Opt_prjquota, "prjquota"},
- {Opt_barrier, "barrier=%u"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_i_version, "i_version"},
- {Opt_dax, "dax"},
- {Opt_stripe, "stripe=%u"},
- {Opt_delalloc, "delalloc"},
- {Opt_warn_on_error, "warn_on_error"},
- {Opt_nowarn_on_error, "nowarn_on_error"},
- {Opt_lazytime, "lazytime"},
- {Opt_nolazytime, "nolazytime"},
- {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
- {Opt_nodelalloc, "nodelalloc"},
- {Opt_removed, "mblk_io_submit"},
- {Opt_removed, "nomblk_io_submit"},
- {Opt_block_validity, "block_validity"},
- {Opt_noblock_validity, "noblock_validity"},
- {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
- {Opt_journal_ioprio, "journal_ioprio=%u"},
- {Opt_auto_da_alloc, "auto_da_alloc=%u"},
- {Opt_auto_da_alloc, "auto_da_alloc"},
- {Opt_noauto_da_alloc, "noauto_da_alloc"},
- {Opt_dioread_nolock, "dioread_nolock"},
- {Opt_dioread_lock, "nodioread_nolock"},
- {Opt_dioread_lock, "dioread_lock"},
- {Opt_discard, "discard"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_init_itable, "init_itable=%u"},
- {Opt_init_itable, "init_itable"},
- {Opt_noinit_itable, "noinit_itable"},
- {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
- {Opt_test_dummy_encryption, "test_dummy_encryption"},
- {Opt_nombcache, "nombcache"},
- {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
- {Opt_removed, "check=none"}, /* mount option from ext2/3 */
- {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
- {Opt_removed, "reservation"}, /* mount option from ext2/3 */
- {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
- {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */
- {Opt_err, NULL},
+static const struct constant_table ext4_param_errors[] = {
+ {"continue", EXT4_MOUNT_ERRORS_CONT},
+ {"panic", EXT4_MOUNT_ERRORS_PANIC},
+ {"remount-ro", EXT4_MOUNT_ERRORS_RO},
+ {}
};
-static ext4_fsblk_t get_sb_block(void **data)
-{
- ext4_fsblk_t sb_block;
- char *options = (char *) *data;
-
- if (!options || strncmp(options, "sb=", 3) != 0)
- return 1; /* Default location */
-
- options += 3;
- /* TODO: use simple_strtoll with >32bit ext4 */
- sb_block = simple_strtoul(options, &options, 0);
- if (*options && *options != ',') {
- printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
- (char *) *data);
- return 1;
- }
- if (*options == ',')
- options++;
- *data = (void *) options;
-
- return sb_block;
-}
-
-#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
-static const char deprecated_msg[] =
- "Mount option \"%s\" will be removed by %s\n"
- "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
+static const struct constant_table ext4_param_data[] = {
+ {"journal", EXT4_MOUNT_JOURNAL_DATA},
+ {"ordered", EXT4_MOUNT_ORDERED_DATA},
+ {"writeback", EXT4_MOUNT_WRITEBACK_DATA},
+ {}
+};
-#ifdef CONFIG_QUOTA
-static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
- int ret = -1;
+static const struct constant_table ext4_param_data_err[] = {
+ {"abort", Opt_data_err_abort},
+ {"ignore", Opt_data_err_ignore},
+ {}
+};
- if (sb_any_quota_loaded(sb) && !old_qname) {
- ext4_msg(sb, KERN_ERR,
- "Cannot change journaled "
- "quota options when quota turned on");
- return -1;
- }
- if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_INFO, "Journaled quota options "
- "ignored when QUOTA feature is enabled");
- return 1;
- }
- qname = match_strdup(args);
- if (!qname) {
- ext4_msg(sb, KERN_ERR,
- "Not enough memory for storing quotafile name");
- return -1;
- }
- if (old_qname) {
- if (strcmp(old_qname, qname) == 0)
- ret = 1;
- else
- ext4_msg(sb, KERN_ERR,
- "%s quota file already specified",
- QTYPE2NAME(qtype));
- goto errout;
- }
- if (strchr(qname, '/')) {
- ext4_msg(sb, KERN_ERR,
- "quotafile must be on filesystem root");
- goto errout;
- }
- rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
- set_opt(sb, QUOTA);
- return 1;
-errout:
- kfree(qname);
- return ret;
-}
+static const struct constant_table ext4_param_jqfmt[] = {
+ {"vfsold", QFMT_VFS_OLD},
+ {"vfsv0", QFMT_VFS_V0},
+ {"vfsv1", QFMT_VFS_V1},
+ {}
+};
-static int clear_qf_name(struct super_block *sb, int qtype)
-{
+static const struct constant_table ext4_param_dax[] = {
+ {"always", Opt_dax_always},
+ {"inode", Opt_dax_inode},
+ {"never", Opt_dax_never},
+ {}
+};
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- char *old_qname = get_qf_name(sb, sbi, qtype);
+/* String parameter that allows empty argument */
+#define fsparam_string_empty(NAME, OPT) \
+ __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
- if (sb_any_quota_loaded(sb) && old_qname) {
- ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
- " when quota turned on");
- return -1;
- }
- rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
- synchronize_rcu();
- kfree(old_qname);
- return 1;
-}
+/*
+ * Mount option specification
+ * We don't use fsparam_flag_no because of the way we set the
+ * options and the way we show them in _ext4_show_options(). To
+ * keep the changes to a minimum, let's keep the negative options
+ * separate for now.
+ */
+static const struct fs_parameter_spec ext4_param_specs[] = {
+ fsparam_flag ("bsddf", Opt_bsd_df),
+ fsparam_flag ("minixdf", Opt_minix_df),
+ fsparam_flag ("grpid", Opt_grpid),
+ fsparam_flag ("bsdgroups", Opt_grpid),
+ fsparam_flag ("nogrpid", Opt_nogrpid),
+ fsparam_flag ("sysvgroups", Opt_nogrpid),
+ fsparam_u32 ("resgid", Opt_resgid),
+ fsparam_u32 ("resuid", Opt_resuid),
+ fsparam_u32 ("sb", Opt_sb),
+ fsparam_enum ("errors", Opt_errors, ext4_param_errors),
+ fsparam_flag ("nouid32", Opt_nouid32),
+ fsparam_flag ("debug", Opt_debug),
+ fsparam_flag ("oldalloc", Opt_removed),
+ fsparam_flag ("orlov", Opt_removed),
+ fsparam_flag ("user_xattr", Opt_user_xattr),
+ fsparam_flag ("acl", Opt_acl),
+ fsparam_flag ("norecovery", Opt_noload),
+ fsparam_flag ("noload", Opt_noload),
+ fsparam_flag ("bh", Opt_removed),
+ fsparam_flag ("nobh", Opt_removed),
+ fsparam_u32 ("commit", Opt_commit),
+ fsparam_u32 ("min_batch_time", Opt_min_batch_time),
+ fsparam_u32 ("max_batch_time", Opt_max_batch_time),
+ fsparam_u32 ("journal_dev", Opt_journal_dev),
+ fsparam_bdev ("journal_path", Opt_journal_path),
+ fsparam_flag ("journal_checksum", Opt_journal_checksum),
+ fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum),
+ fsparam_flag ("journal_async_commit",Opt_journal_async_commit),
+ fsparam_flag ("abort", Opt_abort),
+ fsparam_enum ("data", Opt_data, ext4_param_data),
+ fsparam_enum ("data_err", Opt_data_err,
+ ext4_param_data_err),
+ fsparam_string_empty
+ ("usrjquota", Opt_usrjquota),
+ fsparam_string_empty
+ ("grpjquota", Opt_grpjquota),
+ fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_flag ("quota", Opt_quota),
+ fsparam_flag ("noquota", Opt_noquota),
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag ("prjquota", Opt_prjquota),
+ fsparam_flag ("barrier", Opt_barrier),
+ fsparam_u32 ("barrier", Opt_barrier),
+ fsparam_flag ("nobarrier", Opt_nobarrier),
+ fsparam_flag ("i_version", Opt_removed),
+ fsparam_flag ("dax", Opt_dax),
+ fsparam_enum ("dax", Opt_dax_type, ext4_param_dax),
+ fsparam_u32 ("stripe", Opt_stripe),
+ fsparam_flag ("delalloc", Opt_delalloc),
+ fsparam_flag ("nodelalloc", Opt_nodelalloc),
+ fsparam_flag ("warn_on_error", Opt_warn_on_error),
+ fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error),
+ fsparam_u32 ("debug_want_extra_isize",
+ Opt_debug_want_extra_isize),
+ fsparam_flag ("mblk_io_submit", Opt_removed),
+ fsparam_flag ("nomblk_io_submit", Opt_removed),
+ fsparam_flag ("block_validity", Opt_block_validity),
+ fsparam_flag ("noblock_validity", Opt_noblock_validity),
+ fsparam_u32 ("inode_readahead_blks",
+ Opt_inode_readahead_blks),
+ fsparam_u32 ("journal_ioprio", Opt_journal_ioprio),
+ fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc),
+ fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc),
+ fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc),
+ fsparam_flag ("dioread_nolock", Opt_dioread_nolock),
+ fsparam_flag ("nodioread_nolock", Opt_dioread_lock),
+ fsparam_flag ("dioread_lock", Opt_dioread_lock),
+ fsparam_flag ("discard", Opt_discard),
+ fsparam_flag ("nodiscard", Opt_nodiscard),
+ fsparam_u32 ("init_itable", Opt_init_itable),
+ fsparam_flag ("init_itable", Opt_init_itable),
+ fsparam_flag ("noinit_itable", Opt_noinit_itable),
+#ifdef CONFIG_EXT4_DEBUG
+ fsparam_flag ("fc_debug_force", Opt_fc_debug_force),
+ fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay),
#endif
+ fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb),
+ fsparam_flag ("test_dummy_encryption",
+ Opt_test_dummy_encryption),
+ fsparam_string ("test_dummy_encryption",
+ Opt_test_dummy_encryption),
+ fsparam_flag ("inlinecrypt", Opt_inlinecrypt),
+ fsparam_flag ("nombcache", Opt_nombcache),
+ fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */
+ fsparam_flag ("prefetch_block_bitmaps",
+ Opt_removed),
+ fsparam_flag ("no_prefetch_block_bitmaps",
+ Opt_no_prefetch_block_bitmaps),
+ fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan),
+ fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */
+ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */
+ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */
+ fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */
+ fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */
+ {}
+};
+
+#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
#define MOPT_SET 0x0001
#define MOPT_CLEAR 0x0002
#define MOPT_NOSUPPORT 0x0004
#define MOPT_EXPLICIT 0x0008
-#define MOPT_CLEAR_ERR 0x0010
-#define MOPT_GTE0 0x0020
#ifdef CONFIG_QUOTA
#define MOPT_Q 0
-#define MOPT_QFMT 0x0040
+#define MOPT_QFMT 0x0010
#else
#define MOPT_Q MOPT_NOSUPPORT
#define MOPT_QFMT MOPT_NOSUPPORT
#endif
-#define MOPT_DATAJ 0x0080
-#define MOPT_NO_EXT2 0x0100
-#define MOPT_NO_EXT3 0x0200
+#define MOPT_NO_EXT2 0x0020
+#define MOPT_NO_EXT3 0x0040
#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
-#define MOPT_STRING 0x0400
+#define MOPT_SKIP 0x0080
+#define MOPT_2 0x0100
static const struct mount_opts {
int token;
@@ -1755,6 +1781,7 @@ static const struct mount_opts {
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
+ {Opt_commit, 0, MOPT_NO_EXT2},
{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
@@ -1763,46 +1790,25 @@ static const struct mount_opts {
EXT4_MOUNT_JOURNAL_CHECKSUM),
MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
- {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
- {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
- {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
- {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2},
- {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2},
+ {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2},
{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
- {Opt_commit, 0, MOPT_GTE0},
- {Opt_max_batch_time, 0, MOPT_GTE0},
- {Opt_min_batch_time, 0, MOPT_GTE0},
- {Opt_inode_readahead_blks, 0, MOPT_GTE0},
- {Opt_init_itable, 0, MOPT_GTE0},
- {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
- {Opt_stripe, 0, MOPT_GTE0},
- {Opt_resuid, 0, MOPT_GTE0},
- {Opt_resgid, 0, MOPT_GTE0},
- {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0},
- {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING},
- {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0},
- {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
- {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
- {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
- MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_dax_type, 0, MOPT_EXT4_ONLY},
+ {Opt_journal_dev, 0, MOPT_NO_EXT2},
+ {Opt_journal_path, 0, MOPT_NO_EXT2},
+ {Opt_journal_ioprio, 0, MOPT_NO_EXT2},
+ {Opt_data, 0, MOPT_NO_EXT2},
{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
- {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
- {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
#else
{Opt_acl, 0, MOPT_NOSUPPORT},
- {Opt_noacl, 0, MOPT_NOSUPPORT},
#endif
{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
- {Opt_debug_want_extra_isize, 0, MOPT_GTE0},
{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
MOPT_SET | MOPT_Q},
@@ -1815,370 +1821,1019 @@ static const struct mount_opts {
MOPT_CLEAR | MOPT_Q},
{Opt_usrjquota, 0, MOPT_Q},
{Opt_grpjquota, 0, MOPT_Q},
- {Opt_offusrjquota, 0, MOPT_Q},
- {Opt_offgrpjquota, 0, MOPT_Q},
- {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
- {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
- {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
- {Opt_max_dir_size_kb, 0, MOPT_GTE0},
- {Opt_test_dummy_encryption, 0, MOPT_GTE0},
+ {Opt_jqfmt, 0, MOPT_QFMT},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+ {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
+ MOPT_SET},
+#ifdef CONFIG_EXT4_DEBUG
+ {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
+ MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
+#endif
{Opt_err, 0, 0}
};
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
static const struct ext4_sb_encodings {
__u16 magic;
char *name;
- char *version;
+ unsigned int version;
} ext4_sb_encoding_map[] = {
- {EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
+ {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)},
};
-static int ext4_sb_read_encoding(const struct ext4_super_block *es,
- const struct ext4_sb_encodings **encoding,
- __u16 *flags)
+static const struct ext4_sb_encodings *
+ext4_sb_read_encoding(const struct ext4_super_block *es)
{
__u16 magic = le16_to_cpu(es->s_encoding);
int i;
for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
if (magic == ext4_sb_encoding_map[i].magic)
- break;
+ return &ext4_sb_encoding_map[i];
+
+ return NULL;
+}
+#endif
+
+#define EXT4_SPEC_JQUOTA (1 << 0)
+#define EXT4_SPEC_JQFMT (1 << 1)
+#define EXT4_SPEC_DATAJ (1 << 2)
+#define EXT4_SPEC_SB_BLOCK (1 << 3)
+#define EXT4_SPEC_JOURNAL_DEV (1 << 4)
+#define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5)
+#define EXT4_SPEC_s_want_extra_isize (1 << 7)
+#define EXT4_SPEC_s_max_batch_time (1 << 8)
+#define EXT4_SPEC_s_min_batch_time (1 << 9)
+#define EXT4_SPEC_s_inode_readahead_blks (1 << 10)
+#define EXT4_SPEC_s_li_wait_mult (1 << 11)
+#define EXT4_SPEC_s_max_dir_size_kb (1 << 12)
+#define EXT4_SPEC_s_stripe (1 << 13)
+#define EXT4_SPEC_s_resuid (1 << 14)
+#define EXT4_SPEC_s_resgid (1 << 15)
+#define EXT4_SPEC_s_commit_interval (1 << 16)
+#define EXT4_SPEC_s_fc_debug_max_replay (1 << 17)
+#define EXT4_SPEC_s_sb_block (1 << 18)
+#define EXT4_SPEC_mb_optimize_scan (1 << 19)
+
+struct ext4_fs_context {
+ char *s_qf_names[EXT4_MAXQUOTAS];
+ struct fscrypt_dummy_policy dummy_enc_policy;
+ int s_jquota_fmt; /* Format of quota to use */
+#ifdef CONFIG_EXT4_DEBUG
+ int s_fc_debug_max_replay;
+#endif
+ unsigned short qname_spec;
+ unsigned long vals_s_flags; /* Bits to set in s_flags */
+ unsigned long mask_s_flags; /* Bits changed in s_flags */
+ unsigned long journal_devnum;
+ unsigned long s_commit_interval;
+ unsigned long s_stripe;
+ unsigned int s_inode_readahead_blks;
+ unsigned int s_want_extra_isize;
+ unsigned int s_li_wait_mult;
+ unsigned int s_max_dir_size_kb;
+ unsigned int journal_ioprio;
+ unsigned int vals_s_mount_opt;
+ unsigned int mask_s_mount_opt;
+ unsigned int vals_s_mount_opt2;
+ unsigned int mask_s_mount_opt2;
+ unsigned long vals_s_mount_flags;
+ unsigned long mask_s_mount_flags;
+ unsigned int opt_flags; /* MOPT flags */
+ unsigned int spec;
+ u32 s_max_batch_time;
+ u32 s_min_batch_time;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ ext4_fsblk_t s_sb_block;
+};
+
+static void ext4_fc_free(struct fs_context *fc)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ int i;
+
+ if (!ctx)
+ return;
+
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ kfree(ctx->s_qf_names[i]);
+
+ fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);
+ kfree(ctx);
+}
+
+int ext4_init_fs_context(struct fs_context *fc)
+{
+ struct ext4_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ fc->fs_private = ctx;
+ fc->ops = &ext4_context_ops;
+
+ return 0;
+}
+
+#ifdef CONFIG_QUOTA
+/*
+ * Note the name of the specified quota file.
+ */
+static int note_qf_name(struct fs_context *fc, int qtype,
+ struct fs_parameter *param)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ char *qname;
- if (i >= ARRAY_SIZE(ext4_sb_encoding_map))
+ if (param->size < 1) {
+ ext4_msg(NULL, KERN_ERR, "Missing quota name");
+ return -EINVAL;
+ }
+ if (strchr(param->string, '/')) {
+ ext4_msg(NULL, KERN_ERR,
+ "quotafile must be on filesystem root");
return -EINVAL;
+ }
+ if (ctx->s_qf_names[qtype]) {
+ if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) {
+ ext4_msg(NULL, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ return -EINVAL;
+ }
+ return 0;
+ }
- *encoding = &ext4_sb_encoding_map[i];
- *flags = le16_to_cpu(es->s_encoding_flags);
+ qname = kmemdup_nul(param->string, param->size, GFP_KERNEL);
+ if (!qname) {
+ ext4_msg(NULL, KERN_ERR,
+ "Not enough memory for storing quotafile name");
+ return -ENOMEM;
+ }
+ ctx->s_qf_names[qtype] = qname;
+ ctx->qname_spec |= 1 << qtype;
+ ctx->spec |= EXT4_SPEC_JQUOTA;
+ return 0;
+}
+
+/*
+ * Clear the name of the specified quota file.
+ */
+static int unnote_qf_name(struct fs_context *fc, int qtype)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ if (ctx->s_qf_names[qtype])
+ kfree(ctx->s_qf_names[qtype]);
+
+ ctx->s_qf_names[qtype] = NULL;
+ ctx->qname_spec |= 1 << qtype;
+ ctx->spec |= EXT4_SPEC_JQUOTA;
return 0;
}
#endif
-static int handle_mount_opt(struct super_block *sb, char *opt, int token,
- substring_t *args, unsigned long *journal_devnum,
- unsigned int *journal_ioprio, int is_remount)
+static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
+ struct ext4_fs_context *ctx)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "test_dummy_encryption option not supported");
+ return -EINVAL;
+ }
+ err = fscrypt_parse_test_dummy_encryption(param,
+ &ctx->dummy_enc_policy);
+ if (err == -EINVAL) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Value of option \"%s\" is unrecognized", param->key);
+ } else if (err == -EEXIST) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Conflicting test_dummy_encryption options");
+ return -EINVAL;
+ }
+ return err;
+}
+
+#define EXT4_SET_CTX(name) \
+static inline void ctx_set_##name(struct ext4_fs_context *ctx, \
+ unsigned long flag) \
+{ \
+ ctx->mask_s_##name |= flag; \
+ ctx->vals_s_##name |= flag; \
+}
+
+#define EXT4_CLEAR_CTX(name) \
+static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \
+ unsigned long flag) \
+{ \
+ ctx->mask_s_##name |= flag; \
+ ctx->vals_s_##name &= ~flag; \
+}
+
+#define EXT4_TEST_CTX(name) \
+static inline unsigned long \
+ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \
+{ \
+ return (ctx->vals_s_##name & flag); \
+}
+
+EXT4_SET_CTX(flags); /* set only */
+EXT4_SET_CTX(mount_opt);
+EXT4_CLEAR_CTX(mount_opt);
+EXT4_TEST_CTX(mount_opt);
+EXT4_SET_CTX(mount_opt2);
+EXT4_CLEAR_CTX(mount_opt2);
+EXT4_TEST_CTX(mount_opt2);
+
+static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit)
+{
+ set_bit(bit, &ctx->mask_s_mount_flags);
+ set_bit(bit, &ctx->vals_s_mount_flags);
+}
+
+static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
const struct mount_opts *m;
+ int is_remount;
kuid_t uid;
kgid_t gid;
- int arg = 0;
+ int token;
-#ifdef CONFIG_QUOTA
- if (token == Opt_usrjquota)
- return set_qf_name(sb, USRQUOTA, &args[0]);
- else if (token == Opt_grpjquota)
- return set_qf_name(sb, GRPQUOTA, &args[0]);
- else if (token == Opt_offusrjquota)
- return clear_qf_name(sb, USRQUOTA);
- else if (token == Opt_offgrpjquota)
- return clear_qf_name(sb, GRPQUOTA);
-#endif
- switch (token) {
- case Opt_noacl:
- case Opt_nouser_xattr:
- ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
- break;
- case Opt_sb:
- return 1; /* handled by get_sb_block() */
- case Opt_removed:
- ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
- return 1;
- case Opt_abort:
- sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
- return 1;
- case Opt_i_version:
- sb->s_flags |= SB_I_VERSION;
- return 1;
- case Opt_lazytime:
- sb->s_flags |= SB_LAZYTIME;
- return 1;
- case Opt_nolazytime:
- sb->s_flags &= ~SB_LAZYTIME;
- return 1;
- }
+ token = fs_parse(fc, ext4_param_specs, param, &result);
+ if (token < 0)
+ return token;
+ is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
for (m = ext4_mount_opts; m->token != Opt_err; m++)
if (token == m->token)
break;
- if (m->token == Opt_err) {
- ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
- "or missing value", opt);
- return -1;
- }
+ ctx->opt_flags |= m->flags;
- if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Mount option \"%s\" incompatible with ext2", opt);
- return -1;
- }
- if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Mount option \"%s\" incompatible with ext3", opt);
- return -1;
- }
-
- if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
- return -1;
- if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
- return -1;
if (m->flags & MOPT_EXPLICIT) {
if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
- set_opt2(sb, EXPLICIT_DELALLOC);
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC);
} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
- set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
+ ctx_set_mount_opt2(ctx,
+ EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM);
} else
- return -1;
- }
- if (m->flags & MOPT_CLEAR_ERR)
- clear_opt(sb, ERRORS_MASK);
- if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
- ext4_msg(sb, KERN_ERR, "Cannot change quota "
- "options when quota turned on");
- return -1;
+ return -EINVAL;
}
if (m->flags & MOPT_NOSUPPORT) {
- ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
- } else if (token == Opt_commit) {
- if (arg == 0)
- arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
- else if (arg > INT_MAX / HZ) {
- ext4_msg(sb, KERN_ERR,
+ ext4_msg(NULL, KERN_ERR, "%s option not supported",
+ param->key);
+ return 0;
+ }
+
+ switch (token) {
+#ifdef CONFIG_QUOTA
+ case Opt_usrjquota:
+ if (!*param->string)
+ return unnote_qf_name(fc, USRQUOTA);
+ else
+ return note_qf_name(fc, USRQUOTA, param);
+ case Opt_grpjquota:
+ if (!*param->string)
+ return unnote_qf_name(fc, GRPQUOTA);
+ else
+ return note_qf_name(fc, GRPQUOTA, param);
+#endif
+ case Opt_sb:
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Ignoring %s option on remount", param->key);
+ } else {
+ ctx->s_sb_block = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_sb_block;
+ }
+ return 0;
+ case Opt_removed:
+ ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
+ param->key);
+ return 0;
+ case Opt_abort:
+ ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
+ return 0;
+ case Opt_inlinecrypt:
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ ctx_set_flags(ctx, SB_INLINECRYPT);
+#else
+ ext4_msg(NULL, KERN_ERR, "inline encryption not supported");
+#endif
+ return 0;
+ case Opt_errors:
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK);
+ ctx_set_mount_opt(ctx, result.uint_32);
+ return 0;
+#ifdef CONFIG_QUOTA
+ case Opt_jqfmt:
+ ctx->s_jquota_fmt = result.uint_32;
+ ctx->spec |= EXT4_SPEC_JQFMT;
+ return 0;
+#endif
+ case Opt_data:
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
+ ctx_set_mount_opt(ctx, result.uint_32);
+ ctx->spec |= EXT4_SPEC_DATAJ;
+ return 0;
+ case Opt_commit:
+ if (result.uint_32 == 0)
+ ctx->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ else if (result.uint_32 > INT_MAX / HZ) {
+ ext4_msg(NULL, KERN_ERR,
"Invalid commit interval %d, "
"must be smaller than %d",
- arg, INT_MAX / HZ);
- return -1;
+ result.uint_32, INT_MAX / HZ);
+ return -EINVAL;
}
- sbi->s_commit_interval = HZ * arg;
- } else if (token == Opt_debug_want_extra_isize) {
- if ((arg & 1) ||
- (arg < 4) ||
- (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) {
- ext4_msg(sb, KERN_ERR,
- "Invalid want_extra_isize %d", arg);
- return -1;
+ ctx->s_commit_interval = HZ * result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_commit_interval;
+ return 0;
+ case Opt_debug_want_extra_isize:
+ if ((result.uint_32 & 1) || (result.uint_32 < 4)) {
+ ext4_msg(NULL, KERN_ERR,
+ "Invalid want_extra_isize %d", result.uint_32);
+ return -EINVAL;
}
- sbi->s_want_extra_isize = arg;
- } else if (token == Opt_max_batch_time) {
- sbi->s_max_batch_time = arg;
- } else if (token == Opt_min_batch_time) {
- sbi->s_min_batch_time = arg;
- } else if (token == Opt_inode_readahead_blks) {
- if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
- ext4_msg(sb, KERN_ERR,
+ ctx->s_want_extra_isize = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_want_extra_isize;
+ return 0;
+ case Opt_max_batch_time:
+ ctx->s_max_batch_time = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_max_batch_time;
+ return 0;
+ case Opt_min_batch_time:
+ ctx->s_min_batch_time = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_min_batch_time;
+ return 0;
+ case Opt_inode_readahead_blks:
+ if (result.uint_32 &&
+ (result.uint_32 > (1 << 30) ||
+ !is_power_of_2(result.uint_32))) {
+ ext4_msg(NULL, KERN_ERR,
"EXT4-fs: inode_readahead_blks must be "
"0 or a power of 2 smaller than 2^31");
- return -1;
+ return -EINVAL;
}
- sbi->s_inode_readahead_blks = arg;
- } else if (token == Opt_init_itable) {
- set_opt(sb, INIT_INODE_TABLE);
- if (!args->from)
- arg = EXT4_DEF_LI_WAIT_MULT;
- sbi->s_li_wait_mult = arg;
- } else if (token == Opt_max_dir_size_kb) {
- sbi->s_max_dir_size_kb = arg;
- } else if (token == Opt_stripe) {
- sbi->s_stripe = arg;
- } else if (token == Opt_resuid) {
- uid = make_kuid(current_user_ns(), arg);
+ ctx->s_inode_readahead_blks = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_inode_readahead_blks;
+ return 0;
+ case Opt_init_itable:
+ ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE);
+ ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ if (param->type == fs_value_is_string)
+ ctx->s_li_wait_mult = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_li_wait_mult;
+ return 0;
+ case Opt_max_dir_size_kb:
+ ctx->s_max_dir_size_kb = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
+ return 0;
+#ifdef CONFIG_EXT4_DEBUG
+ case Opt_fc_debug_max_replay:
+ ctx->s_fc_debug_max_replay = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay;
+ return 0;
+#endif
+ case Opt_stripe:
+ ctx->s_stripe = result.uint_32;
+ ctx->spec |= EXT4_SPEC_s_stripe;
+ return 0;
+ case Opt_resuid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
if (!uid_valid(uid)) {
- ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
- return -1;
+ ext4_msg(NULL, KERN_ERR, "Invalid uid value %d",
+ result.uint_32);
+ return -EINVAL;
}
- sbi->s_resuid = uid;
- } else if (token == Opt_resgid) {
- gid = make_kgid(current_user_ns(), arg);
+ ctx->s_resuid = uid;
+ ctx->spec |= EXT4_SPEC_s_resuid;
+ return 0;
+ case Opt_resgid:
+ gid = make_kgid(current_user_ns(), result.uint_32);
if (!gid_valid(gid)) {
- ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
- return -1;
+ ext4_msg(NULL, KERN_ERR, "Invalid gid value %d",
+ result.uint_32);
+ return -EINVAL;
}
- sbi->s_resgid = gid;
- } else if (token == Opt_journal_dev) {
+ ctx->s_resgid = gid;
+ ctx->spec |= EXT4_SPEC_s_resgid;
+ return 0;
+ case Opt_journal_dev:
if (is_remount) {
- ext4_msg(sb, KERN_ERR,
+ ext4_msg(NULL, KERN_ERR,
"Cannot specify journal on remount");
- return -1;
+ return -EINVAL;
}
- *journal_devnum = arg;
- } else if (token == Opt_journal_path) {
- char *journal_path;
+ ctx->journal_devnum = result.uint_32;
+ ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
+ return 0;
+ case Opt_journal_path:
+ {
struct inode *journal_inode;
struct path path;
int error;
if (is_remount) {
- ext4_msg(sb, KERN_ERR,
+ ext4_msg(NULL, KERN_ERR,
"Cannot specify journal on remount");
- return -1;
- }
- journal_path = match_strdup(&args[0]);
- if (!journal_path) {
- ext4_msg(sb, KERN_ERR, "error: could not dup "
- "journal device string");
- return -1;
+ return -EINVAL;
}
- error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ error = fs_lookup_param(fc, param, 1, &path);
if (error) {
- ext4_msg(sb, KERN_ERR, "error: could not find "
- "journal device path: error %d", error);
- kfree(journal_path);
- return -1;
+ ext4_msg(NULL, KERN_ERR, "error: could not find "
+ "journal device path");
+ return -EINVAL;
}
journal_inode = d_inode(path.dentry);
- if (!S_ISBLK(journal_inode->i_mode)) {
- ext4_msg(sb, KERN_ERR, "error: journal path %s "
- "is not a block device", journal_path);
- path_put(&path);
- kfree(journal_path);
- return -1;
- }
-
- *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
path_put(&path);
- kfree(journal_path);
- } else if (token == Opt_journal_ioprio) {
- if (arg > 7) {
- ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
+ return 0;
+ }
+ case Opt_journal_ioprio:
+ if (result.uint_32 > 7) {
+ ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority"
" (must be 0-7)");
- return -1;
+ return -EINVAL;
}
- *journal_ioprio =
- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
- } else if (token == Opt_test_dummy_encryption) {
-#ifdef CONFIG_FS_ENCRYPTION
- sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
- ext4_msg(sb, KERN_WARNING,
- "Test dummy encryption mode enabled");
+ ctx->journal_ioprio =
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32);
+ ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
+ return 0;
+ case Opt_test_dummy_encryption:
+ return ext4_parse_test_dummy_encryption(param, ctx);
+ case Opt_dax:
+ case Opt_dax_type:
+#ifdef CONFIG_FS_DAX
+ {
+ int type = (token == Opt_dax) ?
+ Opt_dax : result.uint_32;
+
+ switch (type) {
+ case Opt_dax:
+ case Opt_dax_always:
+ ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
+ break;
+ case Opt_dax_never:
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+ break;
+ case Opt_dax_inode:
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
+ /* Strictly for printing options */
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE);
+ break;
+ }
+ return 0;
+ }
#else
- ext4_msg(sb, KERN_WARNING,
- "Test dummy encryption mount option ignored");
+ ext4_msg(NULL, KERN_INFO, "dax option not supported");
+ return -EINVAL;
#endif
- } else if (m->flags & MOPT_DATAJ) {
- if (is_remount) {
- if (!sbi->s_journal)
- ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
- else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
- ext4_msg(sb, KERN_ERR,
- "Cannot change data mode on remount");
- return -1;
- }
+ case Opt_data_err:
+ if (result.uint_32 == Opt_data_err_abort)
+ ctx_set_mount_opt(ctx, m->mount_opt);
+ else if (result.uint_32 == Opt_data_err_ignore)
+ ctx_clear_mount_opt(ctx, m->mount_opt);
+ return 0;
+ case Opt_mb_optimize_scan:
+ if (result.int_32 == 1) {
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+ ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+ } else if (result.int_32 == 0) {
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+ ctx->spec |= EXT4_SPEC_mb_optimize_scan;
} else {
- clear_opt(sb, DATA_FLAGS);
- sbi->s_mount_opt |= m->mount_opt;
- }
-#ifdef CONFIG_QUOTA
- } else if (m->flags & MOPT_QFMT) {
- if (sb_any_quota_loaded(sb) &&
- sbi->s_jquota_fmt != m->mount_opt) {
- ext4_msg(sb, KERN_ERR, "Cannot change journaled "
- "quota options when quota turned on");
- return -1;
- }
- if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_INFO,
- "Quota format mount options ignored "
- "when QUOTA feature is enabled");
- return 1;
+ ext4_msg(NULL, KERN_WARNING,
+ "mb_optimize_scan should be set to 0 or 1.");
+ return -EINVAL;
}
- sbi->s_jquota_fmt = m->mount_opt;
-#endif
- } else if (token == Opt_dax) {
-#ifdef CONFIG_FS_DAX
- ext4_msg(sb, KERN_WARNING,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- sbi->s_mount_opt |= m->mount_opt;
-#else
- ext4_msg(sb, KERN_INFO, "dax option not supported");
- return -1;
-#endif
- } else if (token == Opt_data_err_abort) {
- sbi->s_mount_opt |= m->mount_opt;
- } else if (token == Opt_data_err_ignore) {
- sbi->s_mount_opt &= ~m->mount_opt;
- } else {
- if (!args->from)
- arg = 1;
+ return 0;
+ }
+
+ /*
+ * At this point we should only be getting options requiring MOPT_SET,
+ * or MOPT_CLEAR. Anything else is a bug
+ */
+ if (m->token == Opt_err) {
+ ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s",
+ param->key);
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ else {
+ unsigned int set = 0;
+
+ if ((param->type == fs_value_is_flag) ||
+ result.uint_32 > 0)
+ set = 1;
+
if (m->flags & MOPT_CLEAR)
- arg = !arg;
+ set = !set;
else if (unlikely(!(m->flags & MOPT_SET))) {
- ext4_msg(sb, KERN_WARNING,
- "buggy handling of option %s", opt);
+ ext4_msg(NULL, KERN_WARNING,
+ "buggy handling of option %s",
+ param->key);
WARN_ON(1);
- return -1;
+ return -EINVAL;
+ }
+ if (m->flags & MOPT_2) {
+ if (set != 0)
+ ctx_set_mount_opt2(ctx, m->mount_opt);
+ else
+ ctx_clear_mount_opt2(ctx, m->mount_opt);
+ } else {
+ if (set != 0)
+ ctx_set_mount_opt(ctx, m->mount_opt);
+ else
+ ctx_clear_mount_opt(ctx, m->mount_opt);
}
- if (arg != 0)
- sbi->s_mount_opt |= m->mount_opt;
- else
- sbi->s_mount_opt &= ~m->mount_opt;
}
- return 1;
+
+ return 0;
}
-static int parse_options(char *options, struct super_block *sb,
- unsigned long *journal_devnum,
- unsigned int *journal_ioprio,
- int is_remount)
+static int parse_options(struct fs_context *fc, char *options)
{
- struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
- char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
- substring_t args[MAX_OPT_ARGS];
- int token;
+ struct fs_parameter param;
+ int ret;
+ char *key;
if (!options)
- return 1;
+ return 0;
- while ((p = strsep(&options, ",")) != NULL) {
- if (!*p)
- continue;
- /*
- * Initialize args struct so we know whether arg was
- * found; some options take optional arguments.
- */
- args[0].to = args[0].from = NULL;
- token = match_token(p, tokens, args);
- if (handle_mount_opt(sb, p, token, args, journal_devnum,
- journal_ioprio, is_remount) < 0)
- return 0;
+ while ((key = strsep(&options, ",")) != NULL) {
+ if (*key) {
+ size_t v_len = 0;
+ char *value = strchr(key, '=');
+
+ param.type = fs_value_is_flag;
+ param.string = NULL;
+
+ if (value) {
+ if (value == key)
+ continue;
+
+ *value++ = 0;
+ v_len = strlen(value);
+ param.string = kmemdup_nul(value, v_len,
+ GFP_KERNEL);
+ if (!param.string)
+ return -ENOMEM;
+ param.type = fs_value_is_string;
+ }
+
+ param.key = key;
+ param.size = v_len;
+
+ ret = ext4_parse_param(fc, &param);
+ if (param.string)
+ kfree(param.string);
+ if (ret < 0)
+ return ret;
+ }
}
+
+ ret = ext4_validate_options(fc);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static int parse_apply_sb_mount_options(struct super_block *sb,
+ struct ext4_fs_context *m_ctx)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char *s_mount_opts = NULL;
+ struct ext4_fs_context *s_ctx = NULL;
+ struct fs_context *fc = NULL;
+ int ret = -ENOMEM;
+
+ if (!sbi->s_es->s_mount_opts[0])
+ return 0;
+
+ s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
+ sizeof(sbi->s_es->s_mount_opts),
+ GFP_KERNEL);
+ if (!s_mount_opts)
+ return ret;
+
+ fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+ if (!fc)
+ goto out_free;
+
+ s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
+ if (!s_ctx)
+ goto out_free;
+
+ fc->fs_private = s_ctx;
+ fc->s_fs_info = sbi;
+
+ ret = parse_options(fc, s_mount_opts);
+ if (ret < 0)
+ goto parse_failed;
+
+ ret = ext4_check_opt_consistency(fc, sb);
+ if (ret < 0) {
+parse_failed:
+ ext4_msg(sb, KERN_WARNING,
+ "failed to parse options in superblock: %s",
+ s_mount_opts);
+ ret = 0;
+ goto out_free;
+ }
+
+ if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV)
+ m_ctx->journal_devnum = s_ctx->journal_devnum;
+ if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
+ m_ctx->journal_ioprio = s_ctx->journal_ioprio;
+
+ ext4_apply_options(fc, sb);
+ ret = 0;
+
+out_free:
+ if (fc) {
+ ext4_fc_free(fc);
+ kfree(fc);
+ }
+ kfree(s_mount_opts);
+ return ret;
+}
+
+static void ext4_apply_quota_options(struct fs_context *fc,
+ struct super_block *sb)
+{
#ifdef CONFIG_QUOTA
+ bool quota_feature = ext4_has_feature_quota(sb);
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char *qname;
+ int i;
+
+ if (quota_feature)
+ return;
+
+ if (ctx->spec & EXT4_SPEC_JQUOTA) {
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (!(ctx->qname_spec & (1 << i)))
+ continue;
+
+ qname = ctx->s_qf_names[i]; /* May be NULL */
+ if (qname)
+ set_opt(sb, QUOTA);
+ ctx->s_qf_names[i] = NULL;
+ qname = rcu_replace_pointer(sbi->s_qf_names[i], qname,
+ lockdep_is_held(&sb->s_umount));
+ if (qname)
+ kfree_rcu(qname);
+ }
+ }
+
+ if (ctx->spec & EXT4_SPEC_JQFMT)
+ sbi->s_jquota_fmt = ctx->s_jquota_fmt;
+#endif
+}
+
+/*
+ * Check quota settings consistency.
+ */
+static int ext4_check_quota_consistency(struct fs_context *fc,
+ struct super_block *sb)
+{
+#ifdef CONFIG_QUOTA
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ bool quota_feature = ext4_has_feature_quota(sb);
+ bool quota_loaded = sb_any_quota_loaded(sb);
+ bool usr_qf_name, grp_qf_name, usrquota, grpquota;
+ int quota_flags, i;
+
/*
* We do the test below only for project quotas. 'usrquota' and
* 'grpquota' mount options are allowed even without quota feature
* to support legacy quotas in quota files.
*/
- if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) {
- ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. "
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) &&
+ !ext4_has_feature_project(sb)) {
+ ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. "
"Cannot enable project quota enforcement.");
- return 0;
+ return -EINVAL;
}
- usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
- grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
- if (usr_qf_name || grp_qf_name) {
- if (test_opt(sb, USRQUOTA) && usr_qf_name)
- clear_opt(sb, USRQUOTA);
- if (test_opt(sb, GRPQUOTA) && grp_qf_name)
- clear_opt(sb, GRPQUOTA);
+ quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
+ EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA;
+ if (quota_loaded &&
+ ctx->mask_s_mount_opt & quota_flags &&
+ !ctx_test_mount_opt(ctx, quota_flags))
+ goto err_quota_change;
+
+ if (ctx->spec & EXT4_SPEC_JQUOTA) {
+
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (!(ctx->qname_spec & (1 << i)))
+ continue;
+
+ if (quota_loaded &&
+ !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i])
+ goto err_jquota_change;
+
+ if (sbi->s_qf_names[i] && ctx->s_qf_names[i] &&
+ strcmp(get_qf_name(sb, sbi, i),
+ ctx->s_qf_names[i]) != 0)
+ goto err_jquota_specified;
+ }
- if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
- ext4_msg(sb, KERN_ERR, "old and new quota "
- "format mixing");
+ if (quota_feature) {
+ ext4_msg(NULL, KERN_INFO,
+ "Journaled quota options ignored when "
+ "QUOTA feature is enabled");
return 0;
}
+ }
- if (!sbi->s_jquota_fmt) {
- ext4_msg(sb, KERN_ERR, "journaled quota format "
- "not specified");
+ if (ctx->spec & EXT4_SPEC_JQFMT) {
+ if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded)
+ goto err_jquota_change;
+ if (quota_feature) {
+ ext4_msg(NULL, KERN_INFO, "Quota format mount options "
+ "ignored when QUOTA feature is enabled");
return 0;
}
}
+
+ /* Make sure we don't mix old and new quota format */
+ usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) ||
+ ctx->s_qf_names[USRQUOTA]);
+ grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) ||
+ ctx->s_qf_names[GRPQUOTA]);
+
+ usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
+ test_opt(sb, USRQUOTA));
+
+ grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) ||
+ test_opt(sb, GRPQUOTA));
+
+ if (usr_qf_name) {
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
+ usrquota = false;
+ }
+ if (grp_qf_name) {
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
+ grpquota = false;
+ }
+
+ if (usr_qf_name || grp_qf_name) {
+ if (usrquota || grpquota) {
+ ext4_msg(NULL, KERN_ERR, "old and new quota "
+ "format mixing");
+ return -EINVAL;
+ }
+
+ if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) {
+ ext4_msg(NULL, KERN_ERR, "journaled quota format "
+ "not specified");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+
+err_quota_change:
+ ext4_msg(NULL, KERN_ERR,
+ "Cannot change quota options when quota turned on");
+ return -EINVAL;
+err_jquota_change:
+ ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota "
+ "options when quota turned on");
+ return -EINVAL;
+err_jquota_specified:
+ ext4_msg(NULL, KERN_ERR, "%s quota file already specified",
+ QTYPE2NAME(i));
+ return -EINVAL;
+#else
+ return 0;
+#endif
+}
+
+static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
+ struct super_block *sb)
+{
+ const struct ext4_fs_context *ctx = fc->fs_private;
+ const struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
+ return 0;
+
+ if (!ext4_has_feature_encrypt(sb)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "test_dummy_encryption requires encrypt feature");
+ return -EINVAL;
+ }
+ /*
+ * This mount option is just for testing, and it's not worthwhile to
+ * implement the extra complexity (e.g. RCU protection) that would be
+ * needed to allow it to be set or changed during remount. We do allow
+ * it to be specified during remount, but only if there is no change.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+ &ctx->dummy_enc_policy))
+ return 0;
+ ext4_msg(NULL, KERN_WARNING,
+ "Can't set or change test_dummy_encryption on remount");
+ return -EINVAL;
+ }
+ /* Also make sure s_mount_opts didn't contain a conflicting value. */
+ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) {
+ if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+ &ctx->dummy_enc_policy))
+ return 0;
+ ext4_msg(NULL, KERN_WARNING,
+ "Conflicting test_dummy_encryption options");
+ return -EINVAL;
+ }
+ /*
+ * fscrypt_add_test_dummy_key() technically changes the super_block, so
+ * technically it should be delayed until ext4_apply_options() like the
+ * other changes. But since we never get here for remounts (see above),
+ * and this is the last chance to report errors, we do it here.
+ */
+ err = fscrypt_add_test_dummy_key(sb, &ctx->dummy_enc_policy);
+ if (err)
+ ext4_msg(NULL, KERN_WARNING,
+ "Error adding test dummy encryption key [%d]", err);
+ return err;
+}
+
+static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx,
+ struct super_block *sb)
+{
+ if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) ||
+ /* if already set, it was already verified to be the same */
+ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy))
+ return;
+ EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy;
+ memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy));
+ ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
+}
+
+static int ext4_check_opt_consistency(struct fs_context *fc,
+ struct super_block *sb)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi = fc->s_fs_info;
+ int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
+ int err;
+
+ if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
+ ext4_msg(NULL, KERN_ERR,
+ "Mount option(s) incompatible with ext2");
+ return -EINVAL;
+ }
+ if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
+ ext4_msg(NULL, KERN_ERR,
+ "Mount option(s) incompatible with ext3");
+ return -EINVAL;
+ }
+
+ if (ctx->s_want_extra_isize >
+ (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) {
+ ext4_msg(NULL, KERN_ERR,
+ "Invalid want_extra_isize %d",
+ ctx->s_want_extra_isize);
+ return -EINVAL;
+ }
+
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) {
+ int blocksize =
+ BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
+ if (blocksize < PAGE_SIZE)
+ ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an "
+ "experimental mount option 'dioread_nolock' "
+ "for blocksize < PAGE_SIZE");
+ }
+
+ err = ext4_check_test_dummy_encryption(fc, sb);
+ if (err)
+ return err;
+
+ if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
+ if (!sbi->s_journal) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Remounting file system with no journal "
+ "so ignoring journalled data option");
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
+ } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) !=
+ test_opt(sb, DATA_FLAGS)) {
+ ext4_msg(NULL, KERN_ERR, "Cannot change data mode "
+ "on remount");
+ return -EINVAL;
+ }
+ }
+
+ if (is_remount) {
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
+ (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
+ ext4_msg(NULL, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -EINVAL;
+ }
+
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
+ (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
+fail_dax_change_remount:
+ ext4_msg(NULL, KERN_ERR, "can't change "
+ "dax mount option while remounting");
+ return -EINVAL;
+ } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) &&
+ (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) {
+ goto fail_dax_change_remount;
+ } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) &&
+ ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
+ goto fail_dax_change_remount;
+ }
+ }
+
+ return ext4_check_quota_consistency(fc, sb);
+}
+
+static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi = fc->s_fs_info;
+
+ sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;
+ sbi->s_mount_opt |= ctx->vals_s_mount_opt;
+ sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2;
+ sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2;
+ sbi->s_mount_flags &= ~ctx->mask_s_mount_flags;
+ sbi->s_mount_flags |= ctx->vals_s_mount_flags;
+ sb->s_flags &= ~ctx->mask_s_flags;
+ sb->s_flags |= ctx->vals_s_flags;
+
+#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
+ APPLY(s_commit_interval);
+ APPLY(s_stripe);
+ APPLY(s_max_batch_time);
+ APPLY(s_min_batch_time);
+ APPLY(s_want_extra_isize);
+ APPLY(s_inode_readahead_blks);
+ APPLY(s_max_dir_size_kb);
+ APPLY(s_li_wait_mult);
+ APPLY(s_resgid);
+ APPLY(s_resuid);
+
+#ifdef CONFIG_EXT4_DEBUG
+ APPLY(s_fc_debug_max_replay);
+#endif
+
+ ext4_apply_quota_options(fc, sb);
+ ext4_apply_test_dummy_encryption(ctx, sb);
+}
+
+
+static int ext4_validate_options(struct fs_context *fc)
+{
+#ifdef CONFIG_QUOTA
+ struct ext4_fs_context *ctx = fc->fs_private;
+ char *usr_qf_name, *grp_qf_name;
+
+ usr_qf_name = ctx->s_qf_names[USRQUOTA];
+ grp_qf_name = ctx->s_qf_names[GRPQUOTA];
+
+ if (usr_qf_name || grp_qf_name) {
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name)
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
+
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name)
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
+
+ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
+ ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) {
+ ext4_msg(NULL, KERN_ERR, "old and new quota "
+ "format mixing");
+ return -EINVAL;
+ }
+ }
#endif
return 1;
}
@@ -2220,12 +2875,12 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
static const char *token2str(int token)
{
- const struct match_token *t;
+ const struct fs_parameter_spec *spec;
- for (t = tokens; t->token != Opt_err; t++)
- if (t->token == token && !strchr(t->pattern, '='))
+ for (spec = ext4_param_specs; spec->name != NULL; spec++)
+ if (spec->opt == token && !spec->type)
break;
- return t->pattern;
+ return spec->name;
}
/*
@@ -2251,7 +2906,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
for (m = ext4_mount_opts; m->token != Opt_err; m++) {
int want_set = m->flags & MOPT_SET;
if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
- (m->flags & MOPT_CLEAR_ERR))
+ m->flags & MOPT_SKIP)
continue;
if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
continue; /* skip if same as the default */
@@ -2283,8 +2938,6 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
- if (sb->s_flags & SB_I_VERSION)
- SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
@@ -2308,8 +2961,30 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
if (test_opt(sb, DATA_ERR_ABORT))
SEQ_OPTS_PUTS("data_err=abort");
- if (DUMMY_ENCRYPTION_ENABLED(sbi))
- SEQ_OPTS_PUTS("test_dummy_encryption");
+
+ fscrypt_show_test_dummy_encryption(seq, sep, sb);
+
+ if (sb->s_flags & SB_INLINECRYPT)
+ SEQ_OPTS_PUTS("inlinecrypt");
+
+ if (test_opt(sb, DAX_ALWAYS)) {
+ if (IS_EXT2_SB(sb))
+ SEQ_OPTS_PUTS("dax");
+ else
+ SEQ_OPTS_PUTS("dax=always");
+ } else if (test_opt2(sb, DAX_NEVER)) {
+ SEQ_OPTS_PUTS("dax=never");
+ } else if (test_opt2(sb, DAX_INODE)) {
+ SEQ_OPTS_PUTS("dax=inode");
+ }
+
+ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+ !test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+ SEQ_OPTS_PUTS("mb_optimize_scan=0");
+ } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+ test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+ SEQ_OPTS_PUTS("mb_optimize_scan=1");
+ }
ext4_show_quota_options(seq, sb);
return 0;
@@ -2341,6 +3016,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
ext4_msg(sb, KERN_ERR, "revision level too high, "
"forcing read-only mode");
err = -EROFS;
+ goto done;
}
if (read_only)
goto done;
@@ -2369,10 +3045,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
ext4_update_tstamp(es, s_mtime);
- if (sbi->s_journal)
+ if (sbi->s_journal) {
ext4_set_feature_journal_needs_recovery(sb);
+ if (ext4_has_feature_orphan_file(sb))
+ ext4_set_feature_orphan_present(sb);
+ }
- err = ext4_commit_super(sb, 1);
+ err = ext4_commit_super(sb);
done:
if (test_opt(sb, DEBUG))
printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
@@ -2382,8 +3061,6 @@ done:
EXT4_BLOCKS_PER_GROUP(sb),
EXT4_INODES_PER_GROUP(sb),
sbi->s_mount_opt, sbi->s_mount_opt2);
-
- cleancache_init_fs(sb);
return err;
}
@@ -2652,165 +3329,6 @@ static int ext4_check_descriptors(struct super_block *sb,
return 1;
}
-/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
- * the superblock) which were deleted from all directories, but held open by
- * a process at the time of a crash. We walk the list and try to delete these
- * inodes at recovery time (only with a read-write filesystem).
- *
- * In order to keep the orphan inode chain consistent during traversal (in
- * case of crash during recovery), we link each inode into the superblock
- * orphan list_head and handle it the same way as an inode deletion during
- * normal operation (which journals the operations for us).
- *
- * We only do an iget() and an iput() on each inode, which is very safe if we
- * accidentally point at an in-use or already deleted inode. The worst that
- * can happen in this case is that we get a "bit already cleared" message from
- * ext4_free_inode(). The only reason we would point at a wrong inode is if
- * e2fsck was run on this filesystem, and it must have already done the orphan
- * inode cleanup for us, so we can safely abort without any further action.
- */
-static void ext4_orphan_cleanup(struct super_block *sb,
- struct ext4_super_block *es)
-{
- unsigned int s_flags = sb->s_flags;
- int ret, nr_orphans = 0, nr_truncates = 0;
-#ifdef CONFIG_QUOTA
- int quota_update = 0;
- int i;
-#endif
- if (!es->s_last_orphan) {
- jbd_debug(4, "no orphan inodes to clean up\n");
- return;
- }
-
- if (bdev_read_only(sb->s_bdev)) {
- ext4_msg(sb, KERN_ERR, "write access "
- "unavailable, skipping orphan cleanup");
- return;
- }
-
- /* Check if feature set would not allow a r/w mount */
- if (!ext4_feature_set_ok(sb, 0)) {
- ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
- "unknown ROCOMPAT features");
- return;
- }
-
- if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- /* don't clear list on RO mount w/ errors */
- if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
- ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
- "clearing orphan list.\n");
- es->s_last_orphan = 0;
- }
- jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
- return;
- }
-
- if (s_flags & SB_RDONLY) {
- ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
- sb->s_flags &= ~SB_RDONLY;
- }
-#ifdef CONFIG_QUOTA
- /* Needed for iput() to work correctly and not trash data */
- sb->s_flags |= SB_ACTIVE;
-
- /*
- * Turn on quotas which were not enabled for read-only mounts if
- * filesystem has quota feature, so that they are updated correctly.
- */
- if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
- int ret = ext4_enable_quotas(sb);
-
- if (!ret)
- quota_update = 1;
- else
- ext4_msg(sb, KERN_ERR,
- "Cannot turn on quotas: error %d", ret);
- }
-
- /* Turn on journaled quotas used for old sytle */
- for (i = 0; i < EXT4_MAXQUOTAS; i++) {
- if (EXT4_SB(sb)->s_qf_names[i]) {
- int ret = ext4_quota_on_mount(sb, i);
-
- if (!ret)
- quota_update = 1;
- else
- ext4_msg(sb, KERN_ERR,
- "Cannot turn on journaled "
- "quota: type %d: error %d", i, ret);
- }
- }
-#endif
-
- while (es->s_last_orphan) {
- struct inode *inode;
-
- /*
- * We may have encountered an error during cleanup; if
- * so, skip the rest.
- */
- if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
- es->s_last_orphan = 0;
- break;
- }
-
- inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
- if (IS_ERR(inode)) {
- es->s_last_orphan = 0;
- break;
- }
-
- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
- dquot_initialize(inode);
- if (inode->i_nlink) {
- if (test_opt(sb, DEBUG))
- ext4_msg(sb, KERN_DEBUG,
- "%s: truncating inode %lu to %lld bytes",
- __func__, inode->i_ino, inode->i_size);
- jbd_debug(2, "truncating inode %lu to %lld bytes\n",
- inode->i_ino, inode->i_size);
- inode_lock(inode);
- truncate_inode_pages(inode->i_mapping, inode->i_size);
- ret = ext4_truncate(inode);
- if (ret)
- ext4_std_error(inode->i_sb, ret);
- inode_unlock(inode);
- nr_truncates++;
- } else {
- if (test_opt(sb, DEBUG))
- ext4_msg(sb, KERN_DEBUG,
- "%s: deleting unreferenced inode %lu",
- __func__, inode->i_ino);
- jbd_debug(2, "deleting unreferenced inode %lu\n",
- inode->i_ino);
- nr_orphans++;
- }
- iput(inode); /* The delete magic happens here! */
- }
-
-#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
-
- if (nr_orphans)
- ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
- PLURAL(nr_orphans));
- if (nr_truncates)
- ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
- PLURAL(nr_truncates));
-#ifdef CONFIG_QUOTA
- /* Turn off quotas if they were enabled for orphan cleanup */
- if (quota_update) {
- for (i = 0; i < EXT4_MAXQUOTAS; i++) {
- if (sb_dqopt(sb)->files[i])
- dquot_quota_off(sb, i);
- }
- }
-#endif
- sb->s_flags = s_flags; /* Restore SB_RDONLY status */
-}
-
/*
* Maximal extent format file size.
* Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -2863,17 +3381,18 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
*/
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
- loff_t res = EXT4_NDIR_BLOCKS;
+ loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
int meta_blocks;
- loff_t upper_limit;
- /* This is calculated to be the largest file size for a dense, block
+ unsigned int ppb = 1 << (bits - 2);
+
+ /*
+ * This is calculated to be the largest file size for a dense, block
* mapped file such that the file's total number of 512-byte sectors,
* including data and all indirect blocks, does not exceed (2^48 - 1).
*
* __u32 i_blocks_lo and _u16 i_blocks_high represent the total
* number of 512-byte sectors of the file.
*/
-
if (!has_huge_files) {
/*
* !has_huge_files or implies that the inode i_block field
@@ -2896,23 +3415,38 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
}
+ /* Compute how many blocks we can address by block tree */
+ res += ppb;
+ res += ppb * ppb;
+ res += ((loff_t)ppb) * ppb * ppb;
+ /* Compute how many metadata blocks are needed */
+ meta_blocks = 1;
+ meta_blocks += 1 + ppb;
+ meta_blocks += 1 + ppb + ppb * ppb;
+ /* Does block tree limit file size? */
+ if (res + meta_blocks <= upper_limit)
+ goto check_lfs;
+
+ res = upper_limit;
+ /* How many metadata blocks are needed for addressing upper_limit? */
+ upper_limit -= EXT4_NDIR_BLOCKS;
/* indirect blocks */
meta_blocks = 1;
+ upper_limit -= ppb;
/* double indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2));
- /* tripple indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
- upper_limit -= meta_blocks;
- upper_limit <<= bits;
-
- res += 1LL << (bits-2);
- res += 1LL << (2*(bits-2));
- res += 1LL << (3*(bits-2));
+ if (upper_limit < ppb * ppb) {
+ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
+ res -= meta_blocks;
+ goto check_lfs;
+ }
+ meta_blocks += 1 + ppb;
+ upper_limit -= ppb * ppb;
+ /* tripple indirect blocks for the rest */
+ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
+ DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
+ res -= meta_blocks;
+check_lfs:
res <<= bits;
- if (res > upper_limit)
- res = upper_limit;
-
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
@@ -2990,7 +3524,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
* Returns 1 if this filesystem can be mounted as requested,
* 0 if it cannot be.
*/
-static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+int ext4_feature_set_ok(struct super_block *sb, int readonly)
{
if (ext4_has_unknown_ext4_incompat_features(sb)) {
ext4_msg(sb, KERN_ERR,
@@ -3001,7 +3535,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
-#ifndef CONFIG_UNICODE
+#if !IS_ENABLED(CONFIG_UNICODE)
if (ext4_has_feature_casefold(sb)) {
ext4_msg(sb, KERN_ERR,
"Filesystem with casefold feature cannot be "
@@ -3096,15 +3630,34 @@ static void print_daily_error_info(struct timer_list *t)
static int ext4_run_li_request(struct ext4_li_request *elr)
{
struct ext4_group_desc *gdp = NULL;
- ext4_group_t group, ngroups;
- struct super_block *sb;
- unsigned long timeout = 0;
+ struct super_block *sb = elr->lr_super;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t group = elr->lr_next_group;
+ unsigned int prefetch_ios = 0;
int ret = 0;
+ u64 start_time;
+
+ if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
+ elr->lr_next_group = ext4_mb_prefetch(sb, group,
+ EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
+ if (prefetch_ios)
+ ext4_mb_prefetch_fini(sb, elr->lr_next_group,
+ prefetch_ios);
+ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
+ prefetch_ios);
+ if (group >= elr->lr_next_group) {
+ ret = 1;
+ if (elr->lr_first_not_zeroed != ngroups &&
+ !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
+ elr->lr_next_group = elr->lr_first_not_zeroed;
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ ret = 0;
+ }
+ }
+ return ret;
+ }
- sb = elr->lr_super;
- ngroups = EXT4_SB(sb)->s_groups_count;
-
- for (group = elr->lr_next_group; group < ngroups; group++) {
+ for (; group < ngroups; group++) {
gdp = ext4_get_group_desc(sb, group, NULL);
if (!gdp) {
ret = 1;
@@ -3119,13 +3672,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- timeout = jiffies;
+ start_time = ktime_get_real_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
+ trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- timeout = (jiffies - timeout) *
- elr->lr_sbi->s_li_wait_mult;
- elr->lr_timeout = timeout;
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
@@ -3139,15 +3692,11 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
*/
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
- struct ext4_sb_info *sbi;
-
if (!elr)
return;
- sbi = elr->lr_sbi;
-
list_del(&elr->lr_request);
- sbi->s_li_request = NULL;
+ EXT4_SB(elr->lr_super)->s_li_request = NULL;
kfree(elr);
}
@@ -3178,12 +3727,13 @@ static struct task_struct *ext4_lazyinit_task;
*/
static int ext4_lazyinit_thread(void *arg)
{
- struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+ struct ext4_lazy_init *eli = arg;
struct list_head *pos, *n;
struct ext4_li_request *elr;
unsigned long next_wakeup, cur;
BUG_ON(NULL == eli);
+ set_freezable();
cont_thread:
while (true) {
@@ -3228,8 +3778,7 @@ cont_thread:
}
if (!progress) {
elr->lr_next_sched = jiffies +
- (prandom_u32()
- % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
+ prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ);
}
if (time_before(elr->lr_next_sched, next_wakeup))
next_wakeup = elr->lr_next_sched;
@@ -3356,7 +3905,6 @@ static int ext4_li_info_new(void)
static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
ext4_group_t start)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_li_request *elr;
elr = kzalloc(sizeof(*elr), GFP_KERNEL);
@@ -3364,16 +3912,21 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
return NULL;
elr->lr_super = sb;
- elr->lr_sbi = sbi;
- elr->lr_next_group = start;
+ elr->lr_first_not_zeroed = start;
+ if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ elr->lr_next_group = start;
+ } else {
+ elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
+ }
/*
* Randomize first schedule time of the request to
* spread the inode table initialization requests
* better.
*/
- elr->lr_next_sched = jiffies + (prandom_u32() %
- (EXT4_DEF_LI_MAX_START_DELAY * HZ));
+ elr->lr_next_sched = jiffies + prandom_u32_max(
+ EXT4_DEF_LI_MAX_START_DELAY * HZ);
return elr;
}
@@ -3395,8 +3948,9 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
- if (first_not_zeroed == ngroups || sb_rdonly(sb) ||
- !test_opt(sb, INIT_INODE_TABLE))
+ if (sb_rdonly(sb) ||
+ (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
+ (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
@@ -3513,9 +4067,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_fsblk_t first_block, last_block, b;
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int s, j, count = 0;
+ int has_super = ext4_bg_has_super(sb, grp);
if (!ext4_has_feature_bigalloc(sb))
- return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+ return (has_super + ext4_bg_num_gdb(sb, grp) +
+ (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
sbi->s_itb_per_group + 2);
first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
@@ -3607,9 +4163,10 @@ int ext4_calculate_overhead(struct super_block *sb)
* Add the internal journal blocks whether the journal has been
* loaded or not
*/
- if (sbi->s_journal && !sbi->journal_bdev)
- overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
- else if (ext4_has_feature_journal(sb) && !sbi->s_journal) {
+ if (sbi->s_journal && !sbi->s_journal_bdev)
+ overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
+ else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
+ /* j_inum for internal journal is non-zero */
j_inode = ext4_get_journal_inode(sb, j_inum);
if (j_inode) {
j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
@@ -3655,124 +4212,75 @@ static void ext4_set_resv_clusters(struct super_block *sb)
atomic64_set(&sbi->s_resv_clusters, resv_clusters);
}
-static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+static const char *ext4_quota_mode(struct super_block *sb)
{
- struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
- char *orig_data = kstrdup(data, GFP_KERNEL);
- struct buffer_head *bh, **group_desc;
- struct ext4_super_block *es = NULL;
- struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- struct flex_groups **flex_groups;
- ext4_fsblk_t block;
- ext4_fsblk_t sb_block = get_sb_block(&data);
- ext4_fsblk_t logical_sb_block;
- unsigned long offset = 0;
- unsigned long journal_devnum = 0;
- unsigned long def_mount_opts;
- struct inode *root;
- const char *descr;
- int ret = -ENOMEM;
- int blocksize, clustersize;
- unsigned int db_count;
- unsigned int i;
- int needs_recovery, has_huge_files, has_bigalloc;
- __u64 blocks_count;
- int err = 0;
- unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
- ext4_group_t first_not_zeroed;
+#ifdef CONFIG_QUOTA
+ if (!ext4_quota_capable(sb))
+ return "none";
- if ((data && !orig_data) || !sbi)
- goto out_free_base;
+ if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb))
+ return "journalled";
+ else
+ return "writeback";
+#else
+ return "disabled";
+#endif
+}
- sbi->s_daxdev = dax_dev;
- sbi->s_blockgroup_lock =
- kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
- if (!sbi->s_blockgroup_lock)
- goto out_free_base;
+static void ext4_setup_csum_trigger(struct super_block *sb,
+ enum ext4_journal_trigger_type type,
+ void (*trigger)(
+ struct jbd2_buffer_trigger_type *type,
+ struct buffer_head *bh,
+ void *mapped_data,
+ size_t size))
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
- sb->s_fs_info = sbi;
- sbi->s_sb = sb;
- sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
- sbi->s_sb_block = sb_block;
- if (sb->s_bdev->bd_part)
- sbi->s_sectors_written_start =
- part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
+ sbi->s_journal_triggers[type].sb = sb;
+ sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger;
+}
- /* Cleanup superblock name */
- strreplace(sb->s_id, '/', '!');
+static void ext4_free_sbi(struct ext4_sb_info *sbi)
+{
+ if (!sbi)
+ return;
- /* -EINVAL is default */
- ret = -EINVAL;
- blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
- if (!blocksize) {
- ext4_msg(sb, KERN_ERR, "unable to set blocksize");
- goto out_fail;
- }
+ kfree(sbi->s_blockgroup_lock);
+ fs_put_dax(sbi->s_daxdev, NULL);
+ kfree(sbi);
+}
- /*
- * The ext4 superblock will not be buffer aligned for other than 1kB
- * block sizes. We need to calculate the offset from buffer start.
- */
- if (blocksize != EXT4_MIN_BLOCK_SIZE) {
- logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
- offset = do_div(logical_sb_block, blocksize);
- } else {
- logical_sb_block = sb_block;
- }
+static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi;
- if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
- ext4_msg(sb, KERN_ERR, "unable to read superblock");
- goto out_fail;
- }
- /*
- * Note: s_es must be initialized as soon as possible because
- * some ext4 macro-instructions depend on its value
- */
- es = (struct ext4_super_block *) (bh->b_data + offset);
- sbi->s_es = es;
- sb->s_magic = le16_to_cpu(es->s_magic);
- if (sb->s_magic != EXT4_SUPER_MAGIC)
- goto cantfind_ext4;
- sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi)
+ return NULL;
- /* Warn if metadata_csum and gdt_csum are both set. */
- if (ext4_has_feature_metadata_csum(sb) &&
- ext4_has_feature_gdt_csum(sb))
- ext4_warning(sb, "metadata_csum and uninit_bg are "
- "redundant flags; please run fsck.");
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+ NULL, NULL);
- /* Check for a known checksum algorithm */
- if (!ext4_verify_csum_type(sb, es)) {
- ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
- "unknown checksum algorithm.");
- silent = 1;
- goto cantfind_ext4;
- }
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
- /* Load the checksum driver */
- sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(sbi->s_chksum_driver)) {
- ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
- ret = PTR_ERR(sbi->s_chksum_driver);
- sbi->s_chksum_driver = NULL;
- goto failed_mount;
- }
+ if (!sbi->s_blockgroup_lock)
+ goto err_out;
- /* Check superblock checksum */
- if (!ext4_superblock_csum_verify(sb, es)) {
- ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
- "invalid superblock checksum. Run e2fsck?");
- silent = 1;
- ret = -EFSBADCRC;
- goto cantfind_ext4;
- }
+ sb->s_fs_info = sbi;
+ sbi->s_sb = sb;
+ return sbi;
+err_out:
+ fs_put_dax(sbi->s_daxdev, NULL);
+ kfree(sbi);
+ return NULL;
+}
- /* Precompute checksum seed for all metadata */
- if (ext4_has_feature_csum_seed(sb))
- sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
- else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
- sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
- sizeof(es->s_uuid));
+static void ext4_set_def_opts(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ unsigned long def_mount_opts;
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -3785,10 +4293,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sb, NO_UID32);
/* xattr user namespace & acls are now defaulted on */
set_opt(sb, XATTR_USER);
- set_opt(sb, DIOREAD_NOLOCK);
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
+ if (ext4_has_feature_fast_commit(sb))
+ set_opt2(sb, JOURNAL_FAST_COMMIT);
/* don't forget to enable journal_csum when metadata_csum is enabled. */
if (ext4_has_metadata_csum(sb))
set_opt(sb, JOURNAL_CHECKSUM);
@@ -3800,9 +4309,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
set_opt(sb, WRITEBACK_DATA);
- if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+ if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
set_opt(sb, ERRORS_PANIC);
- else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
+ else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
set_opt(sb, ERRORS_CONT);
else
set_opt(sb, ERRORS_RO);
@@ -3811,12 +4320,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (def_mount_opts & EXT4_DEFM_DISCARD)
set_opt(sb, DISCARD);
- sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
- sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
- sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
- sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
- sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
-
if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
set_opt(sb, BARRIER);
@@ -3828,20 +4331,96 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
- /*
- * set default s_li_wait_mult for lazyinit, for the case there is
- * no mount option specified.
- */
- sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ if (sb->s_blocksize == PAGE_SIZE)
+ set_opt(sb, DIOREAD_NOLOCK);
+}
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
- if (blocksize < EXT4_MIN_BLOCK_SIZE ||
- blocksize > EXT4_MAX_BLOCK_SIZE) {
- ext4_msg(sb, KERN_ERR,
- "Unsupported filesystem blocksize %d (%d log_block_size)",
- blocksize, le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
+static int ext4_handle_clustersize(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int clustersize;
+
+ /* Handle clustersize */
+ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+ if (ext4_has_feature_bigalloc(sb)) {
+ if (clustersize < sb->s_blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "cluster size (%d) smaller than "
+ "block size (%lu)", clustersize, sb->s_blocksize);
+ return -EINVAL;
+ }
+ sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+ le32_to_cpu(es->s_log_block_size);
+ sbi->s_clusters_per_group =
+ le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
+ ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+ "clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group,
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ } else {
+ if (clustersize != sb->s_blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "fragment/cluster size (%d) != "
+ "block size (%lu)", clustersize, sb->s_blocksize);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
+ sbi->s_blocks_per_group);
+ return -EINVAL;
+ }
+ sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+ sbi->s_cluster_bits = 0;
}
+ sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
+
+ /* Do we have standard group size of clustersize * 8 blocks ? */
+ if (sbi->s_blocks_per_group == clustersize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
+
+ return 0;
+}
+
+static void ext4_fast_commit_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* Initialize fast commit stuff */
+ atomic_set(&sbi->s_fc_subtid, 0);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
+ sbi->s_fc_bytes = 0;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ sbi->s_fc_ineligible_tid = 0;
+ spin_lock_init(&sbi->s_fc_lock);
+ memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+ sbi->s_fc_replay_state.fc_regions = NULL;
+ sbi->s_fc_replay_state.fc_regions_size = 0;
+ sbi->s_fc_replay_state.fc_regions_used = 0;
+ sbi->s_fc_replay_state.fc_regions_valid = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes = NULL;
+ sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+}
+
+static int ext4_inode_info_init(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -3852,16 +4431,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
sbi->s_first_ino);
- goto failed_mount;
+ return -EINVAL;
}
if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
(!is_power_of_2(sbi->s_inode_size)) ||
- (sbi->s_inode_size > blocksize)) {
+ (sbi->s_inode_size > sb->s_blocksize)) {
ext4_msg(sb, KERN_ERR,
"unsupported inode size: %d",
sbi->s_inode_size);
- ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
- goto failed_mount;
+ ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize);
+ return -EINVAL;
}
/*
* i_atime_extra is the last extra field available for
@@ -3879,6 +4458,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_time_min = EXT4_TIMESTAMP_MIN;
}
+
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
@@ -3890,7 +4470,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (v > max) {
ext4_msg(sb, KERN_ERR,
"bad s_want_extra_isize: %d", v);
- goto failed_mount;
+ return -EINVAL;
}
if (sbi->s_want_extra_isize < v)
sbi->s_want_extra_isize = v;
@@ -3899,101 +4479,112 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (v > max) {
ext4_msg(sb, KERN_ERR,
"bad s_min_extra_isize: %d", v);
- goto failed_mount;
+ return -EINVAL;
}
if (sbi->s_want_extra_isize < v)
sbi->s_want_extra_isize = v;
}
}
- if (sbi->s_es->s_mount_opts[0]) {
- char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
- sizeof(sbi->s_es->s_mount_opts),
- GFP_KERNEL);
- if (!s_mount_opts)
- goto failed_mount;
- if (!parse_options(s_mount_opts, sb, &journal_devnum,
- &journal_ioprio, 0)) {
- ext4_msg(sb, KERN_WARNING,
- "failed to parse options in superblock: %s",
- s_mount_opts);
- }
- kfree(s_mount_opts);
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+ const struct ext4_sb_encodings *encoding_info;
+ struct unicode_map *encoding;
+ __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
+
+ if (!ext4_has_feature_casefold(sb) || sb->s_encoding)
+ return 0;
+
+ encoding_info = ext4_sb_read_encoding(es);
+ if (!encoding_info) {
+ ext4_msg(sb, KERN_ERR,
+ "Encoding requested by superblock is unknown");
+ return -EINVAL;
}
- sbi->s_def_mount_opt = sbi->s_mount_opt;
- if (!parse_options((char *) data, sb, &journal_devnum,
- &journal_ioprio, 0))
- goto failed_mount;
-#ifdef CONFIG_UNICODE
- if (ext4_has_feature_casefold(sb) && !sbi->s_encoding) {
- const struct ext4_sb_encodings *encoding_info;
- struct unicode_map *encoding;
- __u16 encoding_flags;
+ encoding = utf8_load(encoding_info->version);
+ if (IS_ERR(encoding)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with superblock charset: %s-%u.%u.%u "
+ "not supported by the kernel. flags: 0x%x.",
+ encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
+ return -EINVAL;
+ }
+ ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
+ "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
- if (ext4_has_feature_encrypt(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Can't mount with encoding and encryption");
- goto failed_mount;
- }
+ sb->s_encoding = encoding;
+ sb->s_encoding_flags = encoding_flags;
- if (ext4_sb_read_encoding(es, &encoding_info,
- &encoding_flags)) {
- ext4_msg(sb, KERN_ERR,
- "Encoding requested by superblock is unknown");
- goto failed_mount;
- }
+ return 0;
+}
+#else
+static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+ return 0;
+}
+#endif
- encoding = utf8_load(encoding_info->version);
- if (IS_ERR(encoding)) {
- ext4_msg(sb, KERN_ERR,
- "can't mount with superblock charset: %s-%s "
- "not supported by the kernel. flags: 0x%x.",
- encoding_info->name, encoding_info->version,
- encoding_flags);
- goto failed_mount;
- }
- ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
- "%s-%s with flags 0x%hx", encoding_info->name,
- encoding_info->version?:"\b", encoding_flags);
+static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* Warn if metadata_csum and gdt_csum are both set. */
+ if (ext4_has_feature_metadata_csum(sb) &&
+ ext4_has_feature_gdt_csum(sb))
+ ext4_warning(sb, "metadata_csum and uninit_bg are "
+ "redundant flags; please run fsck.");
- sbi->s_encoding = encoding;
- sbi->s_encoding_flags = encoding_flags;
+ /* Check for a known checksum algorithm */
+ if (!ext4_verify_csum_type(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "unknown checksum algorithm.");
+ return -EINVAL;
}
-#endif
+ ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
+ ext4_orphan_file_block_trigger);
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n");
- clear_opt(sb, DIOREAD_NOLOCK);
- if (test_opt2(sb, EXPLICIT_DELALLOC)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and delalloc");
- goto failed_mount;
- }
- if (test_opt(sb, DIOREAD_NOLOCK)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dioread_nolock");
- goto failed_mount;
- }
- if (test_opt(sb, DAX)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- goto failed_mount;
- }
- if (ext4_has_feature_encrypt(sb)) {
- ext4_msg(sb, KERN_WARNING,
- "encrypted files will use data=ordered "
- "instead of data journaling mode");
- }
- if (test_opt(sb, DELALLOC))
- clear_opt(sb, DELALLOC);
- } else {
- sb->s_iflags |= SB_I_CGROUPWB;
+ /* Load the checksum driver */
+ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(sbi->s_chksum_driver)) {
+ int ret = PTR_ERR(sbi->s_chksum_driver);
+ ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+ sbi->s_chksum_driver = NULL;
+ return ret;
}
- sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+ /* Check superblock checksum */
+ if (!ext4_superblock_csum_verify(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "invalid superblock checksum. Run e2fsck?");
+ return -EFSBADCRC;
+ }
+ /* Precompute checksum seed for all metadata */
+ if (ext4_has_feature_csum_seed(sb))
+ sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
+ else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
+ sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+ sizeof(es->s_uuid));
+ return 0;
+}
+
+static int ext4_check_feature_compatibility(struct super_block *sb,
+ struct ext4_super_block *es,
+ int silent)
+{
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
(ext4_has_compat_features(sb) ||
ext4_has_ro_compat_features(sb) ||
@@ -4007,7 +4598,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (ext4_has_feature_64bit(sb)) {
ext4_msg(sb, KERN_ERR,
"The Hurd can't support 64-bit file systems");
- goto failed_mount;
+ return -EINVAL;
}
/*
@@ -4017,7 +4608,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (ext4_has_feature_ea_inode(sb)) {
ext4_msg(sb, KERN_ERR,
"ea_inode feature is not supported for Hurd");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4031,10 +4622,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* it's actually an ext[34] filesystem.
*/
if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
- goto failed_mount;
+ return -EINVAL;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
"to feature incompatibilities");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4048,10 +4639,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* it's actually an ext4 filesystem.
*/
if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
- goto failed_mount;
+ return -EINVAL;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
"to feature incompatibilities");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4061,37 +4652,483 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
- goto failed_mount;
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ext4_geometry_check(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ __u64 blocks_count;
+
+ /* check blocks count against device size */
+ blocks_count = sb_bdev_nr_blocks(sb);
+ if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+ "exceeds size of device (%llu blocks)",
+ ext4_blocks_count(es), blocks_count);
+ return -EINVAL;
+ }
+
+ /*
+ * It makes no sense for the first data block to be beyond the end
+ * of the filesystem.
+ */
+ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+ "block %u is beyond end of filesystem (%llu)",
+ le32_to_cpu(es->s_first_data_block),
+ ext4_blocks_count(es));
+ return -EINVAL;
+ }
+ if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
+ (sbi->s_cluster_ratio == 1)) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+ "block is 0 with a 1k block and cluster size");
+ return -EINVAL;
+ }
+
+ blocks_count = (ext4_blocks_count(es) -
+ le32_to_cpu(es->s_first_data_block) +
+ EXT4_BLOCKS_PER_GROUP(sb) - 1);
+ do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+ if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+ ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
+ "(block count %llu, first data block %u, "
+ "blocks per group %lu)", blocks_count,
+ ext4_blocks_count(es),
+ le32_to_cpu(es->s_first_data_block),
+ EXT4_BLOCKS_PER_GROUP(sb));
+ return -EINVAL;
+ }
+ sbi->s_groups_count = blocks_count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+ if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
+ le32_to_cpu(es->s_inodes_count)) {
+ ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
+ le32_to_cpu(es->s_inodes_count),
+ ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void ext4_group_desc_free(struct ext4_sb_info *sbi)
+{
+ struct buffer_head **group_desc;
+ int i;
+
+ rcu_read_lock();
+ group_desc = rcu_dereference(sbi->s_group_desc);
+ for (i = 0; i < sbi->s_gdb_count; i++)
+ brelse(group_desc[i]);
+ kvfree(group_desc);
+ rcu_read_unlock();
+}
+
+static int ext4_group_desc_init(struct super_block *sb,
+ struct ext4_super_block *es,
+ ext4_fsblk_t logical_sb_block,
+ ext4_group_t *first_not_zeroed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned int db_count;
+ ext4_fsblk_t block;
+ int ret;
+ int i;
+
+ db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+ EXT4_DESC_PER_BLOCK(sb);
+ if (ext4_has_feature_meta_bg(sb)) {
+ if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
+ ext4_msg(sb, KERN_WARNING,
+ "first meta block group too large: %u "
+ "(group descriptor block count %u)",
+ le32_to_cpu(es->s_first_meta_bg), db_count);
+ return -EINVAL;
+ }
+ }
+ rcu_assign_pointer(sbi->s_group_desc,
+ kvmalloc_array(db_count,
+ sizeof(struct buffer_head *),
+ GFP_KERNEL));
+ if (sbi->s_group_desc == NULL) {
+ ext4_msg(sb, KERN_ERR, "not enough memory");
+ return -ENOMEM;
+ }
+
+ bgl_lock_init(sbi->s_blockgroup_lock);
+
+ /* Pre-read the descriptors into the buffer cache */
+ for (i = 0; i < db_count; i++) {
+ block = descriptor_loc(sb, logical_sb_block, i);
+ ext4_sb_breadahead_unmovable(sb, block);
+ }
+
+ for (i = 0; i < db_count; i++) {
+ struct buffer_head *bh;
+
+ block = descriptor_loc(sb, logical_sb_block, i);
+ bh = ext4_sb_bread_unmovable(sb, block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't read group descriptor %d", i);
+ sbi->s_gdb_count = i;
+ ret = PTR_ERR(bh);
+ goto out;
+ }
+ rcu_read_lock();
+ rcu_dereference(sbi->s_group_desc)[i] = bh;
+ rcu_read_unlock();
+ }
+ sbi->s_gdb_count = db_count;
+ if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) {
+ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+ return 0;
+out:
+ ext4_group_desc_free(sbi);
+ return ret;
+}
+
+static int ext4_load_and_init_journal(struct super_block *sb,
+ struct ext4_super_block *es,
+ struct ext4_fs_context *ctx)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ err = ext4_load_journal(sb, es, ctx->journal_devnum);
+ if (err)
+ return err;
+
+ if (ext4_has_feature_64bit(sb) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
+ goto out;
+ }
+
+ if (!set_journal_csum_feature_set(sb)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+ "feature set");
+ goto out;
+ }
+
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "Failed to set fast commit journal feature");
+ goto out;
+ }
+
+ /* We have now updated the journal if required, so we can
+ * validate the data journaling mode. */
+ switch (test_opt(sb, DATA_FLAGS)) {
+ case 0:
+ /* No mode set, assume a default based on the journal
+ * capabilities: ORDERED_DATA if the journal can
+ * cope, else JOURNAL_DATA
+ */
+ if (jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ set_opt(sb, ORDERED_DATA);
+ sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+ } else {
+ set_opt(sb, JOURNAL_DATA);
+ sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+ }
+ break;
+
+ case EXT4_MOUNT_ORDERED_DATA:
+ case EXT4_MOUNT_WRITEBACK_DATA:
+ if (!jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ ext4_msg(sb, KERN_ERR, "Journal does not support "
+ "requested data journaling mode");
+ goto out;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+ test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit in data=ordered mode");
+ goto out;
+ }
+
+ set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
+
+ sbi->s_journal->j_submit_inode_data_buffers =
+ ext4_journal_submit_inode_data_buffers;
+ sbi->s_journal->j_finish_inode_data_buffers =
+ ext4_journal_finish_inode_data_buffers;
+
+ return 0;
+
+out:
+ /* flush s_error_work before journal destroy. */
+ flush_work(&sbi->s_error_work);
+ jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ return -EINVAL;
+}
+
+static int ext4_journal_data_mode_check(struct super_block *sb)
+{
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with "
+ "data=journal disables delayed allocation, "
+ "dioread_nolock, O_DIRECT and fast_commit support!\n");
+ /* can't mount with both data=journal and dioread_nolock. */
+ clear_opt(sb, DIOREAD_NOLOCK);
+ clear_opt2(sb, JOURNAL_FAST_COMMIT);
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ return -EINVAL;
+ }
+ if (test_opt(sb, DAX_ALWAYS)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -EINVAL;
+ }
+ if (ext4_has_feature_encrypt(sb)) {
+ ext4_msg(sb, KERN_WARNING,
+ "encrypted files will use data=ordered "
+ "instead of data journaling mode");
+ }
+ if (test_opt(sb, DELALLOC))
+ clear_opt(sb, DELALLOC);
+ } else {
+ sb->s_iflags |= SB_I_CGROUPWB;
+ }
+
+ return 0;
+}
+
+static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
+ int silent)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es;
+ ext4_fsblk_t logical_sb_block;
+ unsigned long offset = 0;
+ struct buffer_head *bh;
+ int ret = -EINVAL;
+ int blocksize;
+
+ blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
+ if (!blocksize) {
+ ext4_msg(sb, KERN_ERR, "unable to set blocksize");
+ return -EINVAL;
+ }
+
+ /*
+ * The ext4 superblock will not be buffer aligned for other than 1kB
+ * block sizes. We need to calculate the offset from buffer start.
+ */
+ if (blocksize != EXT4_MIN_BLOCK_SIZE) {
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ } else {
+ logical_sb_block = sbi->s_sb_block;
+ }
+
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR, "unable to read superblock");
+ return PTR_ERR(bh);
+ }
+ /*
+ * Note: s_es must be initialized as soon as possible because
+ * some ext4 macro-instructions depend on its value
+ */
+ es = (struct ext4_super_block *) (bh->b_data + offset);
+ sbi->s_es = es;
+ sb->s_magic = le16_to_cpu(es->s_magic);
+ if (sb->s_magic != EXT4_SUPER_MAGIC) {
+ if (!silent)
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+ goto out;
+ }
if (le32_to_cpu(es->s_log_block_size) >
(EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
ext4_msg(sb, KERN_ERR,
"Invalid log block size: %u",
le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
+ goto out;
}
if (le32_to_cpu(es->s_log_cluster_size) >
(EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
ext4_msg(sb, KERN_ERR,
"Invalid log cluster size: %u",
le32_to_cpu(es->s_log_cluster_size));
- goto failed_mount;
+ goto out;
}
- if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
+ blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+ /*
+ * If the default block size is not the same as the real block size,
+ * we need to reload it.
+ */
+ if (sb->s_blocksize == blocksize) {
+ *lsb = logical_sb_block;
+ sbi->s_sbh = bh;
+ return 0;
+ }
+
+ /*
+ * bh must be released before kill_bdev(), otherwise
+ * it won't be freed and its page also. kill_bdev()
+ * is called by sb_set_blocksize().
+ */
+ brelse(bh);
+ /* Validate the filesystem blocksize */
+ if (!sb_set_blocksize(sb, blocksize)) {
+ ext4_msg(sb, KERN_ERR, "bad block size %d",
+ blocksize);
+ bh = NULL;
+ goto out;
+ }
+
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try");
+ ret = PTR_ERR(bh);
+ bh = NULL;
+ goto out;
+ }
+ es = (struct ext4_super_block *)(bh->b_data + offset);
+ sbi->s_es = es;
+ if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
+ ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
+ goto out;
+ }
+ *lsb = logical_sb_block;
+ sbi->s_sbh = bh;
+ return 0;
+out:
+ brelse(bh);
+ return ret;
+}
+
+static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+{
+ struct ext4_super_block *es = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct flex_groups **flex_groups;
+ ext4_fsblk_t block;
+ ext4_fsblk_t logical_sb_block;
+ struct inode *root;
+ int ret = -ENOMEM;
+ unsigned int i;
+ int needs_recovery, has_huge_files;
+ int err = 0;
+ ext4_group_t first_not_zeroed;
+ struct ext4_fs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
+
+ /* Set defaults for the variables that will be set during parsing */
+ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
+ ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+
+ sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
+ sbi->s_sectors_written_start =
+ part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
+
+ /* -EINVAL is default */
+ ret = -EINVAL;
+ err = ext4_load_super(sb, &logical_sb_block, silent);
+ if (err)
+ goto out_fail;
+
+ es = sbi->s_es;
+ sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+
+ err = ext4_init_metadata_csum(sb, es);
+ if (err)
+ goto failed_mount;
+
+ ext4_set_def_opts(sb, es);
+
+ sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+ sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+ sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+ sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+ sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+
+ /*
+ * set default s_li_wait_mult for lazyinit, for the case there is
+ * no mount option specified.
+ */
+ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
+ if (ext4_inode_info_init(sb, es))
+ goto failed_mount;
+
+ err = parse_apply_sb_mount_options(sb, ctx);
+ if (err < 0)
+ goto failed_mount;
+
+ sbi->s_def_mount_opt = sbi->s_mount_opt;
+
+ err = ext4_check_opt_consistency(fc, sb);
+ if (err < 0)
+ goto failed_mount;
+
+ ext4_apply_options(fc, sb);
+
+ if (ext4_encoding_init(sb, es))
+ goto failed_mount;
+
+ if (ext4_journal_data_mode_check(sb))
+ goto failed_mount;
+
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+ (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+
+ /* i_version is always enabled now */
+ sb->s_flags |= SB_I_VERSION;
+
+ if (ext4_check_feature_compatibility(sb, es, silent))
+ goto failed_mount;
+
+ if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) {
ext4_msg(sb, KERN_ERR,
"Number of reserved GDT blocks insanely large: %d",
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
goto failed_mount;
}
- if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+ if (sbi->s_daxdev) {
+ if (sb->s_blocksize == PAGE_SIZE)
+ set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
+ else
+ ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
+ }
+
+ if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
if (ext4_has_feature_inline_data(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
" that may contain inline data");
goto failed_mount;
}
- if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
ext4_msg(sb, KERN_ERR,
"DAX unsupported by block device.");
goto failed_mount;
@@ -4104,32 +5141,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (sb->s_blocksize != blocksize) {
- /* Validate the filesystem blocksize */
- if (!sb_set_blocksize(sb, blocksize)) {
- ext4_msg(sb, KERN_ERR, "bad block size %d",
- blocksize);
- goto failed_mount;
- }
-
- brelse(bh);
- logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
- offset = do_div(logical_sb_block, blocksize);
- bh = sb_bread_unmovable(sb, logical_sb_block);
- if (!bh) {
- ext4_msg(sb, KERN_ERR,
- "Can't read superblock on 2nd try");
- goto failed_mount;
- }
- es = (struct ext4_super_block *)(bh->b_data + offset);
- sbi->s_es = es;
- if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
- ext4_msg(sb, KERN_ERR,
- "Magic mismatch, very weird!");
- goto failed_mount;
- }
- }
-
has_huge_files = ext4_has_feature_huge_file(sb);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
@@ -4151,20 +5162,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
- if (sbi->s_inodes_per_block == 0)
- goto cantfind_ext4;
+ sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb);
+ if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) {
+ if (!silent)
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+ goto failed_mount;
+ }
if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
- sbi->s_inodes_per_group > blocksize * 8) {
+ sbi->s_inodes_per_group > sb->s_blocksize * 8) {
ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
- sbi->s_blocks_per_group);
+ sbi->s_inodes_per_group);
goto failed_mount;
}
sbi->s_itb_per_group = sbi->s_inodes_per_group /
sbi->s_inodes_per_block;
- sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
- sbi->s_sbh = bh;
- sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb);
+ sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
@@ -4189,55 +5202,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
- /* Handle clustersize */
- clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
- has_bigalloc = ext4_has_feature_bigalloc(sb);
- if (has_bigalloc) {
- if (clustersize < blocksize) {
- ext4_msg(sb, KERN_ERR,
- "cluster size (%d) smaller than "
- "block size (%d)", clustersize, blocksize);
- goto failed_mount;
- }
- sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
- le32_to_cpu(es->s_log_block_size);
- sbi->s_clusters_per_group =
- le32_to_cpu(es->s_clusters_per_group);
- if (sbi->s_clusters_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#clusters per group too big: %lu",
- sbi->s_clusters_per_group);
- goto failed_mount;
- }
- if (sbi->s_blocks_per_group !=
- (sbi->s_clusters_per_group * (clustersize / blocksize))) {
- ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
- "clusters per group (%lu) inconsistent",
- sbi->s_blocks_per_group,
- sbi->s_clusters_per_group);
- goto failed_mount;
- }
- } else {
- if (clustersize != blocksize) {
- ext4_msg(sb, KERN_ERR,
- "fragment/cluster size (%d) != "
- "block size (%d)", clustersize, blocksize);
- goto failed_mount;
- }
- if (sbi->s_blocks_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#blocks per group too big: %lu",
- sbi->s_blocks_per_group);
- goto failed_mount;
- }
- sbi->s_clusters_per_group = sbi->s_blocks_per_group;
- sbi->s_cluster_bits = 0;
- }
- sbi->s_cluster_ratio = clustersize / blocksize;
-
- /* Do we have standard group size of clustersize * 8 blocks ? */
- if (sbi->s_blocks_per_group == clustersize << 3)
- set_opt2(sb, STD_GROUP_SIZE);
+ if (ext4_handle_clustersize(sb))
+ goto failed_mount;
/*
* Test whether we have more sectors than will fit in sector_t,
@@ -4251,112 +5217,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
- goto cantfind_ext4;
-
- /* check blocks count against device size */
- blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
- if (blocks_count && ext4_blocks_count(es) > blocks_count) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
- "exceeds size of device (%llu blocks)",
- ext4_blocks_count(es), blocks_count);
- goto failed_mount;
- }
-
- /*
- * It makes no sense for the first data block to be beyond the end
- * of the filesystem.
- */
- if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
- "block %u is beyond end of filesystem (%llu)",
- le32_to_cpu(es->s_first_data_block),
- ext4_blocks_count(es));
- goto failed_mount;
- }
- if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
- (sbi->s_cluster_ratio == 1)) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
- "block is 0 with a 1k block and cluster size");
+ if (ext4_geometry_check(sb, es))
goto failed_mount;
- }
- blocks_count = (ext4_blocks_count(es) -
- le32_to_cpu(es->s_first_data_block) +
- EXT4_BLOCKS_PER_GROUP(sb) - 1);
- do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
- if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
- ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
- "(block count %llu, first data block %u, "
- "blocks per group %lu)", sbi->s_groups_count,
- ext4_blocks_count(es),
- le32_to_cpu(es->s_first_data_block),
- EXT4_BLOCKS_PER_GROUP(sb));
- goto failed_mount;
- }
- sbi->s_groups_count = blocks_count;
- sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
- (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
- if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
- le32_to_cpu(es->s_inodes_count)) {
- ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
- le32_to_cpu(es->s_inodes_count),
- ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
- ret = -EINVAL;
- goto failed_mount;
- }
- db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
- EXT4_DESC_PER_BLOCK(sb);
- if (ext4_has_feature_meta_bg(sb)) {
- if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
- ext4_msg(sb, KERN_WARNING,
- "first meta block group too large: %u "
- "(group descriptor block count %u)",
- le32_to_cpu(es->s_first_meta_bg), db_count);
- goto failed_mount;
- }
- }
- rcu_assign_pointer(sbi->s_group_desc,
- kvmalloc_array(db_count,
- sizeof(struct buffer_head *),
- GFP_KERNEL));
- if (sbi->s_group_desc == NULL) {
- ext4_msg(sb, KERN_ERR, "not enough memory");
- ret = -ENOMEM;
+ err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
+ if (err)
goto failed_mount;
- }
-
- bgl_lock_init(sbi->s_blockgroup_lock);
-
- /* Pre-read the descriptors into the buffer cache */
- for (i = 0; i < db_count; i++) {
- block = descriptor_loc(sb, logical_sb_block, i);
- sb_breadahead(sb, block);
- }
-
- for (i = 0; i < db_count; i++) {
- struct buffer_head *bh;
-
- block = descriptor_loc(sb, logical_sb_block, i);
- bh = sb_bread_unmovable(sb, block);
- if (!bh) {
- ext4_msg(sb, KERN_ERR,
- "can't read group descriptor %d", i);
- db_count = i;
- goto failed_mount2;
- }
- rcu_read_lock();
- rcu_dereference(sbi->s_group_desc)[i] = bh;
- rcu_read_unlock();
- }
- sbi->s_gdb_count = db_count;
- if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
- ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
- ret = -EFSCORRUPTED;
- goto failed_mount2;
- }
timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
+ spin_lock_init(&sbi->s_error_lock);
+ INIT_WORK(&sbi->s_error_work, flush_stashed_error_work);
/* Register extent status tree shrinker */
if (ext4_es_register_shrinker(sbi))
@@ -4390,9 +5260,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
+ ext4_fast_commit_init(sb);
+
sb->s_root = NULL;
needs_recovery = (es->s_last_orphan != 0 ||
+ ext4_has_feature_orphan_present(sb) ||
ext4_has_feature_journal_needs_recovery(sb));
if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb))
@@ -4404,101 +5277,46 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* root first: it may be modified in the journal!
*/
if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
- err = ext4_load_journal(sb, es, journal_devnum);
+ err = ext4_load_and_init_journal(sb, es, ctx);
if (err)
goto failed_mount3a;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
- goto failed_mount_wq;
+ goto failed_mount3a;
} else {
/* Nojournal mode, all journal mount options are illegal */
if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"journal_checksum, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
}
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"journal_async_commit, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
}
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"commit=%lu, fs mounted w/o journal",
sbi->s_commit_interval / HZ);
- goto failed_mount_wq;
+ goto failed_mount3a;
}
if (EXT4_MOUNT_DATA_FLAGS &
(sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"data=, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
}
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
+ clear_opt2(sb, JOURNAL_FAST_COMMIT);
sbi->s_journal = NULL;
needs_recovery = 0;
- goto no_journal;
}
- if (ext4_has_feature_64bit(sb) &&
- !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_64BIT)) {
- ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
- goto failed_mount_wq;
- }
-
- if (!set_journal_csum_feature_set(sb)) {
- ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
- "feature set");
- goto failed_mount_wq;
- }
-
- /* We have now updated the journal if required, so we can
- * validate the data journaling mode. */
- switch (test_opt(sb, DATA_FLAGS)) {
- case 0:
- /* No mode set, assume a default based on the journal
- * capabilities: ORDERED_DATA if the journal can
- * cope, else JOURNAL_DATA
- */
- if (jbd2_journal_check_available_features
- (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- set_opt(sb, ORDERED_DATA);
- sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
- } else {
- set_opt(sb, JOURNAL_DATA);
- sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
- }
- break;
-
- case EXT4_MOUNT_ORDERED_DATA:
- case EXT4_MOUNT_WRITEBACK_DATA:
- if (!jbd2_journal_check_available_features
- (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- ext4_msg(sb, KERN_ERR, "Journal does not support "
- "requested data journaling mode");
- goto failed_mount_wq;
- }
- default:
- break;
- }
-
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
- test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_async_commit in data=ordered mode");
- goto failed_mount_wq;
- }
-
- set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
-
- sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
-
-no_journal:
if (!test_opt(sb, NO_MBCACHE)) {
sbi->s_ea_block_cache = ext4_xattr_create_cache();
if (!sbi->s_ea_block_cache) {
@@ -4517,24 +5335,27 @@ no_journal:
}
}
- if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
+ if (ext4_has_feature_verity(sb) && sb->s_blocksize != PAGE_SIZE) {
ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
goto failed_mount_wq;
}
- if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
- !ext4_has_feature_encrypt(sb)) {
- ext4_set_feature_encrypt(sb);
- ext4_commit_super(sb, 1);
- }
-
/*
* Get the # of file system overhead blocks from the
* superblock if present.
*/
- if (es->s_overhead_clusters)
- sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
- else {
+ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+ /* ignore the precalculated value if it is ridiculous */
+ if (sbi->s_overhead > ext4_blocks_count(es))
+ sbi->s_overhead = 0;
+ /*
+ * If the bigalloc feature is not enabled recalculating the
+ * overhead doesn't take long, so we might as well just redo
+ * it to make sure we are using the correct value.
+ */
+ if (!ext4_has_feature_bigalloc(sb))
+ sbi->s_overhead = 0;
+ if (sbi->s_overhead == 0) {
err = ext4_calculate_overhead(sb);
if (err)
goto failed_mount_wq;
@@ -4570,11 +5391,6 @@ no_journal:
goto failed_mount4;
}
-#ifdef CONFIG_UNICODE
- if (sbi->s_encoding)
- sb->s_d_op = &ext4_dentry_ops;
-#endif
-
sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext4_msg(sb, KERN_ERR, "get root dentry failed");
@@ -4591,14 +5407,30 @@ no_journal:
ext4_set_resv_clusters(sb);
- err = ext4_setup_system_zone(sb);
- if (err) {
- ext4_msg(sb, KERN_ERR, "failed to initialize system "
- "zone (%d)", err);
- goto failed_mount4a;
+ if (test_opt(sb, BLOCK_VALIDITY)) {
+ err = ext4_setup_system_zone(sb);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize system "
+ "zone (%d)", err);
+ goto failed_mount4a;
+ }
}
+ ext4_fc_replay_cleanup(sb);
ext4_ext_init(sb);
+
+ /*
+ * Enable optimize_scan if number of groups is > threshold. This can be
+ * turned off by passing "mb_optimize_scan=0". This can also be
+ * turned on forcefully by passing "mb_optimize_scan=1".
+ */
+ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) {
+ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
+ set_opt2(sb, MB_OPTIMIZE_SCAN);
+ else
+ clear_opt2(sb, MB_OPTIMIZE_SCAN);
+ }
+
err = ext4_mb_init(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
@@ -4606,16 +5438,22 @@ no_journal:
goto failed_mount5;
}
+ /*
+ * We can only set up the journal commit callback once
+ * mballoc is initialized
+ */
+ if (sbi->s_journal)
+ sbi->s_journal->j_commit_callback =
+ ext4_journal_commit_callback;
+
block = ext4_count_free_clusters(sb);
- ext4_free_blocks_count_set(sbi->s_es,
+ ext4_free_blocks_count_set(sbi->s_es,
EXT4_C2B(sbi, block));
- ext4_superblock_csum_set(sb);
err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
GFP_KERNEL);
if (!err) {
unsigned long freei = ext4_count_free_inodes(sb);
sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
- ext4_superblock_csum_set(sb);
err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
GFP_KERNEL);
}
@@ -4626,6 +5464,9 @@ no_journal:
err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
GFP_KERNEL);
if (!err)
+ err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
+ GFP_KERNEL);
+ if (!err)
err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
if (err) {
@@ -4638,6 +5479,7 @@ no_journal:
ext4_msg(sb, KERN_ERR,
"unable to initialize "
"flex_bg meta info!");
+ ret = -ENOMEM;
goto failed_mount6;
}
@@ -4649,46 +5491,47 @@ no_journal:
if (err)
goto failed_mount7;
+ err = ext4_init_orphan_info(sb);
+ if (err)
+ goto failed_mount8;
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount. */
if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
err = ext4_enable_quotas(sb);
if (err)
- goto failed_mount8;
+ goto failed_mount9;
}
#endif /* CONFIG_QUOTA */
+ /*
+ * Save the original bdev mapping's wb_err value which could be
+ * used to detect the metadata async write error.
+ */
+ spin_lock_init(&sbi->s_bdev_wb_lock);
+ errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
+ &sbi->s_bdev_wb_err);
+ sb->s_bdev->bd_super = sb;
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
+ /*
+ * Update the checksum after updating free space/inode counters and
+ * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect
+ * checksum in the buffer cache until it is written out and
+ * e2fsprogs programs trying to open a file system immediately
+ * after it is mounted can fail.
+ */
+ ext4_superblock_csum_set(sb);
if (needs_recovery) {
ext4_msg(sb, KERN_INFO, "recovery complete");
- ext4_mark_recovery_complete(sb, es);
- }
- if (EXT4_SB(sb)->s_journal) {
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
- descr = " journalled data mode";
- else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
- descr = " ordered data mode";
- else
- descr = " writeback data mode";
- } else
- descr = "out journal";
-
- if (test_opt(sb, DISCARD)) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
- if (!blk_queue_discard(q))
- ext4_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but "
- "the device does not support discard");
+ err = ext4_mark_recovery_complete(sb, es);
+ if (err)
+ goto failed_mount9;
}
- if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
- ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
- "Opts: %.*s%s%s", descr,
- (int) sizeof(sbi->s_es->s_mount_opts),
- sbi->s_es->s_mount_opts,
- *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
+ if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
+ ext4_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but the device does not support discard");
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -4697,19 +5540,16 @@ no_journal:
ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
+ atomic_set(&sbi->s_warning_count, 0);
+ atomic_set(&sbi->s_msg_count, 0);
- kfree(orig_data);
return 0;
-cantfind_ext4:
- if (!silent)
- ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
- goto failed_mount;
-
-#ifdef CONFIG_QUOTA
+failed_mount9:
+ ext4_release_orphan_info(sb);
failed_mount8:
ext4_unregister_sysfs(sb);
-#endif
+ kobject_put(&sbi->s_kobj);
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
@@ -4726,6 +5566,7 @@ failed_mount6:
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
percpu_free_rwsem(&sbi->s_writepages_rwsem);
failed_mount5:
ext4_ext_release(sb);
@@ -4745,46 +5586,93 @@ failed_mount_wq:
sbi->s_ea_block_cache = NULL;
if (sbi->s_journal) {
+ /* flush s_error_work before journal destroy. */
+ flush_work(&sbi->s_error_work);
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
failed_mount3a:
ext4_es_unregister_shrinker(sbi);
failed_mount3:
+ /* flush s_error_work before sbi destroy */
+ flush_work(&sbi->s_error_work);
del_timer_sync(&sbi->s_err_report);
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
-failed_mount2:
- rcu_read_lock();
- group_desc = rcu_dereference(sbi->s_group_desc);
- for (i = 0; i < db_count; i++)
- brelse(group_desc[i]);
- kvfree(group_desc);
- rcu_read_unlock();
+ ext4_stop_mmpd(sbi);
+ ext4_group_desc_free(sbi);
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
-#ifdef CONFIG_UNICODE
- utf8_unload(sbi->s_encoding);
+#if IS_ENABLED(CONFIG_UNICODE)
+ utf8_unload(sb->s_encoding);
#endif
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
#endif
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
+ /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
+ brelse(sbi->s_sbh);
ext4_blkdev_remove(sbi);
- brelse(bh);
out_fail:
sb->s_fs_info = NULL;
- kfree(sbi->s_blockgroup_lock);
-out_free_base:
- kfree(sbi);
- kfree(orig_data);
- fs_put_dax(dax_dev);
return err ? err : ret;
}
+static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct ext4_fs_context *ctx = fc->fs_private;
+ struct ext4_sb_info *sbi;
+ const char *descr;
+ int ret;
+
+ sbi = ext4_alloc_sbi(sb);
+ if (!sbi)
+ return -ENOMEM;
+
+ fc->s_fs_info = sbi;
+
+ /* Cleanup superblock name */
+ strreplace(sb->s_id, '/', '!');
+
+ sbi->s_sb_block = 1; /* Default super block location */
+ if (ctx->spec & EXT4_SPEC_s_sb_block)
+ sbi->s_sb_block = ctx->s_sb_block;
+
+ ret = __ext4_fill_super(fc, sb);
+ if (ret < 0)
+ goto free_sbi;
+
+ if (sbi->s_journal) {
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ descr = " journalled data mode";
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ descr = " ordered data mode";
+ else
+ descr = " writeback data mode";
+ } else
+ descr = "out journal";
+
+ if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
+ ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+ "Quota mode: %s.", descr, ext4_quota_mode(sb));
+
+ /* Update the s_overhead_clusters if necessary */
+ ext4_update_overhead(sb, false);
+ return 0;
+
+free_sbi:
+ ext4_free_sbi(sbi);
+ fc->s_fs_info = NULL;
+ return ret;
+}
+
+static int ext4_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, ext4_fill_super);
+}
+
/*
* Setup any per-fs journal parameters now. We'll do this both on
* initial mount, once the journal has been initialised but before we've
@@ -4797,6 +5685,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_commit_interval = sbi->s_commit_interval;
journal->j_min_batch_time = sbi->s_min_batch_time;
journal->j_max_batch_time = sbi->s_max_batch_time;
+ ext4_fc_init(sb, journal);
write_lock(&journal->j_state_lock);
if (test_opt(sb, BARRIER))
@@ -4832,7 +5721,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
return NULL;
}
- jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
+ ext4_debug("Journal inode found at %p: %lld bytes\n",
journal_inode, journal_inode->i_size);
if (!S_ISREG(journal_inode->i_mode)) {
ext4_msg(sb, KERN_ERR, "invalid journal inode");
@@ -4848,7 +5737,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
struct inode *journal_inode;
journal_t *journal;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
+ return NULL;
journal_inode = ext4_get_journal_inode(sb, journal_inum);
if (!journal_inode)
@@ -4878,7 +5768,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
struct ext4_super_block *es;
struct block_device *bdev;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
+ return NULL;
bdev = ext4_blkdev_get(j_dev, sb);
if (bdev == NULL)
@@ -4937,9 +5828,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
journal->j_private = sb;
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
- wait_on_buffer(journal->j_sb_buffer);
- if (!buffer_uptodate(journal->j_sb_buffer)) {
+ if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) {
ext4_msg(sb, KERN_ERR, "I/O error on journal device");
goto out_journal;
}
@@ -4949,7 +5838,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
be32_to_cpu(journal->j_superblock->s_nr_users));
goto out_journal;
}
- EXT4_SB(sb)->journal_bdev = bdev;
+ EXT4_SB(sb)->s_journal_bdev = bdev;
ext4_init_journal_params(sb, journal);
return journal;
@@ -4969,8 +5858,10 @@ static int ext4_load_journal(struct super_block *sb,
dev_t journal_dev;
int err = 0;
int really_read_only;
+ int journal_dev_ro;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
+ return -EFSCORRUPTED;
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
@@ -4980,7 +5871,31 @@ static int ext4_load_journal(struct super_block *sb,
} else
journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
- really_read_only = bdev_read_only(sb->s_bdev);
+ if (journal_inum && journal_dev) {
+ ext4_msg(sb, KERN_ERR,
+ "filesystem has both journal inode and journal device!");
+ return -EINVAL;
+ }
+
+ if (journal_inum) {
+ journal = ext4_get_journal(sb, journal_inum);
+ if (!journal)
+ return -EINVAL;
+ } else {
+ journal = ext4_get_dev_journal(sb, journal_dev);
+ if (!journal)
+ return -EINVAL;
+ }
+
+ journal_dev_ro = bdev_read_only(journal->j_dev);
+ really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;
+
+ if (journal_dev_ro && !sb_rdonly(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "journal device read-only, try mounting with '-o ro'");
+ err = -EROFS;
+ goto err_out;
+ }
/*
* Are we loading a blank journal or performing recovery after a
@@ -4995,27 +5910,14 @@ static int ext4_load_journal(struct super_block *sb,
ext4_msg(sb, KERN_ERR, "write access "
"unavailable, cannot proceed "
"(try mounting with noload)");
- return -EROFS;
+ err = -EROFS;
+ goto err_out;
}
ext4_msg(sb, KERN_INFO, "write access will "
"be enabled during recovery");
}
}
- if (journal_inum && journal_dev) {
- ext4_msg(sb, KERN_ERR, "filesystem has both journal "
- "and inode journals!");
- return -EINVAL;
- }
-
- if (journal_inum) {
- if (!(journal = ext4_get_journal(sb, journal_inum)))
- return -EINVAL;
- } else {
- if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
- return -EINVAL;
- }
-
if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
@@ -5035,40 +5937,40 @@ static int ext4_load_journal(struct super_block *sb,
if (err) {
ext4_msg(sb, KERN_ERR, "error loading journal");
- jbd2_journal_destroy(journal);
- return err;
+ goto err_out;
}
EXT4_SB(sb)->s_journal = journal;
- ext4_clear_journal_err(sb, es);
+ err = ext4_clear_journal_err(sb, es);
+ if (err) {
+ EXT4_SB(sb)->s_journal = NULL;
+ jbd2_journal_destroy(journal);
+ return err;
+ }
if (!really_read_only && journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
es->s_journal_dev = cpu_to_le32(journal_devnum);
/* Make sure we flush the recovery flag to disk. */
- ext4_commit_super(sb, 1);
+ ext4_commit_super(sb);
}
return 0;
+
+err_out:
+ jbd2_journal_destroy(journal);
+ return err;
}
-static int ext4_commit_super(struct super_block *sb, int sync)
+/* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */
+static void ext4_update_super(struct super_block *sb)
{
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
- int error = 0;
-
- if (!sbh || block_device_ejected(sb))
- return error;
-
- /*
- * The superblock bh should be mapped, but it might not be if the
- * device was hot-removed. Not much we can do but fail the I/O.
- */
- if (!buffer_mapped(sbh))
- return error;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct buffer_head *sbh = sbi->s_sbh;
+ lock_buffer(sbh);
/*
* If the file system is mounted read-only, don't update the
* superblock write time. This avoids updating the superblock
@@ -5081,27 +5983,80 @@ static int ext4_commit_super(struct super_block *sb, int sync)
*/
if (!(sb->s_flags & SB_RDONLY))
ext4_update_tstamp(es, s_wtime);
- if (sb->s_bdev->bd_part)
- es->s_kbytes_written =
- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part,
- sectors[STAT_WRITE]) -
- EXT4_SB(sb)->s_sectors_written_start) >> 1));
- else
- es->s_kbytes_written =
- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
- if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
+ es->s_kbytes_written =
+ cpu_to_le64(sbi->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
+ sbi->s_sectors_written_start) >> 1));
+ if (percpu_counter_initialized(&sbi->s_freeclusters_counter))
ext4_free_blocks_count_set(es,
- EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
- &EXT4_SB(sb)->s_freeclusters_counter)));
- if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
+ EXT4_C2B(sbi, percpu_counter_sum_positive(
+ &sbi->s_freeclusters_counter)));
+ if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
es->s_free_inodes_count =
cpu_to_le32(percpu_counter_sum_positive(
- &EXT4_SB(sb)->s_freeinodes_counter));
- BUFFER_TRACE(sbh, "marking dirty");
+ &sbi->s_freeinodes_counter));
+ /* Copy error information to the on-disk superblock */
+ spin_lock(&sbi->s_error_lock);
+ if (sbi->s_add_error_count > 0) {
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+ if (!es->s_first_error_time && !es->s_first_error_time_hi) {
+ __ext4_update_tstamp(&es->s_first_error_time,
+ &es->s_first_error_time_hi,
+ sbi->s_first_error_time);
+ strncpy(es->s_first_error_func, sbi->s_first_error_func,
+ sizeof(es->s_first_error_func));
+ es->s_first_error_line =
+ cpu_to_le32(sbi->s_first_error_line);
+ es->s_first_error_ino =
+ cpu_to_le32(sbi->s_first_error_ino);
+ es->s_first_error_block =
+ cpu_to_le64(sbi->s_first_error_block);
+ es->s_first_error_errcode =
+ ext4_errno_to_code(sbi->s_first_error_code);
+ }
+ __ext4_update_tstamp(&es->s_last_error_time,
+ &es->s_last_error_time_hi,
+ sbi->s_last_error_time);
+ strncpy(es->s_last_error_func, sbi->s_last_error_func,
+ sizeof(es->s_last_error_func));
+ es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
+ es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
+ es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
+ es->s_last_error_errcode =
+ ext4_errno_to_code(sbi->s_last_error_code);
+ /*
+ * Start the daily error reporting function if it hasn't been
+ * started already
+ */
+ if (!es->s_error_count)
+ mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
+ le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
+ sbi->s_add_error_count = 0;
+ }
+ spin_unlock(&sbi->s_error_lock);
+
ext4_superblock_csum_set(sb);
- if (sync)
- lock_buffer(sbh);
+ unlock_buffer(sbh);
+}
+
+static int ext4_commit_super(struct super_block *sb)
+{
+ struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
+
+ if (!sbh)
+ return -EINVAL;
+ if (block_device_ejected(sb))
+ return -ENODEV;
+
+ ext4_update_super(sb);
+
+ lock_buffer(sbh);
+ /* Buffer got discarded which means block device got invalidated */
+ if (!buffer_mapped(sbh)) {
+ unlock_buffer(sbh);
+ return -EIO;
+ }
+
if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
/*
* Oh, dear. A previous attempt to write the
@@ -5116,19 +6071,21 @@ static int ext4_commit_super(struct super_block *sb, int sync)
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
- mark_buffer_dirty(sbh);
- if (sync) {
- unlock_buffer(sbh);
- error = __sync_dirty_buffer(sbh,
- REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
- if (buffer_write_io_error(sbh)) {
- ext4_msg(sb, KERN_ERR, "I/O error while writing "
- "superblock");
- clear_buffer_write_io_error(sbh);
- set_buffer_uptodate(sbh);
- }
+ get_bh(sbh);
+ /* Clear potential dirty bit if it was journalled update */
+ clear_buffer_dirty(sbh);
+ sbh->b_end_io = end_buffer_write_sync;
+ submit_bh(REQ_OP_WRITE | REQ_SYNC |
+ (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
+ wait_on_buffer(sbh);
+ if (buffer_write_io_error(sbh)) {
+ ext4_msg(sb, KERN_ERR, "I/O error while writing "
+ "superblock");
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ return -EIO;
}
- return error;
+ return 0;
}
/*
@@ -5136,26 +6093,39 @@ static int ext4_commit_super(struct super_block *sb, int sync)
* remounting) the filesystem readonly, then we will end up with a
* consistent fs on disk. Record that fact.
*/
-static void ext4_mark_recovery_complete(struct super_block *sb,
- struct ext4_super_block *es)
+static int ext4_mark_recovery_complete(struct super_block *sb,
+ struct ext4_super_block *es)
{
+ int err;
journal_t *journal = EXT4_SB(sb)->s_journal;
if (!ext4_has_feature_journal(sb)) {
- BUG_ON(journal != NULL);
- return;
+ if (journal != NULL) {
+ ext4_error(sb, "Journal got removed while the fs was "
+ "mounted!");
+ return -EFSCORRUPTED;
+ }
+ return 0;
}
jbd2_journal_lock_updates(journal);
- if (jbd2_journal_flush(journal) < 0)
+ err = jbd2_journal_flush(journal, 0);
+ if (err < 0)
goto out;
- if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
+ if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) ||
+ ext4_has_feature_orphan_present(sb))) {
+ if (!ext4_orphan_file_empty(sb)) {
+ ext4_error(sb, "Orphan file not empty on read-only fs.");
+ err = -EFSCORRUPTED;
+ goto out;
+ }
ext4_clear_feature_journal_needs_recovery(sb);
- ext4_commit_super(sb, 1);
+ ext4_clear_feature_orphan_present(sb);
+ ext4_commit_super(sb);
}
-
out:
jbd2_journal_unlock_updates(journal);
+ return err;
}
/*
@@ -5163,14 +6133,17 @@ out:
* has recorded an error from a previous lifetime, move that error to the
* main filesystem now.
*/
-static void ext4_clear_journal_err(struct super_block *sb,
+static int ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es)
{
journal_t *journal;
int j_errno;
const char *errstr;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (!ext4_has_feature_journal(sb)) {
+ ext4_error(sb, "Journal got removed while the fs was mounted!");
+ return -EFSCORRUPTED;
+ }
journal = EXT4_SB(sb)->s_journal;
@@ -5190,11 +6163,12 @@ static void ext4_clear_journal_err(struct super_block *sb,
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_commit_super(sb, 1);
+ ext4_commit_super(sb);
jbd2_journal_clear_err(journal);
jbd2_journal_update_sb_errno(journal);
}
+ return 0;
}
/*
@@ -5249,7 +6223,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
needs_barrier = true;
if (needs_barrier) {
int err;
- err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ err = blkdev_issue_flush(sb->s_bdev);
if (!ret)
ret = err;
}
@@ -5283,15 +6257,17 @@ static int ext4_freeze(struct super_block *sb)
* Don't clear the needs_recovery flag if we failed to
* flush the journal.
*/
- error = jbd2_journal_flush(journal);
+ error = jbd2_journal_flush(journal, 0);
if (error < 0)
goto out;
/* Journal blocked and flushed, clear needs_recovery flag. */
ext4_clear_feature_journal_needs_recovery(sb);
+ if (ext4_orphan_file_empty(sb))
+ ext4_clear_feature_orphan_present(sb);
}
- error = ext4_commit_super(sb, 1);
+ error = ext4_commit_super(sb);
out:
if (journal)
/* we rely on upper layer to stop further updates */
@@ -5311,9 +6287,11 @@ static int ext4_unfreeze(struct super_block *sb)
if (EXT4_SB(sb)->s_journal) {
/* Reset the needs_recovery flag before the fs is unlocked. */
ext4_set_feature_journal_needs_recovery(sb);
+ if (ext4_has_feature_orphan_file(sb))
+ ext4_set_feature_orphan_present(sb);
}
- ext4_commit_super(sb, 1);
+ ext4_commit_super(sb);
return 0;
}
@@ -5333,24 +6311,21 @@ struct ext4_mount_options {
#endif
};
-static int ext4_remount(struct super_block *sb, int *flags, char *data)
+static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
{
+ struct ext4_fs_context *ctx = fc->fs_private;
struct ext4_super_block *es;
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned long old_sb_flags;
struct ext4_mount_options old_opts;
- int enable_quota = 0;
ext4_group_t g;
- unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
int err = 0;
#ifdef CONFIG_QUOTA
+ int enable_quota = 0;
int i, j;
char *to_free[EXT4_MAXQUOTAS];
#endif
- char *orig_data = kstrdup(data, GFP_KERNEL);
- if (data && !orig_data)
- return -ENOMEM;
/* Store the original options */
old_sb_flags = sb->s_flags;
@@ -5371,20 +6346,22 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (!old_opts.s_qf_names[i]) {
for (j = 0; j < i; j++)
kfree(old_opts.s_qf_names[j]);
- kfree(orig_data);
return -ENOMEM;
}
} else
old_opts.s_qf_names[i] = NULL;
#endif
- if (sbi->s_journal && sbi->s_journal->j_task->io_context)
- journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
+ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) {
+ if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+ ctx->journal_ioprio =
+ sbi->s_journal->j_task->io_context->ioprio;
+ else
+ ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
- if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
- err = -EINVAL;
- goto restore_opts;
}
+ ext4_apply_options(fc, sb);
+
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "changing journal_checksum "
@@ -5405,12 +6382,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
- if (test_opt(sb, DAX)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- err = -EINVAL;
- goto restore_opts;
- }
} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
@@ -5426,14 +6397,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
- ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
- "dax flag with busy inodes while remounting");
- sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
- }
-
- if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
- ext4_abort(sb, "Abort forced by user");
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
@@ -5442,19 +6407,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal) {
ext4_init_journal_params(sb, sbi->s_journal);
- set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+ set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
}
- if (*flags & SB_LAZYTIME)
- sb->s_flags |= SB_LAZYTIME;
+ /* Flush outstanding errors before changing fs state */
+ flush_work(&sbi->s_error_work);
- if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
- if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
+ if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
err = -EROFS;
goto restore_opts;
}
- if (*flags & SB_RDONLY) {
+ if (fc->sb_flags & SB_RDONLY) {
err = sync_filesystem(sb);
if (err < 0)
goto restore_opts;
@@ -5477,10 +6442,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
(sbi->s_mount_state & EXT4_VALID_FS))
es->s_state = cpu_to_le16(sbi->s_mount_state);
- if (sbi->s_journal)
+ if (sbi->s_journal) {
+ /*
+ * We let remount-ro finish even if marking fs
+ * as clean failed...
+ */
ext4_mark_recovery_complete(sb, es);
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
+ }
} else {
/* Make sure we can mount this feature set readwrite */
if (ext4_has_feature_readonly(sb) ||
@@ -5511,7 +6479,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
* around from a previously readonly bdev mount,
* require a full umount/remount for now.
*/
- if (es->s_last_orphan) {
+ if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) {
ext4_msg(sb, KERN_WARNING, "Couldn't "
"remount RDWR because of unprocessed "
"orphan inode list. Please "
@@ -5526,9 +6494,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
* been changed by e2fsck since we originally mounted
* the partition.)
*/
- if (sbi->s_journal)
- ext4_clear_journal_err(sb, es);
- sbi->s_mount_state = le16_to_cpu(es->s_state);
+ if (sbi->s_journal) {
+ err = ext4_clear_journal_err(sb, es);
+ if (err)
+ goto restore_opts;
+ }
+ sbi->s_mount_state = (le16_to_cpu(es->s_state) &
+ ~EXT4_FC_REPLAY);
err = ext4_setup_super(sb, es, 0);
if (err)
@@ -5541,7 +6513,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EROFS;
goto restore_opts;
}
+#ifdef CONFIG_QUOTA
enable_quota = 1;
+#endif
}
}
@@ -5557,9 +6531,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_register_li_request(sb, first_not_zeroed);
}
- ext4_setup_system_zone(sb);
+ /*
+ * Handle creation of system zone data early because it can fail.
+ * Releasing of existing data is done when we are sure remount will
+ * succeed.
+ */
+ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
+ err = ext4_setup_system_zone(sb);
+ if (err)
+ goto restore_opts;
+ }
+
if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
- err = ext4_commit_super(sb, 1);
+ err = ext4_commit_super(sb);
if (err)
goto restore_opts;
}
@@ -5578,10 +6562,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
}
#endif
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
+ ext4_release_system_zone(sb);
+
+ if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
+ ext4_stop_mmpd(sbi);
- *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
- ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
- kfree(orig_data);
return 0;
restore_opts:
@@ -5593,6 +6579,8 @@ restore_opts:
sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
+ ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < EXT4_MAXQUOTAS; i++) {
@@ -5603,10 +6591,32 @@ restore_opts:
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(to_free[i]);
#endif
- kfree(orig_data);
+ if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
+ ext4_stop_mmpd(sbi);
return err;
}
+static int ext4_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ int ret;
+
+ fc->s_fs_info = EXT4_SB(sb);
+
+ ret = ext4_check_opt_consistency(fc, sb);
+ if (ret < 0)
+ return ret;
+
+ ret = __ext4_remount(fc, sb);
+ if (ret < 0)
+ return ret;
+
+ ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.",
+ ext4_quota_mode(sb));
+
+ return 0;
+}
+
#ifdef CONFIG_QUOTA
static int ext4_statfs_project(struct super_block *sb,
kprojid_t projid, struct kstatfs *buf)
@@ -5622,10 +6632,8 @@ static int ext4_statfs_project(struct super_block *sb,
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = dquot->dq_dqb.dqb_bsoftlimit;
- if (dquot->dq_dqb.dqb_bhardlimit &&
- (!limit || dquot->dq_dqb.dqb_bhardlimit < limit))
- limit = dquot->dq_dqb.dqb_bhardlimit;
+ limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
+ dquot->dq_dqb.dqb_bhardlimit);
limit >>= sb->s_blocksize_bits;
if (limit && buf->f_blocks > limit) {
@@ -5637,11 +6645,8 @@ static int ext4_statfs_project(struct super_block *sb,
(buf->f_blocks - curblock) : 0;
}
- limit = dquot->dq_dqb.dqb_isoftlimit;
- if (dquot->dq_dqb.dqb_ihardlimit &&
- (!limit || dquot->dq_dqb.dqb_ihardlimit < limit))
- limit = dquot->dq_dqb.dqb_ihardlimit;
-
+ limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
+ dquot->dq_dqb.dqb_ihardlimit);
if (limit && buf->f_files > limit) {
buf->f_files = limit;
buf->f_ffree =
@@ -5661,7 +6666,6 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
ext4_fsblk_t overhead = 0, resv_blocks;
- u64 fsid;
s64 bfree;
resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
@@ -5682,10 +6686,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
buf->f_namelen = EXT4_NAME_LEN;
- fsid = le64_to_cpup((void *)es->s_uuid) ^
- le64_to_cpup((void *)es->s_uuid + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_fsid = uuid_to_fsid(es->s_uuid);
#ifdef CONFIG_QUOTA
if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
@@ -5763,11 +6764,8 @@ static int ext4_release_dquot(struct dquot *dquot)
static int ext4_mark_dquot_dirty(struct dquot *dquot)
{
struct super_block *sb = dquot->dq_sb;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- /* Are we journaling quotas? */
- if (ext4_has_feature_quota(sb) ||
- sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+ if (ext4_is_quota_journalled(sb)) {
dquot_mark_dquot_dirty(dquot);
return ext4_write_dquot(dquot);
} else {
@@ -5781,7 +6779,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
@@ -5791,16 +6789,6 @@ static int ext4_write_info(struct super_block *sb, int type)
return ret;
}
-/*
- * Turn on quotas during mount time - we need to find
- * the quota file and such...
- */
-static int ext4_quota_on_mount(struct super_block *sb, int type)
-{
- return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
- EXT4_SB(sb)->s_jquota_fmt, type);
-}
-
static void lockdep_set_quota_inode(struct inode *inode, int subclass)
{
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -5829,6 +6817,11 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
/* Quotafile not on the same filesystem? */
if (path->dentry->d_sb != sb)
return -EXDEV;
+
+ /* Quota already enabled for this file? */
+ if (IS_NOQUOTA(d_inode(path->dentry)))
+ return -EBUSY;
+
/* Journaling quota? */
if (EXT4_SB(sb)->s_qf_names[type]) {
/* Quotafile not in fs root? */
@@ -5856,7 +6849,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
* otherwise be livelocked...
*/
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
if (err)
return err;
@@ -5864,10 +6857,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
err = dquot_quota_on(sb, type, format_id, path);
- if (err) {
- lockdep_set_quota_inode(path->dentry->d_inode,
- I_DATA_SEM_NORMAL);
- } else {
+ if (!err) {
struct inode *inode = d_inode(path->dentry);
handle_t *handle;
@@ -5883,11 +6873,16 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
S_NOATIME | S_IMMUTABLE);
- ext4_mark_inode_dirty(handle, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
unlock_inode:
inode_unlock(inode);
+ if (err)
+ dquot_quota_off(sb, type);
}
+ if (err)
+ lockdep_set_quota_inode(path->dentry->d_inode,
+ I_DATA_SEM_NORMAL);
return err;
}
@@ -5925,7 +6920,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
}
/* Enable usage tracking for all quota types. */
-static int ext4_enable_quotas(struct super_block *sb)
+int ext4_enable_quotas(struct super_block *sb)
{
int type, err = 0;
unsigned long qf_inums[EXT4_MAXQUOTAS] = {
@@ -5950,8 +6945,19 @@ static int ext4_enable_quotas(struct super_block *sb)
"Failed to enable quota tracking "
"(type=%d, err=%d). Please run "
"e2fsck to fix.", type, err);
- for (type--; type >= 0; type--)
+ for (type--; type >= 0; type--) {
+ struct inode *inode;
+
+ inode = sb_dqopt(sb)->files[type];
+ if (inode)
+ inode = igrab(inode);
dquot_quota_off(sb, type);
+ if (inode) {
+ lockdep_set_quota_inode(inode,
+ I_DATA_SEM_NORMAL);
+ iput(inode);
+ }
+ }
return err;
}
@@ -5985,12 +6991,14 @@ static int ext4_quota_off(struct super_block *sb, int type)
* this is not a hard failure and quotas are already disabled.
*/
handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
- if (IS_ERR(handle))
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
goto out_unlock;
+ }
EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
inode->i_mtime = inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out_unlock:
inode_unlock(inode);
@@ -6048,12 +7056,12 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
{
struct inode *inode = sb_dqopt(sb)->files[type];
ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
- int err, offset = off & (sb->s_blocksize - 1);
+ int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1);
int retries = 0;
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
- if (EXT4_SB(sb)->s_journal && !handle) {
+ if (!handle) {
ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
" cancelled because transaction is not started",
(unsigned long long)off, (unsigned long long)len);
@@ -6081,7 +7089,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
if (!bh)
goto out;
BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
if (err) {
brelse(bh);
return err;
@@ -6096,18 +7104,14 @@ out:
if (inode->i_size < off + len) {
i_size_write(inode, off + len);
EXT4_I(inode)->i_disksize = inode->i_size;
- ext4_mark_inode_dirty(handle, inode);
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err2 && !err))
+ err = err2;
}
- return len;
+ return err ? err : len;
}
#endif
-static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
-}
-
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static inline void register_as_ext2(void)
{
@@ -6165,11 +7169,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
}
static struct file_system_type ext4_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext4",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "ext4",
+ .init_fs_context = ext4_init_fs_context,
+ .parameters = ext4_param_specs,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ext4");
@@ -6182,7 +7187,6 @@ static int __init ext4_init_fs(void)
ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
ext4_li_info = NULL;
- mutex_init(&ext4_li_mtx);
/* Build-time check for flags consistency */
ext4_check_flag_values();
@@ -6220,6 +7224,11 @@ static int __init ext4_init_fs(void)
err = init_inodecache();
if (err)
goto out1;
+
+ err = ext4_fc_init_dentry_cache();
+ if (err)
+ goto out05;
+
register_as_ext3();
register_as_ext2();
err = register_filesystem(&ext4_fs_type);
@@ -6230,6 +7239,8 @@ static int __init ext4_init_fs(void)
out:
unregister_as_ext2();
unregister_as_ext3();
+ ext4_fc_destroy_dentry_cache();
+out05:
destroy_inodecache();
out1:
ext4_exit_mballoc();
@@ -6255,6 +7266,7 @@ static void __exit ext4_exit_fs(void)
unregister_as_ext2();
unregister_as_ext3();
unregister_filesystem(&ext4_fs_type);
+ ext4_fc_destroy_dentry_cache();
destroy_inodecache();
ext4_exit_mballoc();
ext4_exit_sysfs();
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index dd05af983092..3d3ed3c38f56 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -27,7 +27,7 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *cpage = NULL;
+ struct buffer_head *bh = NULL;
const void *caddr;
unsigned int max_size;
const char *paddr;
@@ -39,28 +39,88 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
caddr = EXT4_I(inode)->i_data;
max_size = sizeof(EXT4_I(inode)->i_data);
} else {
- cpage = read_mapping_page(inode->i_mapping, 0, NULL);
- if (IS_ERR(cpage))
- return ERR_CAST(cpage);
- caddr = page_address(cpage);
+ bh = ext4_bread(NULL, inode, 0, 0);
+ if (IS_ERR(bh))
+ return ERR_CAST(bh);
+ if (!bh) {
+ EXT4_ERROR_INODE(inode, "bad symlink.");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ caddr = bh->b_data;
max_size = inode->i_sb->s_blocksize;
}
paddr = fscrypt_get_symlink(inode, caddr, max_size, done);
- if (cpage)
- put_page(cpage);
+ brelse(bh);
return paddr;
}
+static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns,
+ const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ ext4_getattr(mnt_userns, path, stat, request_mask, query_flags);
+
+ return fscrypt_symlink_getattr(path, stat);
+}
+
+static void ext4_free_link(void *bh)
+{
+ brelse(bh);
+}
+
+static const char *ext4_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
+{
+ struct buffer_head *bh;
+ char *inline_link;
+
+ /*
+ * Create a new inlined symlink is not supported, just provide a
+ * method to read the leftovers.
+ */
+ if (ext4_has_inline_data(inode)) {
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ inline_link = ext4_read_inline_link(inode);
+ if (!IS_ERR(inline_link))
+ set_delayed_call(callback, kfree_link, inline_link);
+ return inline_link;
+ }
+
+ if (!dentry) {
+ bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT);
+ if (IS_ERR(bh))
+ return ERR_CAST(bh);
+ if (!bh || !ext4_buffer_uptodate(bh))
+ return ERR_PTR(-ECHILD);
+ } else {
+ bh = ext4_bread(NULL, inode, 0, 0);
+ if (IS_ERR(bh))
+ return ERR_CAST(bh);
+ if (!bh) {
+ EXT4_ERROR_INODE(inode, "bad symlink.");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ }
+
+ set_delayed_call(callback, ext4_free_link, bh);
+ nd_terminate_link(bh->b_data, inode->i_size,
+ inode->i_sb->s_blocksize - 1);
+ return bh->b_data;
+}
+
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
- .getattr = ext4_getattr,
+ .getattr = ext4_encrypted_symlink_getattr,
.listxattr = ext4_listxattr,
};
const struct inode_operations ext4_symlink_inode_operations = {
- .get_link = page_get_link,
+ .get_link = ext4_get_link,
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index d218ebdafa4a..d233c24ea342 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -13,6 +13,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
+#include <linux/part_stat.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -23,6 +24,7 @@ typedef enum {
attr_session_write_kbytes,
attr_lifetime_write_kbytes,
attr_reserved_clusters,
+ attr_sra_exceeded_retry_limit,
attr_inode_readahead,
attr_trigger_test_error,
attr_first_error_time,
@@ -61,11 +63,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->s_buddy_cache->i_sb;
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
- return snprintf(buf, PAGE_SIZE, "%lu\n",
- (part_stat_read(sb->s_bdev->bd_part,
- sectors[STAT_WRITE]) -
+ return sysfs_emit(buf, "%lu\n",
+ (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
sbi->s_sectors_written_start) >> 1);
}
@@ -73,12 +72,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->s_buddy_cache->i_sb;
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(sbi->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part,
- sectors[STAT_WRITE]) -
+ ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
EXT4_SB(sb)->s_sectors_written_start) >> 1)));
}
@@ -134,8 +130,8 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
{
if (!sbi->s_journal)
- return snprintf(buf, PAGE_SIZE, "<none>\n");
- return snprintf(buf, PAGE_SIZE, "%d\n",
+ return sysfs_emit(buf, "<none>\n");
+ return sysfs_emit(buf, "%d\n",
task_pid_vnr(sbi->s_journal->j_task));
}
@@ -188,6 +184,9 @@ static struct ext4_attr ext4_attr_##_name = { \
#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname)
+#define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname)
+
#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -204,6 +203,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
EXT4_ATTR_FUNC(session_write_kbytes, 0444);
EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
EXT4_ATTR_FUNC(reserved_clusters, 0644);
+EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
ext4_sb_info, s_inode_readahead_blks);
@@ -214,6 +214,8 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
@@ -225,6 +227,8 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
#ifdef CONFIG_EXT4_DEBUG
EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
#endif
+EXT4_RO_ATTR_SBI_ATOMIC(warning_count, s_warning_count);
+EXT4_RO_ATTR_SBI_ATOMIC(msg_count, s_msg_count);
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode);
EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode);
@@ -239,6 +243,9 @@ EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
+EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -248,6 +255,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(reserved_clusters),
+ ATTR_LIST(sra_exceeded_retry_limit),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
@@ -256,6 +264,8 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_order2_req),
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(mb_max_inode_prealloc),
+ ATTR_LIST(mb_max_linear_groups),
ATTR_LIST(max_writeback_mb_bump),
ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
@@ -266,6 +276,8 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(msg_ratelimit_interval_ms),
ATTR_LIST(msg_ratelimit_burst),
ATTR_LIST(errors_count),
+ ATTR_LIST(warning_count),
+ ATTR_LIST(msg_count),
ATTR_LIST(first_error_ino),
ATTR_LIST(last_error_ino),
ATTR_LIST(first_error_block),
@@ -282,6 +294,9 @@ static struct attribute *ext4_attrs[] = {
#ifdef CONFIG_EXT4_DEBUG
ATTR_LIST(simulate_fail),
#endif
+ ATTR_LIST(mb_prefetch),
+ ATTR_LIST(mb_prefetch_limit),
+ ATTR_LIST(last_trim_minblks),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
@@ -292,14 +307,19 @@ EXT4_ATTR_FEATURE(batched_discard);
EXT4_ATTR_FEATURE(meta_bg_resize);
#ifdef CONFIG_FS_ENCRYPTION
EXT4_ATTR_FEATURE(encryption);
+EXT4_ATTR_FEATURE(test_dummy_encryption_v2);
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
EXT4_ATTR_FEATURE(casefold);
#endif
#ifdef CONFIG_FS_VERITY
EXT4_ATTR_FEATURE(verity);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
+EXT4_ATTR_FEATURE(fast_commit);
+#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
+EXT4_ATTR_FEATURE(encrypted_casefold);
+#endif
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
@@ -307,14 +327,19 @@ static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(meta_bg_resize),
#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
+ ATTR_LIST(test_dummy_encryption_v2),
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
ATTR_LIST(casefold),
#endif
#ifdef CONFIG_FS_VERITY
ATTR_LIST(verity),
#endif
ATTR_LIST(metadata_csum_seed),
+ ATTR_LIST(fast_commit),
+#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
+ ATTR_LIST(encrypted_casefold),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(ext4_feat);
@@ -334,7 +359,7 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
{
- return snprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
((time64_t)hi << 32) + le32_to_cpu(lo));
}
@@ -351,7 +376,7 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
switch (a->attr_id) {
case attr_delayed_allocation_blocks:
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(s64) EXT4_C2B(sbi,
percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
case attr_session_write_kbytes:
@@ -359,50 +384,54 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
case attr_lifetime_write_kbytes:
return lifetime_write_kbytes_show(sbi, buf);
case attr_reserved_clusters:
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)
atomic64_read(&sbi->s_resv_clusters));
+ case attr_sra_exceeded_retry_limit:
+ return sysfs_emit(buf, "%llu\n",
+ (unsigned long long)
+ percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit));
case attr_inode_readahead:
case attr_pointer_ui:
if (!ptr)
return 0;
if (a->attr_ptr == ptr_ext4_super_block_offset)
- return snprintf(buf, PAGE_SIZE, "%u\n",
+ return sysfs_emit(buf, "%u\n",
le32_to_cpup(ptr));
else
- return snprintf(buf, PAGE_SIZE, "%u\n",
+ return sysfs_emit(buf, "%u\n",
*((unsigned int *) ptr));
case attr_pointer_ul:
if (!ptr)
return 0;
- return snprintf(buf, PAGE_SIZE, "%lu\n",
+ return sysfs_emit(buf, "%lu\n",
*((unsigned long *) ptr));
case attr_pointer_u8:
if (!ptr)
return 0;
- return snprintf(buf, PAGE_SIZE, "%u\n",
+ return sysfs_emit(buf, "%u\n",
*((unsigned char *) ptr));
case attr_pointer_u64:
if (!ptr)
return 0;
if (a->attr_ptr == ptr_ext4_super_block_offset)
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
le64_to_cpup(ptr));
else
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
*((unsigned long long *) ptr));
case attr_pointer_string:
if (!ptr)
return 0;
- return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size,
+ return sysfs_emit(buf, "%.*s\n", a->attr_size,
(char *) ptr);
case attr_pointer_atomic:
if (!ptr)
return 0;
- return snprintf(buf, PAGE_SIZE, "%d\n",
+ return sysfs_emit(buf, "%d\n",
atomic_read((atomic_t *) ptr));
case attr_feature:
- return snprintf(buf, PAGE_SIZE, "supported\n");
+ return sysfs_emit(buf, "supported\n");
case attr_first_error_time:
return print_tstamp(buf, sbi->s_es, s_first_error_time);
case attr_last_error_time:
@@ -479,6 +508,11 @@ static struct kobj_type ext4_feat_ktype = {
.release = (void (*)(struct kobject *))kfree,
};
+void ext4_notify_error_sysfs(struct ext4_sb_info *sbi)
+{
+ sysfs_notify(&sbi->s_kobj, NULL, "errors_count");
+}
+
static struct kobject *ext4_root;
static struct kobject *ext4_feat;
@@ -505,8 +539,14 @@ int ext4_register_sysfs(struct super_block *sb)
proc_create_single_data("es_shrinker_info", S_IRUGO,
sbi->s_proc, ext4_seq_es_shrinker_info_show,
sb);
+ proc_create_single_data("fc_info", 0444, sbi->s_proc,
+ ext4_fc_info_show, sb);
proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_ops, sb);
+ proc_create_single_data("mb_stats", 0444, sbi->s_proc,
+ ext4_seq_mb_stats_show, sb);
+ proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc,
+ &ext4_mb_seq_structs_summary_ops, sb);
}
return 0;
}
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index bcbe3668c1d4..ce84aa2786c7 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -11,14 +11,16 @@
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
+ struct address_space *mapping = inode->i_mapping;
+
/*
* We don't need to call ext4_break_layouts() because the blocks we
* are truncating were never visible to userspace.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
- truncate_inode_pages(inode->i_mapping, inode->i_size);
+ filemap_invalidate_lock(mapping);
+ truncate_inode_pages(mapping, inode->i_size);
ext4_truncate(inode);
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
}
/*
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index dc5ec724d889..3c640bd7ecae 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -45,16 +45,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
- void *addr;
page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
NULL);
if (IS_ERR(page))
return PTR_ERR(page);
- addr = kmap_atomic(page);
- memcpy(buf, addr + offset_in_page(pos), n);
- kunmap_atomic(addr);
+ memcpy_from_page(buf, page, offset_in_page(pos), n);
put_page(page);
@@ -72,6 +69,9 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
static int pagecache_write(struct inode *inode, const void *buf, size_t count,
loff_t pos)
{
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+
if (pos + count > inode->i_sb->s_maxbytes)
return -EFBIG;
@@ -80,20 +80,15 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
void *fsdata;
- void *addr;
int res;
- res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
- &page, &fsdata);
+ res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
if (res)
return res;
- addr = kmap_atomic(page);
- memcpy(addr + offset_in_page(pos), buf, n);
- kunmap_atomic(addr);
+ memcpy_to_page(page, offset_in_page(pos), buf, n);
- res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
- page, fsdata);
+ res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
if (res < 0)
return res;
if (res != n)
@@ -113,6 +108,9 @@ static int ext4_begin_enable_verity(struct file *filp)
handle_t *handle;
int err;
+ if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EINVAL;
+
if (ext4_verity_in_progress(inode))
return -EBUSY;
@@ -198,55 +196,76 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
struct inode *inode = file_inode(filp);
const int credits = 2; /* superblock and inode for ext4_orphan_del() */
handle_t *handle;
+ struct ext4_iloc iloc;
int err = 0;
- int err2;
-
- if (desc != NULL) {
- /* Succeeded; write the verity descriptor. */
- err = ext4_write_verity_descriptor(inode, desc, desc_size,
- merkle_tree_size);
- /* Write all pages before clearing VERITY_IN_PROGRESS. */
- if (!err)
- err = filemap_write_and_wait(inode->i_mapping);
- }
+ /*
+ * If an error already occurred (which fs/verity/ signals by passing
+ * desc == NULL), then only clean-up is needed.
+ */
+ if (desc == NULL)
+ goto cleanup;
- /* If we failed, truncate anything we wrote past i_size. */
- if (desc == NULL || err)
- ext4_truncate(inode);
+ /* Append the verity descriptor. */
+ err = ext4_write_verity_descriptor(inode, desc, desc_size,
+ merkle_tree_size);
+ if (err)
+ goto cleanup;
/*
- * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
- * deleting the inode from the orphan list, even if something failed.
- * If everything succeeded, we'll also set the verity bit in the same
- * transaction.
+ * Write all pages (both data and verity metadata). Note that this must
+ * happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages
+ * beyond i_size won't be written properly. For crash consistency, this
+ * also must happen before the verity inode flag gets persisted.
*/
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto cleanup;
- ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+ /*
+ * Finally, set the verity inode flag and remove the inode from the
+ * orphan list (in a single transaction).
+ */
handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
- return PTR_ERR(handle);
+ err = PTR_ERR(handle);
+ goto cleanup;
}
- err2 = ext4_orphan_del(handle, inode);
- if (err2)
- goto out_stop;
+ err = ext4_orphan_del(handle, inode);
+ if (err)
+ goto stop_and_cleanup;
- if (desc != NULL && !err) {
- struct ext4_iloc iloc;
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto stop_and_cleanup;
+
+ ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
+ ext4_set_inode_flags(inode, false);
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (err)
+ goto stop_and_cleanup;
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
- goto out_stop;
- ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
- ext4_set_inode_flags(inode);
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
- }
-out_stop:
ext4_journal_stop(handle);
- return err ?: err2;
+
+ ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+ return 0;
+
+stop_and_cleanup:
+ ext4_journal_stop(handle);
+cleanup:
+ /*
+ * Verity failed to be enabled, so clean up by truncating any verity
+ * metadata that was written beyond i_size (both from cache and from
+ * disk), removing the inode from the orphan list (if it wasn't done
+ * already), and clearing EXT4_STATE_VERITY_IN_PROGRESS.
+ */
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+ ext4_orphan_del(NULL, inode);
+ ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+ return err;
}
static int ext4_get_verity_descriptor_location(struct inode *inode,
@@ -279,16 +298,14 @@ static int ext4_get_verity_descriptor_location(struct inode *inode,
last_extent = path[path->p_depth].p_ext;
if (!last_extent) {
EXT4_ERROR_INODE(inode, "verity file has no extents");
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return -EFSCORRUPTED;
}
end_lblk = le32_to_cpu(last_extent->ee_block) +
ext4_ext_get_actual_len(last_extent);
desc_size_pos = (u64)end_lblk << inode->i_blkbits;
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (desc_size_pos < sizeof(desc_size_disk))
goto bad;
@@ -342,37 +359,6 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
return desc_size;
}
-/*
- * Prefetch some pages from the file's Merkle tree.
- *
- * This is basically a stripped-down version of __do_page_cache_readahead()
- * which works on pages past i_size.
- */
-static void ext4_merkle_tree_readahead(struct address_space *mapping,
- pgoff_t start_index, unsigned long count)
-{
- LIST_HEAD(pages);
- unsigned int nr_pages = 0;
- struct page *page;
- pgoff_t index;
- struct blk_plug plug;
-
- for (index = start_index; index < start_index + count; index++) {
- page = xa_load(&mapping->i_pages, index);
- if (!page || xa_is_value(page)) {
- page = __page_cache_alloc(readahead_gfp_mask(mapping));
- if (!page)
- break;
- page->index = index;
- list_add(&page->lru, &pages);
- nr_pages++;
- }
- }
- blk_start_plug(&plug);
- ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true);
- blk_finish_plug(&plug);
-}
-
static struct page *ext4_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
@@ -383,11 +369,12 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
+
if (page)
put_page(page);
else if (num_ra_pages > 1)
- ext4_merkle_tree_readahead(inode->i_mapping, index,
- num_ra_pages);
+ page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
page = read_mapping_page(inode->i_mapping, index, NULL);
}
return page;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8cac7d95c3ad..36d6ba7190b6 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -93,6 +93,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = {
#ifdef CONFIG_EXT4_FS_SECURITY
[EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
#endif
+ [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler,
};
const struct xattr_handler *ext4_xattr_handlers[] = {
@@ -105,6 +106,7 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
#ifdef CONFIG_EXT4_FS_SECURITY
&ext4_xattr_security_handler,
#endif
+ &ext4_xattr_hurd_handler,
NULL
};
@@ -245,7 +247,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
bh->b_data);
errout:
if (error)
- __ext4_error_inode(inode, function, line, 0,
+ __ext4_error_inode(inode, function, line, 0, -error,
"corrupted xattr block %llu",
(unsigned long long) bh->b_blocknr);
else
@@ -269,7 +271,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header));
errout:
if (error)
- __ext4_error_inode(inode, function, line, 0,
+ __ext4_error_inode(inode, function, line, 0, -error,
"corrupted in-inode xattr");
return error;
}
@@ -434,6 +436,21 @@ error:
return err;
}
+/* Remove entry from mbcache when EA inode is getting evicted */
+void ext4_evict_ea_inode(struct inode *inode)
+{
+ struct mb_cache_entry *oe;
+
+ if (!EA_INODE_CACHE(inode))
+ return;
+ /* Wait for entry to get unused so that we can remove it */
+ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode),
+ ext4_xattr_inode_get_hash(inode), inode->i_ino))) {
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(EA_INODE_CACHE(inode), oe);
+ }
+}
+
static int
ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
struct ext4_xattr_entry *entry, void *buffer,
@@ -789,9 +806,13 @@ static void ext4_xattr_update_super_block(handle_t *handle,
return;
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
+ if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE) == 0) {
+ lock_buffer(EXT4_SB(sb)->s_sbh);
ext4_set_feature_xattr(sb);
- ext4_handle_dirty_super(handle, sb);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(EXT4_SB(sb)->s_sbh);
+ ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
}
}
@@ -970,10 +991,8 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
int ref_change)
{
- struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
struct ext4_iloc iloc;
s64 ref_count;
- u32 hash;
int ret;
inode_lock(ea_inode);
@@ -996,14 +1015,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
set_nlink(ea_inode, 1);
ext4_orphan_del(handle, ea_inode);
-
- if (ea_inode_cache) {
- hash = ext4_xattr_inode_get_hash(ea_inode);
- mb_cache_entry_create(ea_inode_cache,
- GFP_NOFS, hash,
- ea_inode->i_ino,
- true /* reusable */);
- }
}
} else {
WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
@@ -1016,12 +1027,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
clear_nlink(ea_inode);
ext4_orphan_add(handle, ea_inode);
-
- if (ea_inode_cache) {
- hash = ext4_xattr_inode_get_hash(ea_inode);
- mb_cache_entry_delete(ea_inode_cache, hash,
- ea_inode->i_ino);
- }
}
}
@@ -1164,7 +1169,8 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
continue;
}
if (err > 0) {
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle,
+ parent->i_sb, bh, EXT4_JTR_NONE);
if (err) {
ext4_warning_inode(ea_inode,
"Re-get write access err=%d",
@@ -1225,10 +1231,12 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
int error = 0;
BUFFER_TRACE(bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
+retry_ref:
lock_buffer(bh);
hash = le32_to_cpu(BHDR(bh)->h_hash);
ref = le32_to_cpu(BHDR(bh)->h_refcount);
@@ -1238,9 +1246,18 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
* This must happen under buffer lock for
* ext4_xattr_block_set() to reliably detect freed block
*/
- if (ea_block_cache)
- mb_cache_entry_delete(ea_block_cache, hash,
- bh->b_blocknr);
+ if (ea_block_cache) {
+ struct mb_cache_entry *oe;
+
+ oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
+ bh->b_blocknr);
+ if (oe) {
+ unlock_buffer(bh);
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto retry_ref;
+ }
+ }
get_bh(bh);
unlock_buffer(bh);
@@ -1327,7 +1344,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
int blocksize = ea_inode->i_sb->s_blocksize;
int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
int csize, wsize = 0;
- int ret = 0;
+ int ret = 0, ret2 = 0;
int retries = 0;
retry:
@@ -1354,8 +1371,7 @@ retry:
block = 0;
while (wsize < bufsize) {
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
csize = (bufsize - wsize) > blocksize ? blocksize :
bufsize - wsize;
bh = ext4_getblk(handle, ea_inode, block, 0);
@@ -1367,7 +1383,8 @@ retry:
"ext4_getblk() return bh = NULL");
return -EFSCORRUPTED;
}
- ret = ext4_journal_get_write_access(handle, bh);
+ ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (ret)
goto out;
@@ -1385,7 +1402,9 @@ retry:
ext4_update_i_disksize(ea_inode, wsize);
inode_unlock(ea_inode);
- ext4_mark_inode_dirty(handle, ea_inode);
+ ret2 = ext4_mark_inode_dirty(handle, ea_inode);
+ if (unlikely(ret2 && !ret))
+ ret = ret2;
out:
brelse(bh);
@@ -1456,6 +1475,9 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
if (!ce)
return NULL;
+ WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
+ !(current->flags & PF_MEMALLOC_NOFS));
+
ea_data = kvmalloc(value_len, GFP_KERNEL);
if (!ea_data) {
mb_cache_entry_put(ea_inode_cache, ce);
@@ -1608,7 +1630,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
* If storing the value in an external inode is an option,
* reserve space for xattr entries/names in the external
* attribute block so that a long value does not occupy the
- * whole space and prevent futher entries being added.
+ * whole space and prevent further entries being added.
*/
if (ext4_has_feature_ea_inode(inode->i_sb) &&
new_size && is_block &&
@@ -1800,8 +1822,11 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
if (EXT4_I(inode)->i_file_acl) {
/* The inode already has an extended attribute block. */
bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
- if (IS_ERR(bs->bh))
- return PTR_ERR(bs->bh);
+ if (IS_ERR(bs->bh)) {
+ error = PTR_ERR(bs->bh);
+ bs->bh = NULL;
+ return error;
+ }
ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
@@ -1842,8 +1867,11 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
#define header(x) ((struct ext4_xattr_header *)(x))
if (s->base) {
+ int offset = (char *)s->here - bs->bh->b_data;
+
BUFFER_TRACE(bs->bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, bs->bh);
+ error = ext4_journal_get_write_access(handle, sb, bs->bh,
+ EXT4_JTR_NONE);
if (error)
goto cleanup;
lock_buffer(bs->bh);
@@ -1856,9 +1884,20 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
* ext4_xattr_block_set() to reliably detect modified
* block
*/
- if (ea_block_cache)
- mb_cache_entry_delete(ea_block_cache, hash,
- bs->bh->b_blocknr);
+ if (ea_block_cache) {
+ struct mb_cache_entry *oe;
+
+ oe = mb_cache_entry_delete_or_get(ea_block_cache,
+ hash, bs->bh->b_blocknr);
+ if (oe) {
+ /*
+ * Xattr block is getting reused. Leave
+ * it alone.
+ */
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto clone_block;
+ }
+ }
ea_bdebug(bs->bh, "modifying in-place");
error = ext4_xattr_set_entry(i, s, handle, inode,
true /* is_block */);
@@ -1873,55 +1912,51 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (error)
goto cleanup;
goto inserted;
- } else {
- int offset = (char *)s->here - bs->bh->b_data;
+ }
+clone_block:
+ unlock_buffer(bs->bh);
+ ea_bdebug(bs->bh, "cloning");
+ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
+ error = -ENOMEM;
+ if (s->base == NULL)
+ goto cleanup;
+ s->first = ENTRY(header(s->base)+1);
+ header(s->base)->h_refcount = cpu_to_le32(1);
+ s->here = ENTRY(s->base + offset);
+ s->end = s->base + bs->bh->b_size;
- unlock_buffer(bs->bh);
- ea_bdebug(bs->bh, "cloning");
- s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
- error = -ENOMEM;
- if (s->base == NULL)
+ /*
+ * If existing entry points to an xattr inode, we need
+ * to prevent ext4_xattr_set_entry() from decrementing
+ * ref count on it because the reference belongs to the
+ * original block. In this case, make the entry look
+ * like it has an empty value.
+ */
+ if (!s->not_found && s->here->e_value_inum) {
+ ea_ino = le32_to_cpu(s->here->e_value_inum);
+ error = ext4_xattr_inode_iget(inode, ea_ino,
+ le32_to_cpu(s->here->e_hash),
+ &tmp_inode);
+ if (error)
goto cleanup;
- memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
- s->first = ENTRY(header(s->base)+1);
- header(s->base)->h_refcount = cpu_to_le32(1);
- s->here = ENTRY(s->base + offset);
- s->end = s->base + bs->bh->b_size;
- /*
- * If existing entry points to an xattr inode, we need
- * to prevent ext4_xattr_set_entry() from decrementing
- * ref count on it because the reference belongs to the
- * original block. In this case, make the entry look
- * like it has an empty value.
- */
- if (!s->not_found && s->here->e_value_inum) {
- ea_ino = le32_to_cpu(s->here->e_value_inum);
- error = ext4_xattr_inode_iget(inode, ea_ino,
- le32_to_cpu(s->here->e_hash),
- &tmp_inode);
- if (error)
- goto cleanup;
-
- if (!ext4_test_inode_state(tmp_inode,
- EXT4_STATE_LUSTRE_EA_INODE)) {
- /*
- * Defer quota free call for previous
- * inode until success is guaranteed.
- */
- old_ea_inode_quota = le32_to_cpu(
- s->here->e_value_size);
- }
- iput(tmp_inode);
-
- s->here->e_value_inum = 0;
- s->here->e_value_size = 0;
+ if (!ext4_test_inode_state(tmp_inode,
+ EXT4_STATE_LUSTRE_EA_INODE)) {
+ /*
+ * Defer quota free call for previous
+ * inode until success is guaranteed.
+ */
+ old_ea_inode_quota = le32_to_cpu(
+ s->here->e_value_size);
}
+ iput(tmp_inode);
+
+ s->here->e_value_inum = 0;
+ s->here->e_value_size = 0;
}
} else {
/* Allocate a buffer where we construct the new block. */
s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
- /* assert(header == s->base) */
error = -ENOMEM;
if (s->base == NULL)
goto cleanup;
@@ -1976,25 +2011,21 @@ inserted:
if (error)
goto cleanup;
BUFFER_TRACE(new_bh, "get_write_access");
- error = ext4_journal_get_write_access(handle,
- new_bh);
+ error = ext4_journal_get_write_access(
+ handle, sb, new_bh,
+ EXT4_JTR_NONE);
if (error)
goto cleanup_dquot;
lock_buffer(new_bh);
/*
* We have to be careful about races with
- * freeing, rehashing or adding references to
- * xattr block. Once we hold buffer lock xattr
- * block's state is stable so we can check
- * whether the block got freed / rehashed or
- * not. Since we unhash mbcache entry under
- * buffer lock when freeing / rehashing xattr
- * block, checking whether entry is still
- * hashed is reliable. Same rules hold for
- * e_reusable handling.
+ * adding references to xattr block. Once we
+ * hold buffer lock xattr block's state is
+ * stable so we can check the additional
+ * reference fits.
*/
- if (hlist_bl_unhashed(&ce->e_hash_list) ||
- !ce->e_reusable) {
+ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+ if (ref > EXT4_XATTR_REFCOUNT_MAX) {
/*
* Undo everything and check mbcache
* again.
@@ -2009,9 +2040,8 @@ inserted:
new_bh = NULL;
goto inserted;
}
- ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
- if (ref >= EXT4_XATTR_REFCOUNT_MAX)
+ if (ref == EXT4_XATTR_REFCOUNT_MAX)
ce->e_reusable = 0;
ea_bdebug(new_bh, "reusing; refcount now=%d",
ref);
@@ -2081,7 +2111,8 @@ getblk_failed:
}
lock_buffer(new_bh);
- error = ext4_journal_get_create_access(handle, new_bh);
+ error = ext4_journal_get_create_access(handle, sb,
+ new_bh, EXT4_JTR_NONE);
if (error) {
unlock_buffer(new_bh);
error = -EIO;
@@ -2158,8 +2189,9 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
struct ext4_inode *raw_inode;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return 0;
+
raw_inode = ext4_raw_inode(&is->iloc);
header = IHDR(inode, raw_inode);
is->s.base = is->s.first = IFIRST(header);
@@ -2179,7 +2211,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
return 0;
}
-int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is)
{
@@ -2187,32 +2219,9 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_search *s = &is->s;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return -ENOSPC;
- error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
- if (error)
- return error;
- header = IHDR(inode, ext4_raw_inode(&is->iloc));
- if (!IS_LAST_ENTRY(s->first)) {
- header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
- ext4_set_inode_state(inode, EXT4_STATE_XATTR);
- } else {
- header->h_magic = cpu_to_le32(0);
- ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
- }
- return 0;
-}
-static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
-{
- struct ext4_xattr_ibody_header *header;
- struct ext4_xattr_search *s = &is->s;
- int error;
-
- if (EXT4_I(inode)->i_extra_isize == 0)
- return -ENOSPC;
error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
if (error)
return error;
@@ -2319,6 +2328,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
error = -ENOSPC;
goto cleanup;
}
+ WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
}
error = ext4_reserve_inode_write(handle, inode, &is.iloc);
@@ -2392,7 +2402,7 @@ retry_inode:
* external inode if possible.
*/
if (ext4_has_feature_ea_inode(inode->i_sb) &&
- !i.in_inode) {
+ i.value_len && !i.in_inode) {
i.in_inode = 1;
goto retry_inode;
}
@@ -2402,6 +2412,7 @@ retry_inode:
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
if (!value)
no_expand = 0;
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
@@ -2413,6 +2424,7 @@ retry_inode:
if (IS_SYNC(inode))
ext4_handle_sync(handle);
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
cleanup:
brelse(is.iloc.bh);
@@ -2490,6 +2502,7 @@ retry:
if (error == 0)
error = error2;
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL);
return error;
}
@@ -2858,7 +2871,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
goto cleanup;
}
- error = ext4_journal_get_write_access(handle, iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb,
+ iloc.bh, EXT4_JTR_NONE);
if (error) {
EXT4_ERROR_INODE(inode, "write access (error %d)",
error);
@@ -2880,9 +2894,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
if (IS_ERR(bh)) {
error = PTR_ERR(bh);
if (error == -EIO) {
- ext4_set_errno(inode->i_sb, EIO);
- EXT4_ERROR_INODE(inode, "block %llu read error",
- EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE_ERR(inode, EIO,
+ "block %llu read error",
+ EXT4_I(inode)->i_file_acl);
}
bh = NULL;
goto cleanup;
@@ -2922,6 +2936,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
error);
goto cleanup;
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
}
error = 0;
cleanup:
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index f39cad2abe2a..824faf0b15a8 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -48,7 +48,7 @@ struct ext4_xattr_entry {
__le32 e_value_inum; /* inode in which the value is stored */
__le32 e_value_size; /* size of attribute value */
__le32 e_hash; /* hash value of name and value */
- char e_name[0]; /* attribute name */
+ char e_name[]; /* attribute name */
};
#define EXT4_XATTR_PAD_BITS 2
@@ -84,7 +84,7 @@ struct ext4_xattr_entry {
/*
* The minimum size of EA value when you start storing it in an external inode
* size of block - size of header - size of 1 entry - 4 null bytes
-*/
+ */
#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \
((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
@@ -95,6 +95,19 @@ struct ext4_xattr_entry {
#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+/*
+ * If we want to add an xattr to the inode, we should make sure that
+ * i_extra_isize is not 0 and that the inode size is not less than
+ * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad.
+ * EXT4_GOOD_OLD_INODE_SIZE extra_isize header entry pad data
+ * |--------------------------|------------|------|---------|---|-------|
+ */
+#define EXT4_INODE_HAS_XATTR_SPACE(inode) \
+ ((EXT4_I(inode)->i_extra_isize != 0) && \
+ (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + \
+ sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <= \
+ EXT4_INODE_SIZE((inode)->i_sb)))
+
struct ext4_xattr_info {
const char *name;
const void *value;
@@ -118,12 +131,13 @@ struct ext4_xattr_ibody_find {
struct ext4_xattr_inode_array {
unsigned int count; /* # of used items in the array */
- struct inode *inodes[0];
+ struct inode *inodes[];
};
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_xattr_hurd_handler;
#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
@@ -177,6 +191,7 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
+extern void ext4_evict_ea_inode(struct inode *inode);
extern const struct xattr_handler *ext4_xattr_handlers[];
@@ -185,9 +200,9 @@ extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
const char *name,
void *buffer, size_t buffer_size);
-extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is);
+extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c
new file mode 100644
index 000000000000..c78df5790377
--- /dev/null
+++ b/fs/ext4/xattr_hurd.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/ext4/xattr_hurd.c
+ * Handler for extended gnu attributes for the Hurd.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ * Copyright (C) 2020 by Jan (janneke) Nieuwenhuizen, <janneke@gnu.org>
+ */
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include "ext4.h"
+#include "xattr.h"
+
+static bool
+ext4_xattr_hurd_list(struct dentry *dentry)
+{
+ return test_opt(dentry->d_sb, XATTR_USER);
+}
+
+static int
+ext4_xattr_hurd_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_HURD,
+ name, buffer, size);
+}
+
+static int
+ext4_xattr_hurd_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_HURD,
+ name, value, size, flags);
+}
+
+const struct xattr_handler ext4_xattr_hurd_handler = {
+ .prefix = XATTR_HURD_PREFIX,
+ .list = ext4_xattr_hurd_list,
+ .get = ext4_xattr_hurd_get,
+ .set = ext4_xattr_hurd_set,
+};
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 197a9d8a15ef..8213f66f7b2d 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -23,6 +23,7 @@ ext4_xattr_security_get(const struct xattr_handler *handler,
static int
ext4_xattr_security_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index e9389e5d75c3..7c21ffb26d25 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -30,6 +30,7 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler,
static int
ext4_xattr_trusted_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index d4546184b34b..2fe7ff0a479c 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -31,6 +31,7 @@ ext4_xattr_user_get(const struct xattr_handler *handler,
static int
ext4_xattr_user_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)