From 6e06ae88edae77379bef7c0cb7d3c2dd88676867 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 12 Jul 2015 18:11:30 -0400 Subject: jbd2: speedup jbd2_journal_dirty_metadata() It is often the case that we mark buffer as having dirty metadata when the buffer is already in that state (frequent for bitmaps, inode table blocks, superblock). Thus it is unnecessary to contend on grabbing journal head reference and bh_state lock. Avoid that by checking whether any modification to the buffer is needed before grabbing any locks or references. [ Note: this is a fixed version of commit 2143c1965a761, which was reverted in ebeaa8ddb3663b5 due to a false positive triggering of an assertion check. -- Ted ] Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index f3d06174b051..a6eec9747c74 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1280,8 +1280,6 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, triggers->t_abort(triggers, jh2bh(jh)); } - - /** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. @@ -1314,12 +1312,41 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (is_handle_aborted(handle)) return -EROFS; - journal = transaction->t_journal; - jh = jbd2_journal_grab_journal_head(bh); - if (!jh) { + if (!buffer_jbd(bh)) { ret = -EUCLEAN; goto out; } + /* + * We don't grab jh reference here since the buffer must be part + * of the running transaction. + */ + jh = bh2jh(bh); + /* + * This and the following assertions are unreliable since we may see jh + * in inconsistent state unless we grab bh_state lock. But this is + * crucial to catch bugs so let's do a reliable check until the + * lockless handling is fully proven. + */ + if (jh->b_transaction != transaction && + jh->b_next_transaction != transaction) { + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_next_transaction == transaction); + jbd_unlock_bh_state(bh); + } + if (jh->b_modified == 1) { + /* If it's in our transaction it must be in BJ_Metadata list. */ + if (jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata) { + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction != transaction || + jh->b_jlist == BJ_Metadata); + jbd_unlock_bh_state(bh); + } + goto out; + } + + journal = transaction->t_journal; jbd_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); @@ -1410,7 +1437,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); - jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); return ret; -- cgit v1.2.3-59-g8ed1b From 806c24adf74ec02543e4dcad989c0336f9fe82c4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Jul 2015 11:16:47 -0400 Subject: ext4 crypto: use a jbd2 transaction when adding a crypto policy Start a jbd2 transaction, and mark the inode dirty on the inode under that transaction after setting the encrypt flag. Otherwise if the directory isn't modified after setting the crypto policy, the encrypted flag might not survive the inode getting pushed out from memory, or the the file system getting unmounted and remounted. Signed-off-by: Theodore Ts'o --- fs/ext4/crypto_policy.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 02c4e5df7afb..a640ec2c4b13 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -12,6 +12,7 @@ #include #include +#include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" @@ -49,7 +50,8 @@ static int ext4_create_encryption_context_from_policy( struct inode *inode, const struct ext4_encryption_policy *policy) { struct ext4_encryption_context ctx; - int res = 0; + handle_t *handle; + int res, res2; res = ext4_convert_inline_data(inode); if (res) @@ -78,11 +80,22 @@ static int ext4_create_encryption_context_from_policy( BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + handle = ext4_journal_start(inode, EXT4_HT_MISC, + ext4_jbd2_credits_xattr(inode)); + if (IS_ERR(handle)) + return PTR_ERR(handle); res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); - if (!res) + if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + res = ext4_mark_inode_dirty(handle, inode); + if (res) + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); + } + res2 = ext4_journal_stop(handle); + if (!res) + res = res2; return res; } -- cgit v1.2.3-59-g8ed1b From 27977b69e493c9b259eb0490534e0f74bc325ba8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Jul 2015 11:33:16 -0400 Subject: ext4 crypto: check for too-short encrypted file names An encrypted file name should never be shorter than an 16 bytes, the AES block size. The 3.10 crypto layer will oops and crash the kernel if ciphertext shorter than the block size is passed to it. Fortunately, in modern kernels the crypto layer will not crash the kernel in this scenario, but nevertheless, it represents a corrupted directory, and we should detect it and mark the file system as corrupted so that e2fsck can fix this. Signed-off-by: Theodore Ts'o --- fs/ext4/crypto_fname.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 7dc4eb55913c..86ee996a2bd4 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -329,6 +329,10 @@ int _ext4_fname_disk_to_usr(struct inode *inode, return oname->len; } } + if (iname->len < EXT4_CRYPTO_BLOCK_SIZE) { + EXT4_ERROR_INODE(inode, "encrypted inode too small"); + return -EUCLEAN; + } if (EXT4_I(inode)->i_crypt_info) return ext4_fname_decrypt(inode, iname, oname); -- cgit v1.2.3-59-g8ed1b From 5a33911fa5ecd7395115df2e27fbd22b73357ac5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Jul 2015 23:50:24 -0400 Subject: ext4: replace ext4_io_submit->io_op with ->io_wbc ext4_io_submit_init() takes the pointer to writeback_control to test its sync_mode and determine between WRITE and WRITE_SYNC and records the result in ->io_op. This patch makes it record the pointer directly and moves the test to ext4_io_submit(). This doesn't cause any noticeable differences now but having writeback_control available throughout IO submission path will be depended upon by the planned cgroup writeback support. Signed-off-by: Tejun Heo Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/page-io.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f5e9f04220c1..32071f5c1c26 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -187,7 +187,7 @@ typedef struct ext4_io_end { } ext4_io_end_t; struct ext4_io_submit { - int io_op; + struct writeback_control *io_wbc; struct bio *io_bio; ext4_io_end_t *io_end; sector_t io_next_block; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5602450f03f6..a917bfe3e70c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -357,8 +357,10 @@ void ext4_io_submit(struct ext4_io_submit *io) struct bio *bio = io->io_bio; if (bio) { + int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE; bio_get(io->io_bio); - submit_bio(io->io_op, io->io_bio); + submit_bio(io_op, io->io_bio); bio_put(io->io_bio); } io->io_bio = NULL; @@ -367,7 +369,7 @@ void ext4_io_submit(struct ext4_io_submit *io) void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc) { - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_wbc = wbc; io->io_bio = NULL; io->io_end = NULL; } -- cgit v1.2.3-59-g8ed1b From 001e4a8775f6e8ad52a89e0072f09aee47d5d252 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Jul 2015 23:51:26 -0400 Subject: ext4: implement cgroup writeback support For ordered and writeback data modes, all data IOs go through ext4_io_submit. This patch adds cgroup writeback support by invoking wbc_init_bio() from io_submit_init_bio() and wbc_account_io() in io_submit_add_bh(). Journal data which is written by jbd2 worker is left alone by this patch and will always be written out from the root cgroup. ext4_fill_super() is updated to set MS_CGROUPWB when data mode is either ordered or writeback. In journaled data mode, most IOs become synchronous through the journal and enabling cgroup writeback support doesn't make much sense or difference. Journaled data mode is left alone. Lightly tested with sequential data write workload. Behaves as expected. Signed-off-by: Tejun Heo Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 2 ++ fs/ext4/super.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a917bfe3e70c..58ab2e3dd114 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -383,6 +383,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); if (!bio) return -ENOMEM; + wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_end_io = ext4_end_bio; @@ -411,6 +412,7 @@ submit_and_retry: ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; + wbc_account_io(io->io_wbc, page, bh->b_size); io->io_next_block++; return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 58987b5c514b..d2c9a7985fd7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3643,6 +3643,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); + } else { + sb->s_iflags |= SB_I_CGROUPWB; } sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | -- cgit v1.2.3-59-g8ed1b From 5ba92bcf0dd63b35a85e90c3016989733b239bb5 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 21 Jul 2015 23:57:59 -0400 Subject: ext4: reject journal options for ext2 mounts There is no reason to allow ext2 filesystems be mounted with journal mount options. So, this patch adds them to the MOPT_NO_EXT2 mount options list. Signed-off-by: Carlos Maiolino Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d2c9a7985fd7..14909cd91515 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1394,9 +1394,9 @@ static const struct mount_opts { {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, - {Opt_journal_dev, 0, MOPT_GTE0}, - {Opt_journal_path, 0, MOPT_STRING}, - {Opt_journal_ioprio, 0, MOPT_GTE0}, + {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0}, + {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING}, + {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0}, {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, -- cgit v1.2.3-59-g8ed1b From d76d99b219e1a233a720775c0451c310d34594e8 Mon Sep 17 00:00:00 2001 From: Laurent Navet Date: Wed, 22 Jul 2015 00:08:08 -0400 Subject: ext4 crypto: exit cleanly if ext4_derive_key_aes() fails Return value of ext4_derive_key_aes() is stored but not used. Add test to exit cleanly if ext4_derive_key_aes() fail. Also fix coverity CID 1309760. Signed-off-by: Laurent Navet Signed-off-by: Theodore Ts'o --- fs/ext4/crypto_key.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 442d24e8efc0..ce75bc8b9aef 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -220,6 +220,8 @@ retry: BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); res = ext4_derive_key_aes(ctx.nonce, master_key->raw, raw_key); + if (res) + goto out; got_key: ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { -- cgit v1.2.3-59-g8ed1b From bb9a4e7e824e998070c2a2d1d4c67bc971ab72b8 Mon Sep 17 00:00:00 2001 From: Laurent Navet Date: Wed, 22 Jul 2015 00:09:45 -0400 Subject: ext4 crypto: fix spelling typo in comment Signed-off-by: Laurent Navet Signed-off-by: Theodore Ts'o --- fs/ext4/crypto_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index ce75bc8b9aef..1d510c11b100 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -30,7 +30,7 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) /** * ext4_derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivatio. + * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. * @derived_key: Derived key. * -- cgit v1.2.3-59-g8ed1b From 564bc402526e437729ecafe3c3511f7cab9f0327 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 23 Jul 2015 09:46:11 -0400 Subject: ext4, jbd2: add REQ_FUA flag when recording an error in the superblock When an error condition is detected, an error status should be recorded into superblocks of EXT4 or JBD2. However, the write request is submitted now without REQ_FUA flag, even in "barrier=1" mode, which is followed by panic() function in "errors=panic" mode. On mobile devices which make whole system reset as soon as kernel panic occurs, this write request containing an error flag will disappear just from storage cache without written to the physical cells. Therefore, when next start, even forever, the error flag cannot be shown in both superblocks, and e2fsck cannot fix the filesystem problems automatically, unless e2fsck is executed in force checking mode. [ Changed use test_opt(sb, BARRIER) of checking the journal flags -- TYT ] Signed-off-by: Daeho Jeong Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 3 ++- fs/jbd2/journal.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 14909cd91515..a51db9ca90fd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4667,7 +4667,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) ext4_superblock_csum_set(sb); mark_buffer_dirty(sbh); if (sync) { - error = sync_dirty_buffer(sbh); + error = __sync_dirty_buffer(sbh, + test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC); if (error) return error; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 4ff3fad4e9e3..fe1b4bdecdfa 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1456,7 +1456,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_SYNC); + jbd2_write_superblock(journal, WRITE_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); -- cgit v1.2.3-59-g8ed1b From 841df7df196237ea63233f0f9eaa41db53afd70f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Jul 2015 14:57:14 -0400 Subject: jbd2: avoid infinite loop when destroying aborted journal Commit 6f6a6fda2945 "jbd2: fix ocfs2 corrupt when updating journal superblock fails" changed jbd2_cleanup_journal_tail() to return EIO when the journal is aborted. That makes logic in jbd2_log_do_checkpoint() bail out which is fine, except that jbd2_journal_destroy() expects jbd2_log_do_checkpoint() to always make a progress in cleaning the journal. Without it jbd2_journal_destroy() just loops in an infinite loop. Fix jbd2_journal_destroy() to cleanup journal checkpoint lists of jbd2_log_do_checkpoint() fails with error. Reported-by: Eryu Guan Tested-by: Eryu Guan Fixes: 6f6a6fda294506dfe0e3e0a253bb2d2923f28f0a Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 39 +++++++++++++++++++++++++++++++++------ fs/jbd2/commit.c | 2 +- fs/jbd2/journal.c | 11 ++++++++++- include/linux/jbd2.h | 3 ++- 4 files changed, 46 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 4227dc4f7437..8c44654ce274 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -417,12 +417,12 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * journal_clean_one_cp_list * * Find all the written-back checkpoint buffers in the given list and - * release them. + * release them. If 'destroy' is set, clean all buffers unconditionally. * * Called with j_list_lock held. * Returns 1 if we freed the transaction, 0 otherwise. */ -static int journal_clean_one_cp_list(struct journal_head *jh) +static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) { struct journal_head *last_jh; struct journal_head *next_jh = jh; @@ -436,7 +436,10 @@ static int journal_clean_one_cp_list(struct journal_head *jh) do { jh = next_jh; next_jh = jh->b_cpnext; - ret = __try_to_free_cp_buf(jh); + if (!destroy) + ret = __try_to_free_cp_buf(jh); + else + ret = __jbd2_journal_remove_checkpoint(jh) + 1; if (!ret) return freed; if (ret == 2) @@ -459,10 +462,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh) * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. + * If 'destroy' is set, release all buffers unconditionally. * * Called with j_list_lock held. */ -void __jbd2_journal_clean_checkpoint_list(journal_t *journal) +void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) { transaction_t *transaction, *last_transaction, *next_transaction; int ret; @@ -476,7 +480,8 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal) do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret = journal_clean_one_cp_list(transaction->t_checkpoint_list); + ret = journal_clean_one_cp_list(transaction->t_checkpoint_list, + destroy); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if @@ -492,7 +497,7 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal) * we can possibly see not yet submitted buffers on io_list */ ret = journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list); + t_checkpoint_io_list, destroy); if (need_resched()) return; /* @@ -505,6 +510,28 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal) } while (transaction != last_transaction); } +/* + * Remove buffers from all checkpoint lists as journal is aborted and we just + * need to free memory + */ +void jbd2_journal_destroy_checkpoint(journal_t *journal) +{ + /* + * We loop because __jbd2_journal_clean_checkpoint_list() may abort + * early due to a need of rescheduling. + */ + while (1) { + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) { + spin_unlock(&journal->j_list_lock); + break; + } + __jbd2_journal_clean_checkpoint_list(journal, true); + spin_unlock(&journal->j_list_lock); + cond_resched(); + } +} + /* * journal_remove_checkpoint: called after a buffer has been committed * to disk (either by being write-back flushed to disk, or being diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b73e0215baa7..362e5f614450 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -510,7 +510,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * frees some memory */ spin_lock(&journal->j_list_lock); - __jbd2_journal_clean_checkpoint_list(journal); + __jbd2_journal_clean_checkpoint_list(journal, false); spin_unlock(&journal->j_list_lock); jbd_debug(3, "JBD2: commit phase 1\n"); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fe1b4bdecdfa..8270fe9e3641 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1693,8 +1693,17 @@ int jbd2_journal_destroy(journal_t *journal) while (journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); mutex_lock(&journal->j_checkpoint_mutex); - jbd2_log_do_checkpoint(journal); + err = jbd2_log_do_checkpoint(journal); mutex_unlock(&journal->j_checkpoint_mutex); + /* + * If checkpointing failed, just free the buffers to avoid + * looping forever + */ + if (err) { + jbd2_journal_destroy_checkpoint(journal); + spin_lock(&journal->j_list_lock); + break; + } spin_lock(&journal->j_list_lock); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index edb640ae9a94..eb1cebed3f36 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1042,8 +1042,9 @@ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ -void __jbd2_journal_clean_checkpoint_list(journal_t *journal); +void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy); int __jbd2_journal_remove_checkpoint(struct journal_head *); +void jbd2_journal_destroy_checkpoint(journal_t *journal); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); -- cgit v1.2.3-59-g8ed1b From 911af577de4e444622d46500c1f9a37ab4335d3a Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 28 Jul 2015 15:08:41 -0400 Subject: ext4: update c/mtime on truncate up Commit 3da40c7b0898 ("ext4: only call ext4_truncate when size <= isize") introduced a bug that c/mtime is not updated on truncate up. Fix the issue by setting c/mtime explicitly in the truncate up case. Note that ftruncate(2) is not affected, so you won't see this bug using truncate(1) and xfs_io(1). Signed-off-by: Zirong Lang Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cecf9aa10811..2442eb065e19 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4725,6 +4725,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = ext4_orphan_add(handle, inode); orphan = 1; } + /* + * Update c/mtime on truncate up, ext4_truncate() will + * update c/mtime in shrink case below + */ + if (!shrink) { + inode->i_mtime = ext4_current_time(inode); + inode->i_ctime = inode->i_mtime; + } down_write(&EXT4_I(inode)->i_data_sem); EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); -- cgit v1.2.3-59-g8ed1b From 923ae471773a6b324815ef3e3c5e787818ae129a Mon Sep 17 00:00:00 2001 From: "zilong.liu" Date: Tue, 28 Jul 2015 15:12:18 -0400 Subject: ext4 crypto: remove duplicate header file Remove key.h which is included twice in crypto_fname.c Signed-off-by: zilong.liu Signed-off-by: Theodore Ts'o --- fs/ext4/crypto_fname.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 86ee996a2bd4..847f919c84d9 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 6d3ec14d703c660c4baf8d726538b5415e23b4fb Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 4 Aug 2015 11:21:52 -0400 Subject: jbd2: limit number of reserved credits Currently there is no limitation on number of reserved credits we can ask for. If we ask for more reserved credits than 1/2 of maximum transaction size, or if total number of credits exceeds the maximum transaction size per operation (which is currently only possible with the former) we will spin forever in start_this_handle(). Fix this by adding this limitation at the start of start_this_handle(). This patch also removes the credit limitation 1/2 of maximum transaction size, since we really only want to limit the number of reserved credits. There is not much point to limit the credits if there is still space in the journal. This accidentally also fixes the online resize, where due to the limitation of the journal credits we're unable to grow file systems with 1k block size and size between 16M and 32M. It has been partially fixed by 2c869b262a10ca99cb866d04087d75311587a30c, but not entirely. Thanks Jan Kara for helping me getting the correct fix. Signed-off-by: Lukas Czerner Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a6eec9747c74..6b8338ec2464 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -204,6 +204,20 @@ static int add_transaction_credits(journal_t *journal, int blocks, * attach this handle to a new transaction. */ atomic_sub(total, &t->t_outstanding_credits); + + /* + * Is the number of reserved credits in the current transaction too + * big to fit this handle? Wait until reserved credits are freed. + */ + if (atomic_read(&journal->j_reserved_credits) + total > + journal->j_max_transaction_buffers) { + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_reserved, + atomic_read(&journal->j_reserved_credits) + total <= + journal->j_max_transaction_buffers); + return 1; + } + wait_transaction_locked(journal); return 1; } @@ -262,20 +276,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle, int rsv_blocks = 0; unsigned long ts = jiffies; + if (handle->h_rsv_handle) + rsv_blocks = handle->h_rsv_handle->h_buffer_credits; + /* - * 1/2 of transaction can be reserved so we can practically handle - * only 1/2 of maximum transaction size per operation + * Limit the number of reserved credits to 1/2 of maximum transaction + * size and limit the number of total credits to not exceed maximum + * transaction size per operation. */ - if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) { - printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", - current->comm, blocks, - journal->j_max_transaction_buffers / 2); + if ((rsv_blocks > journal->j_max_transaction_buffers / 2) || + (rsv_blocks + blocks > journal->j_max_transaction_buffers)) { + printk(KERN_ERR "JBD2: %s wants too many credits " + "credits:%d rsv_credits:%d max:%d\n", + current->comm, blocks, rsv_blocks, + journal->j_max_transaction_buffers); + WARN_ON(1); return -ENOSPC; } - if (handle->h_rsv_handle) - rsv_blocks = handle->h_rsv_handle->h_buffer_credits; - alloc_transaction: if (!journal->j_running_transaction) { /* -- cgit v1.2.3-59-g8ed1b From c642dc9e1aaed953597e7092d7df329e6234096e Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 15 Aug 2015 10:45:06 -0400 Subject: ext4: don't manipulate recovery flag when freezing no-journal fs At some point along this sequence of changes: f6e63f9 ext4: fold ext4_nojournal_sops into ext4_sops bb04457 ext4: support freezing ext2 (nojournal) file systems 9ca9238 ext4: Use separate super_operations structure for no_journal filesystems ext4 started setting needs_recovery on filesystems without journals when they are unfrozen. This makes no sense, and in fact confuses blkid to the point where it doesn't recognize the filesystem at all. (freeze ext2; unfreeze ext2; run blkid; see no output; run dumpe2fs, see needs_recovery set on fs w/ no journal). To fix this, don't manipulate the INCOMPAT_RECOVER feature on filesystems without journals. Reported-by: Stu Mark Reviewed-by: Jan Kara Signed-off-by: Eric Sandeen Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a51db9ca90fd..70c52ea0a27c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4836,10 +4836,11 @@ static int ext4_freeze(struct super_block *sb) error = jbd2_journal_flush(journal); if (error < 0) goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); } - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); out: if (journal) @@ -4857,8 +4858,11 @@ static int ext4_unfreeze(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return 0; - /* Reset the needs_recovery flag before the fs is unlocked. */ - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + if (EXT4_SB(sb)->s_journal) { + /* Reset the needs_recovery flag before the fs is unlocked. */ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + } + ext4_commit_super(sb, 1); return 0; } -- cgit v1.2.3-59-g8ed1b From 9810446836ab5a4b34a749bb77f3eb5836f982cc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 15 Aug 2015 11:30:31 -0400 Subject: ext4: simplify some code in read_mmp_block() My static check complains because we have: if (!*bh) return -ENOMEM; if (*bh) { The second check is unnecessary. I've simplified this code by moving the "if (!*bh)" checks around. Also Andreas Dilger says we should probably print a warning if sb_getblk() fails. [ Restructured the code so that we print a warning message as well if the mmp block doesn't check out, and to print the error code to disambiguate between the error cases. - TYT ] Reviewed-by: Andreas Dilger Signed-off-by: Dan Carpenter Signed-off-by: Theodore Ts'o --- fs/ext4/mmp.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 8313ca3324ec..048c52af2fb6 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -69,6 +69,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, ext4_fsblk_t mmp_block) { struct mmp_struct *mmp; + int ret; if (*bh) clear_buffer_uptodate(*bh); @@ -76,33 +77,36 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, /* This would be sb_bread(sb, mmp_block), except we need to be sure * that the MD RAID device cache has been bypassed, and that the read * is not blocked in the elevator. */ - if (!*bh) + if (!*bh) { *bh = sb_getblk(sb, mmp_block); - if (!*bh) - return -ENOMEM; - if (*bh) { - get_bh(*bh); - lock_buffer(*bh); - (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); - wait_on_buffer(*bh); - if (!buffer_uptodate(*bh)) { - brelse(*bh); - *bh = NULL; + if (!*bh) { + ret = -ENOMEM; + goto warn_exit; } } - if (unlikely(!*bh)) { - ext4_warning(sb, "Error while reading MMP block %llu", - mmp_block); - return -EIO; + + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + ret = -EIO; + goto warn_exit; } mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC || - !ext4_mmp_csum_verify(sb, mmp)) - return -EINVAL; - - return 0; + if (le32_to_cpu(mmp->mmp_magic) == EXT4_MMP_MAGIC && + ext4_mmp_csum_verify(sb, mmp)) + return 0; + ret = -EINVAL; + +warn_exit: + ext4_warning(sb, "Error %d while reading MMP block %llu", + ret, mmp_block); + return ret; } /* -- cgit v1.2.3-59-g8ed1b From da0b5e40ab10f44019a1ecec09fadfcd5bdb76b6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 15 Aug 2015 11:38:13 -0400 Subject: ext4: silence a format string false positive Static checkers complain that the format string should be "%s". It does not make a difference for the current code. Signed-off-by: Dan Carpenter Signed-off-by: Theodore Ts'o --- fs/ext4/mmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 048c52af2fb6..6eb1a619890c 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -115,7 +115,7 @@ warn_exit: void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, const char *function, unsigned int line, const char *msg) { - __ext4_warning(sb, function, line, msg); + __ext4_warning(sb, function, line, "%s", msg); __ext4_warning(sb, function, line, "MMP failure info: last update time: %llu, last update " "node: %s, last update device: %s\n", -- cgit v1.2.3-59-g8ed1b From e294a5371b2e0bd22d4a917d4c354a52a7057b6e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 15 Aug 2015 14:59:44 -0400 Subject: ext4: ratelimit the file system mounted message The xfstests ext4/305 will mount and unmount the same file system over 4,000 times, and each one of these will cause a system log message. Ratelimit this message since if we are getting more than a few dozen of these messages, they probably aren't going to be helpful. Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 70c52ea0a27c..45658c1bc324 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -60,6 +60,7 @@ static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; static struct ext4_features *ext4_feat; static int ext4_mballoc_ready; +static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -4277,9 +4278,10 @@ no_journal: "the device does not support discard"); } - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, - *sbi->s_es->s_mount_opts ? "; " : "", orig_data); + if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? "; " : "", orig_data); if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -5617,6 +5619,7 @@ static int __init ext4_init_fs(void) { int i, err; + ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; mutex_init(&ext4_li_mtx); -- cgit v1.2.3-59-g8ed1b From bdfe0cbd746aa9b2509c2f6d6be17193cf7facd7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 16 Aug 2015 10:03:57 -0400 Subject: Revert "ext4: remove block_device_ejected" This reverts commit 08439fec266c3cc5702953b4f54bdf5649357de0. Unfortunately we still need to test for bdi->dev to avoid a crash when a USB stick is yanked out while a file system is mounted: usb 2-2: USB disconnect, device number 2 Buffer I/O error on dev sdb1, logical block 15237120, lost sync page write JBD2: Error -5 detected when updating journal superblock for sdb1-8. BUG: unable to handle kernel paging request at 34beb000 IP: [] __percpu_counter_add+0x18/0xc0 *pdpt = 0000000023db9001 *pde = 0000000000000000 Oops: 0000 [#1] SMP CPU: 0 PID: 4083 Comm: umount Tainted: G U OE 4.1.1-040101-generic #201507011435 Hardware name: LENOVO 7675CTO/7675CTO, BIOS 7NETC2WW (2.22 ) 03/22/2011 task: ebf06b50 ti: ebebc000 task.ti: ebebc000 EIP: 0060:[] EFLAGS: 00010082 CPU: 0 EIP is at __percpu_counter_add+0x18/0xc0 EAX: f21c8e88 EBX: f21c8e88 ECX: 00000000 EDX: 00000001 ESI: 00000001 EDI: 00000000 EBP: ebebde60 ESP: ebebde40 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 CR0: 8005003b CR2: 34beb000 CR3: 33354200 CR4: 000007f0 Stack: c1abe100 edcb0098 edcb00ec ffffffff f21c8e68 ffffffff f21c8e68 f286d160 ebebde84 c1160454 00000010 00000282 f72a77f8 00000984 f72a77f8 f286d160 f286d170 ebebdea0 c11e613f 00000000 00000282 f72a77f8 edd7f4d0 00000000 Call Trace: [] account_page_dirtied+0x74/0x110 [] __set_page_dirty+0x3f/0xb0 [] mark_buffer_dirty+0x53/0xc0 [] ext4_commit_super+0x17b/0x250 [] ext4_put_super+0xc1/0x320 [] ? fsnotify_unmount_inodes+0x1aa/0x1c0 [] ? evict_inodes+0xca/0xe0 [] generic_shutdown_super+0x6a/0xe0 [] ? prepare_to_wait_event+0xd0/0xd0 [] ? unregister_shrinker+0x40/0x50 [] kill_block_super+0x26/0x70 [] deactivate_locked_super+0x45/0x80 [] deactivate_super+0x47/0x60 [] cleanup_mnt+0x39/0x80 [] __cleanup_mnt+0x10/0x20 [] task_work_run+0x91/0xd0 [] do_notify_resume+0x7c/0x90 [] work_notify Code: 8b 55 e8 e9 f4 fe ff ff 90 90 90 90 90 90 90 90 90 90 90 55 89 e5 83 ec 20 89 5d f4 89 c3 89 75 f8 89 d6 89 7d fc 89 cf 8b 48 14 <64> 8b 01 89 45 ec 89 c2 8b 45 08 c1 fa 1f 01 75 ec 89 55 f0 89 EIP: [] __percpu_counter_add+0x18/0xc0 SS:ESP 0068:ebebde40 CR2: 0000000034beb000 ---[ end trace dd564a7bea834ecd ]--- Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=101011 Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 45658c1bc324..bd3ff923dd59 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -326,6 +326,22 @@ static void save_error_info(struct super_block *sb, const char *func, ext4_commit_super(sb, 1); } +/* + * The del_gendisk() function uninitializes the disk-specific data + * structures, including the bdi structure, without telling anyone + * else. Once this happens, any attempt to call mark_buffer_dirty() + * (for example, by ext4_commit_super), will cause a kernel OOPS. + * This is a kludge to prevent these oops until we can put in a proper + * hook in del_gendisk() to inform the VFS and file system layers. + */ +static int block_device_ejected(struct super_block *sb) +{ + struct inode *bd_inode = sb->s_bdev->bd_inode; + struct backing_dev_info *bdi = inode_to_bdi(bd_inode); + + return bdi->dev == NULL; +} + static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; @@ -4621,7 +4637,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh) + if (!sbh || block_device_ejected(sb)) return error; if (buffer_write_io_error(sbh)) { /* -- cgit v1.2.3-59-g8ed1b