From d76a3a77113db020d9bb1e894822869410450bd9 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 3 Apr 2013 22:02:52 -0400
Subject: ext4/jbd2: don't wait (forever) for stale tid caused by wraparound

In the case where an inode has a very stale transaction id (tid) in
i_datasync_tid or i_sync_tid, it's possible that after a very large
(2**31) number of transactions, that the tid number space might wrap,
causing tid_geq()'s calculations to fail.

Commit deeeaf13 "jbd2: fix fsync() tid wraparound bug", later modified
by commit e7b04ac0 "jbd2: don't wake kjournald unnecessarily",
attempted to fix this problem, but it only avoided kjournald spinning
forever by fixing the logic in jbd2_log_start_commit().

Unfortunately, in the codepaths in fs/ext4/fsync.c and fs/ext4/inode.c
that might call jbd2_log_start_commit() with a stale tid, those
functions will subsequently call jbd2_log_wait_commit() with the same
stale tid, and then wait for a very long time.  To fix this, we
replace the calls to jbd2_log_start_commit() and
jbd2_log_wait_commit() with a call to a new function,
jbd2_complete_transaction(), which will correctly handle stale tid's.

As a bonus, jbd2_complete_transaction() will avoid locking
j_state_lock for writing unless a commit needs to be started.  This
should have a small (but probably not measurable) improvement for
ext4's scalability.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reported-by: Ben Hutchings <ben@decadent.org.uk>
Reported-by: George Barnett <gbarnett@atlassian.com>
Cc: stable@vger.kernel.org
---
 fs/jbd2/journal.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'fs/jbd2')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ed10991ab006..886ec2faa9b4 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -709,6 +709,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	return err;
 }
 
+/*
+ * When this function returns the transaction corresponding to tid
+ * will be completed.  If the transaction has currently running, start
+ * committing that transaction before waiting for it to complete.  If
+ * the transaction id is stale, it is by definition already completed,
+ * so just return SUCCESS.
+ */
+int jbd2_complete_transaction(journal_t *journal, tid_t tid)
+{
+	int	need_to_wait = 1;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == tid) {
+		if (journal->j_commit_request != tid) {
+			/* transaction not yet started, so request it */
+			read_unlock(&journal->j_state_lock);
+			jbd2_log_start_commit(journal, tid);
+			goto wait_commit;
+		}
+	} else if (!(journal->j_committing_transaction &&
+		     journal->j_committing_transaction->t_tid == tid))
+		need_to_wait = 0;
+	read_unlock(&journal->j_state_lock);
+	if (!need_to_wait)
+		return 0;
+wait_commit:
+	return jbd2_log_wait_commit(journal, tid);
+}
+EXPORT_SYMBOL(jbd2_complete_transaction);
+
 /*
  * Log buffer allocation routines:
  */
-- 
cgit v1.2.3-59-g8ed1b


From 794446c6946513c684d448205fbd76fa35f38b72 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Wed, 3 Apr 2013 22:06:52 -0400
Subject: jbd2: fix race between jbd2_journal_remove_checkpoint and
 ->j_commit_callback

The following race is possible:

[kjournald2]                              other_task
jbd2_journal_commit_transaction()
  j_state = T_FINISHED;
  spin_unlock(&journal->j_list_lock);
                                         ->jbd2_journal_remove_checkpoint()
					   ->jbd2_journal_free_transaction();
					     ->kmem_cache_free(transaction)
  ->j_commit_callback(journal, transaction);
    -> USE_AFTER_FREE

WARNING: at lib/list_debug.c:62 __list_del_entry+0x1c0/0x250()
Hardware name:
list_del corruption. prev->next should be ffff88019a4ec198, but was 6b6b6b6b6b6b6b6b
Modules linked in: cpufreq_ondemand acpi_cpufreq freq_table mperf coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode sg xhci_hcd button sd_mod crc_t10dif aesni_intel ablk_helper cryptd lrw aes_x86_64 xts gf128mul ahci libahci pata_acpi ata_generic dm_mirror dm_region_hash dm_log dm_mod
Pid: 16400, comm: jbd2/dm-1-8 Tainted: G        W    3.8.0-rc3+ #107
Call Trace:
 [<ffffffff8106fb0d>] warn_slowpath_common+0xad/0xf0
 [<ffffffff8106fc06>] warn_slowpath_fmt+0x46/0x50
 [<ffffffff813637e9>] ? ext4_journal_commit_callback+0x99/0xc0
 [<ffffffff8148cae0>] __list_del_entry+0x1c0/0x250
 [<ffffffff813637bf>] ext4_journal_commit_callback+0x6f/0xc0
 [<ffffffff813ca336>] jbd2_journal_commit_transaction+0x23a6/0x2570
 [<ffffffff8108aa42>] ? try_to_del_timer_sync+0x82/0xa0
 [<ffffffff8108b491>] ? del_timer_sync+0x91/0x1e0
 [<ffffffff813d3ecf>] kjournald2+0x19f/0x6a0
 [<ffffffff810ad630>] ? wake_up_bit+0x40/0x40
 [<ffffffff813d3d30>] ? bit_spin_lock+0x80/0x80
 [<ffffffff810ac6be>] kthread+0x10e/0x120
 [<ffffffff810ac5b0>] ? __init_kthread_worker+0x70/0x70
 [<ffffffff818ff6ac>] ret_from_fork+0x7c/0xb0
 [<ffffffff810ac5b0>] ? __init_kthread_worker+0x70/0x70

In order to demonstrace this issue one should mount ext4 with mount -o
discard option on SSD disk.  This makes callback longer and race
window becomes wider.

In order to fix this we should mark transaction as finished only after
callbacks have completed

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/jbd2/commit.c     | 50 ++++++++++++++++++++++++++++----------------------
 include/linux/jbd2.h |  1 +
 2 files changed, 29 insertions(+), 22 deletions(-)

(limited to 'fs/jbd2')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 750c70148eff..0f53946f13c1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int space_left = 0;
 	int first_tag = 0;
 	int tag_flag;
-	int i, to_free = 0;
+	int i;
 	int tag_bytes = journal_tag_bytes(journal);
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
@@ -1134,7 +1134,7 @@ restart_loop:
 	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
 	spin_unlock(&journal->j_history_lock);
 
-	commit_transaction->t_state = T_FINISHED;
+	commit_transaction->t_state = T_COMMIT_CALLBACK;
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
 	journal->j_commit_sequence = commit_transaction->t_tid;
 	journal->j_committing_transaction = NULL;
@@ -1149,38 +1149,44 @@ restart_loop:
 				journal->j_average_commit_time*3) / 4;
 	else
 		journal->j_average_commit_time = commit_time;
+
 	write_unlock(&journal->j_state_lock);
 
-	if (commit_transaction->t_checkpoint_list == NULL &&
-	    commit_transaction->t_checkpoint_io_list == NULL) {
-		__jbd2_journal_drop_transaction(journal, commit_transaction);
-		to_free = 1;
+	if (journal->j_checkpoint_transactions == NULL) {
+		journal->j_checkpoint_transactions = commit_transaction;
+		commit_transaction->t_cpnext = commit_transaction;
+		commit_transaction->t_cpprev = commit_transaction;
 	} else {
-		if (journal->j_checkpoint_transactions == NULL) {
-			journal->j_checkpoint_transactions = commit_transaction;
-			commit_transaction->t_cpnext = commit_transaction;
-			commit_transaction->t_cpprev = commit_transaction;
-		} else {
-			commit_transaction->t_cpnext =
-				journal->j_checkpoint_transactions;
-			commit_transaction->t_cpprev =
-				commit_transaction->t_cpnext->t_cpprev;
-			commit_transaction->t_cpnext->t_cpprev =
-				commit_transaction;
-			commit_transaction->t_cpprev->t_cpnext =
+		commit_transaction->t_cpnext =
+			journal->j_checkpoint_transactions;
+		commit_transaction->t_cpprev =
+			commit_transaction->t_cpnext->t_cpprev;
+		commit_transaction->t_cpnext->t_cpprev =
+			commit_transaction;
+		commit_transaction->t_cpprev->t_cpnext =
 				commit_transaction;
-		}
 	}
 	spin_unlock(&journal->j_list_lock);
-
+	/* Drop all spin_locks because commit_callback may be block.
+	 * __journal_remove_checkpoint() can not destroy transaction
+	 * under us because it is not marked as T_FINISHED yet */
 	if (journal->j_commit_callback)
 		journal->j_commit_callback(journal, commit_transaction);
 
 	trace_jbd2_end_commit(journal, commit_transaction);
 	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
-	if (to_free)
-		jbd2_journal_free_transaction(commit_transaction);
 
+	write_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
+	commit_transaction->t_state = T_FINISHED;
+	/* Recheck checkpoint lists after j_list_lock was dropped */
+	if (commit_transaction->t_checkpoint_list == NULL &&
+	    commit_transaction->t_checkpoint_io_list == NULL) {
+		__jbd2_journal_drop_transaction(journal, commit_transaction);
+		jbd2_journal_free_transaction(commit_transaction);
+	}
+	spin_unlock(&journal->j_list_lock);
+	write_unlock(&journal->j_state_lock);
 	wake_up(&journal->j_wait_done_commit);
 }
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f0289754b464..f9fe88957b7a 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -480,6 +480,7 @@ struct transaction_s
 		T_COMMIT,
 		T_COMMIT_DFLUSH,
 		T_COMMIT_JFLUSH,
+		T_COMMIT_CALLBACK,
 		T_FINISHED
 	}			t_state;
 
-- 
cgit v1.2.3-59-g8ed1b


From 28daf4fae8693d4a285123494899fe01950cba50 Mon Sep 17 00:00:00 2001
From: Zheng Liu <wenqing.lz@taobao.com>
Date: Fri, 19 Apr 2013 17:49:23 -0400
Subject: jbd2: use kmem_cache_zalloc instead of kmem_cache_alloc/memset

The jbd2_alloc_handle() function is only called by new_handle().  So
this commit uses kmem_cache_zalloc() instead of
kmem_cache_alloc()/memset().

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/transaction.c | 1 -
 include/linux/jbd2.h  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/jbd2')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 325bc019ed88..a1920da22802 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks)
 	handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
 	if (!handle)
 		return NULL;
-	memset(handle, 0, sizeof(*handle));
 	handle->h_buffer_credits = nblocks;
 	handle->h_ref = 1;
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f9fe88957b7a..6e051f472edb 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1145,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache;
 
 static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags)
 {
-	return kmem_cache_alloc(jbd2_handle_cache, gfp_flags);
+	return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags);
 }
 
 static inline void jbd2_free_handle(handle_t *handle)
-- 
cgit v1.2.3-59-g8ed1b


From f783f091e49ce4896e6b026af82d76e0537c6089 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 21 Apr 2013 16:47:54 -0400
Subject: jbd2: trace when lock_buffer in do_get_write_access takes a long time

While investigating interactivity problems it was clear that processes
sometimes stall for long periods of times if an attempt is made to
lock a buffer which is undergoing writeback.  It would stall in
a trace looking something like

[<ffffffff811a39de>] __lock_buffer+0x2e/0x30
[<ffffffff8123a60f>] do_get_write_access+0x43f/0x4b0
[<ffffffff8123a7cb>] jbd2_journal_get_write_access+0x2b/0x50
[<ffffffff81220f79>] __ext4_journal_get_write_access+0x39/0x80
[<ffffffff811f3198>] ext4_reserve_inode_write+0x78/0xa0
[<ffffffff811f3209>] ext4_mark_inode_dirty+0x49/0x220
[<ffffffff811f57d1>] ext4_dirty_inode+0x41/0x60
[<ffffffff8119ac3e>] __mark_inode_dirty+0x4e/0x2d0
[<ffffffff8118b9b9>] update_time+0x79/0xc0
[<ffffffff8118ba98>] file_update_time+0x98/0x100
[<ffffffff81110ffc>] __generic_file_aio_write+0x17c/0x3b0
[<ffffffff811112aa>] generic_file_aio_write+0x7a/0xf0
[<ffffffff811ea853>] ext4_file_write+0x83/0xd0
[<ffffffff81172b23>] do_sync_write+0xa3/0xe0
[<ffffffff811731ae>] vfs_write+0xae/0x180
[<ffffffff8117361d>] sys_write+0x4d/0x90
[<ffffffff8159d62d>] system_call_fastpath+0x1a/0x1f
[<ffffffffffffffff>] 0xffffffffffffffff

Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/transaction.c       |  8 ++++++++
 include/trace/events/jbd2.h | 21 +++++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'fs/jbd2')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a1920da22802..10f524c59ea8 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -639,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 	int error;
 	char *frozen_buffer = NULL;
 	int need_copy = 0;
+	unsigned long start_lock, time_lock;
 
 	if (is_handle_aborted(handle))
 		return -EROFS;
@@ -654,9 +655,16 @@ repeat:
 
 	/* @@@ Need to check for errors here at some point. */
 
+ 	start_lock = jiffies;
 	lock_buffer(bh);
 	jbd_lock_bh_state(bh);
 
+	/* If it takes too long to lock the buffer, trace it */
+	time_lock = jbd2_time_diff(start_lock, jiffies);
+	if (time_lock > HZ/10)
+		trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
+			jiffies_to_msecs(time_lock));
+
 	/* We now hold the buffer lock so it is safe to query the buffer
 	 * state.  Is the buffer dirty?
 	 *
diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
index 070df49e4a1d..c1d1f3eb242d 100644
--- a/include/trace/events/jbd2.h
+++ b/include/trace/events/jbd2.h
@@ -358,6 +358,27 @@ TRACE_EVENT(jbd2_write_superblock,
 		  MINOR(__entry->dev), __entry->write_op)
 );
 
+TRACE_EVENT(jbd2_lock_buffer_stall,
+
+	TP_PROTO(dev_t dev, unsigned long stall_ms),
+
+	TP_ARGS(dev, stall_ms),
+
+	TP_STRUCT__entry(
+		__field(        dev_t, dev	)
+		__field(unsigned long, stall_ms	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= dev;
+		__entry->stall_ms	= stall_ms;
+	),
+
+	TP_printk("dev %d,%d stall_ms %lu",
+		MAJOR(__entry->dev), MINOR(__entry->dev),
+		__entry->stall_ms)
+);
+
 #endif /* _TRACE_JBD2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3-59-g8ed1b