From 03a9e24ef2aaa5f1f9837356aed79c860521407a Mon Sep 17 00:00:00 2001
From: "colyli@suse.de" <colyli@suse.de>
Date: Sat, 28 Jan 2017 21:11:49 +0800
Subject: md linear: fix a race between linear_add() and linear_congested()

Recently I receive a bug report that on Linux v3.0 based kerenl, hot add
disk to a md linear device causes kernel crash at linear_congested(). From
the crash image analysis, I find in linear_congested(), mddev->raid_disks
contains value N, but conf->disks[] only has N-1 pointers available. Then
a NULL pointer deference crashes the kernel.

There is a race between linear_add() and linear_congested(), RCU stuffs
used in these two functions cannot avoid the race. Since Linuv v4.0
RCU code is replaced by introducing mddev_suspend().  After checking the
upstream code, it seems linear_congested() is not called in
generic_make_request() code patch, so mddev_suspend() cannot provent it
from being called. The possible race still exists.

Here I explain how the race still exists in current code.  For a machine
has many CPUs, on one CPU, linear_add() is called to add a hard disk to a
md linear device; at the same time on other CPU, linear_congested() is
called to detect whether this md linear device is congested before issuing
an I/O request onto it.

Now I use a possible code execution time sequence to demo how the possible
race happens,

seq    linear_add()                linear_congested()
 0                                 conf=mddev->private
 1   oldconf=mddev->private
 2   mddev->raid_disks++
 3                              for (i=0; i<mddev->raid_disks;i++)
 4                                bdev_get_queue(conf->disks[i].rdev->bdev)
 5   mddev->private=newconf

In linear_add() mddev->raid_disks is increased in time seq 2, and on
another CPU in linear_congested() the for-loop iterates conf->disks[i] by
the increased mddev->raid_disks in time seq 3,4. But conf with one more
element (which is a pointer to struct dev_info type) to conf->disks[] is
not updated yet, accessing its structure member in time seq 4 will cause a
NULL pointer deference fault.

To fix this race, there are 2 parts of modification in the patch,
 1) Add 'int raid_disks' in struct linear_conf, as a copy of
    mddev->raid_disks. It is initialized in linear_conf(), always being
    consistent with pointers number of 'struct dev_info disks[]'. When
    iterating conf->disks[] in linear_congested(), use conf->raid_disks to
    replace mddev->raid_disks in the for-loop, then NULL pointer deference
    will not happen again.
 2) RCU stuffs are back again, and use kfree_rcu() in linear_add() to
    free oldconf memory. Because oldconf may be referenced as mddev->private
    in linear_congested(), kfree_rcu() makes sure that its memory will not
    be released until no one uses it any more.
Also some code comments are added in this patch, to make this modification
to be easier understandable.

This patch can be applied for kernels since v4.0 after commit:
3be260cc18f8 ("md/linear: remove rcu protections in favour of
suspend/resume"). But this bug is reported on Linux v3.0 based kernel, for
people who maintain kernels before Linux v4.0, they need to do some back
back port to this patch.

Changelog:
 - V3: add 'int raid_disks' in struct linear_conf, and use kfree_rcu() to
       replace rcu_call() in linear_add().
 - v2: add RCU stuffs by suggestion from Shaohua and Neil.
 - v1: initial effort.

Signed-off-by: Coly Li <colyli@suse.de>
Cc: Shaohua Li <shli@fb.com>
Cc: Neil Brown <neilb@suse.com>
Cc: stable@vger.kernel.org
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/linear.c | 39 ++++++++++++++++++++++++++++++++++-----
 drivers/md/linear.h |  1 +
 2 files changed, 35 insertions(+), 5 deletions(-)

(limited to 'drivers/md')
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5975c9915684..26a73b2002cf 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -53,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
 	return conf->disks + lo;
 }
 
+/*
+ * In linear_congested() conf->raid_disks is used as a copy of
+ * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks
+ * and conf->disks[] are created in linear_conf(), they are always
+ * consitent with each other, but mddev->raid_disks does not.
+ */
 static int linear_congested(struct mddev *mddev, int bits)
 {
 	struct linear_conf *conf;
 	int i, ret = 0;
 
-	conf = mddev->private;
+	rcu_read_lock();
+	conf = rcu_dereference(mddev->private);
 
-	for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+	for (i = 0; i < conf->raid_disks && !ret ; i++) {
 		struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
 		ret |= bdi_congested(&q->backing_dev_info, bits);
 	}
 
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -144,6 +152,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 			conf->disks[i-1].end_sector +
 			conf->disks[i].rdev->sectors;
 
+	/*
+	 * conf->raid_disks is copy of mddev->raid_disks. The reason to
+	 * keep a copy of mddev->raid_disks in struct linear_conf is,
+	 * mddev->raid_disks may not be consistent with pointers number of
+	 * conf->disks[] when it is updated in linear_add() and used to
+	 * iterate old conf->disks[] earray in linear_congested().
+	 * Here conf->raid_disks is always consitent with number of
+	 * pointers in conf->disks[] array, and mddev->private is updated
+	 * with rcu_assign_pointer() in linear_addr(), such race can be
+	 * avoided.
+	 */
+	conf->raid_disks = raid_disks;
+
 	return conf;
 
 out:
@@ -196,15 +217,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
 	if (!newconf)
 		return -ENOMEM;
 
+	/* newconf->raid_disks already keeps a copy of * the increased
+	 * value of mddev->raid_disks, WARN_ONCE() is just used to make
+	 * sure of this. It is possible that oldconf is still referenced
+	 * in linear_congested(), therefore kfree_rcu() is used to free
+	 * oldconf until no one uses it anymore.
+	 */
 	mddev_suspend(mddev);
-	oldconf = mddev->private;
+	oldconf = rcu_dereference(mddev->private);
 	mddev->raid_disks++;
-	mddev->private = newconf;
+	WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
+		"copied raid_disks doesn't match mddev->raid_disks");
+	rcu_assign_pointer(mddev->private, newconf);
 	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev_resume(mddev);
 	revalidate_disk(mddev->gendisk);
-	kfree(oldconf);
+	kfree_rcu(oldconf, rcu);
 	return 0;
 }
 
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index b685ddd7d7f7..8d392e6098b3 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -10,6 +10,7 @@ struct linear_conf
 {
 	struct rcu_head		rcu;
 	sector_t		array_sectors;
+	int			raid_disks; /* a copy of mddev->raid_disks */
 	struct dev_info		disks[0];
 };
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 765d704db1f583630d52dc14c1ea573db6783459 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 4 Jan 2017 09:33:23 -0800
Subject: raid5: only dispatch IO from raid5d for harddisk raid

We made raid5 stripe handling multi-thread before. It works well for
SSD. But for harddisk, the multi-threading creates more disk seek, so
not always improve performance. For several hard disks based raid5,
multi-threading is required as raid5d becames a bottleneck especially
for sequential write.

To overcome the disk seek issue, we only dispatch IO from raid5d if the
array is harddisk based. Other threads can still handle stripes, but
can't dispatch IO.

Idealy, we should control IO dispatching order according to IO position
interrnally. Right now we still depend on block layer, which isn't very
efficient sometimes though.

My setup has 9 harddisks, each disk can do around 180M/s sequential
write. So in theory, the raid5 can do 180 * 8 = 1440M/s sequential
write. The test machine uses an ATOM CPU. I measure sequential write
with large iodepth bandwidth to raid array:

without patch: ~600M/s
without patch and group_thread_cnt=4: 750M/s
with patch and group_thread_cnt=4: 950M/s
with patch, group_thread_cnt=4, skip_copy=1: 1150M/s

We are pretty close to the maximum bandwidth in the large iodepth
iodepth case. The performance gap of small iodepth sequential write
between software raid and theory value is still very big though, because
we don't have an efficient pipeline.

Cc: NeilBrown <neilb@suse.com>
Cc: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/raid5.h |  4 ++++
 2 files changed, 57 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3c7e106c12a2..9d744a8961d1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -863,6 +863,43 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
 	return 1;
 }
 
+static void flush_deferred_bios(struct r5conf *conf)
+{
+	struct bio_list tmp;
+	struct bio *bio;
+
+	if (!conf->batch_bio_dispatch || !conf->group_cnt)
+		return;
+
+	bio_list_init(&tmp);
+	spin_lock(&conf->pending_bios_lock);
+	bio_list_merge(&tmp, &conf->pending_bios);
+	bio_list_init(&conf->pending_bios);
+	spin_unlock(&conf->pending_bios_lock);
+
+	while ((bio = bio_list_pop(&tmp)))
+		generic_make_request(bio);
+}
+
+static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
+{
+	/*
+	 * change group_cnt will drain all bios, so this is safe
+	 *
+	 * A read generally means a read-modify-write, which usually means a
+	 * randwrite, so we don't delay it
+	 */
+	if (!conf->batch_bio_dispatch || !conf->group_cnt ||
+	    bio_op(bio) == REQ_OP_READ) {
+		generic_make_request(bio);
+		return;
+	}
+	spin_lock(&conf->pending_bios_lock);
+	bio_list_add(&conf->pending_bios, bio);
+	spin_unlock(&conf->pending_bios_lock);
+	md_wakeup_thread(conf->mddev->thread);
+}
+
 static void
 raid5_end_read_request(struct bio *bi);
 static void
@@ -1043,7 +1080,7 @@ again:
 				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
 						      bi, disk_devt(conf->mddev->gendisk),
 						      sh->dev[i].sector);
-			generic_make_request(bi);
+			defer_bio_issue(conf, bi);
 		}
 		if (rrdev) {
 			if (s->syncing || s->expanding || s->expanded
@@ -1088,7 +1125,7 @@ again:
 				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
 						      rbi, disk_devt(conf->mddev->gendisk),
 						      sh->dev[i].sector);
-			generic_make_request(rbi);
+			defer_bio_issue(conf, rbi);
 		}
 		if (!rdev && !rrdev) {
 			if (op_is_write(op))
@@ -6126,6 +6163,8 @@ static void raid5d(struct md_thread *thread)
 		mutex_unlock(&conf->cache_size_mutex);
 	}
 
+	flush_deferred_bios(conf);
+
 	r5l_flush_stripe_to_raid(conf->log);
 
 	async_tx_issue_pending_all();
@@ -6711,6 +6750,18 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
 	atomic_set(&conf->active_aligned_reads, 0);
+	bio_list_init(&conf->pending_bios);
+	spin_lock_init(&conf->pending_bios_lock);
+	conf->batch_bio_dispatch = true;
+	rdev_for_each(rdev, mddev) {
+		if (test_bit(Journal, &rdev->flags))
+			continue;
+		if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
+			conf->batch_bio_dispatch = false;
+			break;
+		}
+	}
+
 	conf->bypass_threshold = BYPASS_THRESHOLD;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 1440fa26e296..ebb89bda88f1 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -684,6 +684,10 @@ struct r5conf {
 	int			group_cnt;
 	int			worker_cnt_per_group;
 	struct r5l_log		*log;
+
+	struct bio_list		pending_bios;
+	spinlock_t		pending_bios_lock;
+	bool			batch_bio_dispatch;
 };
 
 
-- 
cgit v1.2.3-59-g8ed1b


From 03b047f45c29dff02f913a0234ca0cc1ca51966f Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 11 Jan 2017 13:39:14 -0800
Subject: md/r5cache: enable chunk_aligned_read with write back cache

Chunk aligned read significantly reduces CPU usage of raid456.
However, it is not safe to fully bypass the write back cache.
This patch enables chunk aligned read with write back cache.

For chunk aligned read, we track stripes in write back cache at
a bigger granularity, "big_stripe". Each chunk may contain more
than one stripe (for example, a 256kB chunk contains 64 4kB-page,
so this chunk contain 64 stripes). For chunk_aligned_read, these
stripes are grouped into one big_stripe, so we only need one lookup
for the whole chunk.

For each big_stripe, struct big_stripe_info tracks how many stripes
of this big_stripe are in the write back cache. We count how many
stripes of this big_stripe are in the write back cache. These
counters are tracked in a radix tree (big_stripe_tree).
r5c_tree_index() is used to calculate keys for the radix tree.

chunk_aligned_read() calls r5c_big_stripe_cached() to look up
big_stripe of each chunk in the tree. If this big_stripe is in the
tree, chunk_aligned_read() aborts. This look up is protected by
rcu_read_lock().

It is necessary to remember whether a stripe is counted in
big_stripe_tree. Instead of adding new flag, we reuses existing flags:
STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
two flags are set, the stripe is counted in big_stripe_tree. This
requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
r5c_try_caching_write(); and moving clear_bit of
STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
r5c_finish_stripe_write_out().

Signed-off-by: Song Liu <songliubraving@fb.com>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 171 ++++++++++++++++++++++++++++++++++++++++++-----
 drivers/md/raid5.c       |  20 ++++--
 drivers/md/raid5.h       |   1 +
 3 files changed, 168 insertions(+), 24 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 302dea3296ba..76c0e5063f1b 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -20,6 +20,7 @@
 #include <linux/crc32c.h>
 #include <linux/random.h>
 #include <linux/kthread.h>
+#include <linux/types.h>
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
@@ -164,8 +165,59 @@ struct r5l_log {
 	struct work_struct deferred_io_work;
 	/* to disable write back during in degraded mode */
 	struct work_struct disable_writeback_work;
+
+	/* to for chunk_aligned_read in writeback mode, details below */
+	spinlock_t tree_lock;
+	struct radix_tree_root big_stripe_tree;
 };
 
+/*
+ * Enable chunk_aligned_read() with write back cache.
+ *
+ * Each chunk may contain more than one stripe (for example, a 256kB
+ * chunk contains 64 4kB-page, so this chunk contain 64 stripes). For
+ * chunk_aligned_read, these stripes are grouped into one "big_stripe".
+ * For each big_stripe, we count how many stripes of this big_stripe
+ * are in the write back cache. These data are tracked in a radix tree
+ * (big_stripe_tree). We use radix_tree item pointer as the counter.
+ * r5c_tree_index() is used to calculate keys for the radix tree.
+ *
+ * chunk_aligned_read() calls r5c_big_stripe_cached() to look up
+ * big_stripe of each chunk in the tree. If this big_stripe is in the
+ * tree, chunk_aligned_read() aborts. This look up is protected by
+ * rcu_read_lock().
+ *
+ * It is necessary to remember whether a stripe is counted in
+ * big_stripe_tree. Instead of adding new flag, we reuses existing flags:
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
+ * two flags are set, the stripe is counted in big_stripe_tree. This
+ * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
+ * r5c_try_caching_write(); and moving clear_bit of
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
+ * r5c_finish_stripe_write_out().
+ */
+
+/*
+ * radix tree requests lowest 2 bits of data pointer to be 2b'00.
+ * So it is necessary to left shift the counter by 2 bits before using it
+ * as data pointer of the tree.
+ */
+#define R5C_RADIX_COUNT_SHIFT 2
+
+/*
+ * calculate key for big_stripe_tree
+ *
+ * sect: align_bi->bi_iter.bi_sector or sh->sector
+ */
+static inline sector_t r5c_tree_index(struct r5conf *conf,
+				      sector_t sect)
+{
+	sector_t offset;
+
+	offset = sector_div(sect, conf->chunk_sectors);
+	return sect;
+}
+
 /*
  * an IO range starts from a meta data block and end at the next meta data
  * block. The io unit's the meta data block tracks data/parity followed it. io
@@ -412,16 +464,6 @@ void r5c_make_stripe_write_out(struct stripe_head *sh)
 
 	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 		atomic_inc(&conf->preread_active_stripes);
-
-	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
-		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
-		atomic_dec(&conf->r5c_cached_partial_stripes);
-	}
-
-	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
-		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
-		atomic_dec(&conf->r5c_cached_full_stripes);
-	}
 }
 
 static void r5c_handle_data_cached(struct stripe_head *sh)
@@ -2320,6 +2362,10 @@ int r5c_try_caching_write(struct r5conf *conf,
 	int i;
 	struct r5dev *dev;
 	int to_cache = 0;
+	void **pslot;
+	sector_t tree_index;
+	int ret;
+	uintptr_t refcount;
 
 	BUG_ON(!r5c_is_writeback(log));
 
@@ -2364,6 +2410,44 @@ int r5c_try_caching_write(struct r5conf *conf,
 		}
 	}
 
+	/* if the stripe is not counted in big_stripe_tree, add it now */
+	if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+	    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+		tree_index = r5c_tree_index(conf, sh->sector);
+		spin_lock(&log->tree_lock);
+		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+					       tree_index);
+		if (pslot) {
+			refcount = (uintptr_t)radix_tree_deref_slot_protected(
+				pslot, &log->tree_lock) >>
+				R5C_RADIX_COUNT_SHIFT;
+			radix_tree_replace_slot(
+				&log->big_stripe_tree, pslot,
+				(void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
+		} else {
+			/*
+			 * this radix_tree_insert can fail safely, so no
+			 * need to call radix_tree_preload()
+			 */
+			ret = radix_tree_insert(
+				&log->big_stripe_tree, tree_index,
+				(void *)(1 << R5C_RADIX_COUNT_SHIFT));
+			if (ret) {
+				spin_unlock(&log->tree_lock);
+				r5c_make_stripe_write_out(sh);
+				return -EAGAIN;
+			}
+		}
+		spin_unlock(&log->tree_lock);
+
+		/*
+		 * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is
+		 * counted in the radix tree
+		 */
+		set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+		atomic_inc(&conf->r5c_cached_partial_stripes);
+	}
+
 	for (i = disks; i--; ) {
 		dev = &sh->dev[i];
 		if (dev->towrite) {
@@ -2438,17 +2522,20 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 				 struct stripe_head *sh,
 				 struct stripe_head_state *s)
 {
+	struct r5l_log *log = conf->log;
 	int i;
 	int do_wakeup = 0;
+	sector_t tree_index;
+	void **pslot;
+	uintptr_t refcount;
 
-	if (!conf->log ||
-	    !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
+	if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
 		return;
 
 	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
 	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
 
-	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
 		return;
 
 	for (i = sh->disks; i--; ) {
@@ -2470,12 +2557,43 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 	if (do_wakeup)
 		wake_up(&conf->wait_for_overlap);
 
-	spin_lock_irq(&conf->log->stripe_in_journal_lock);
+	spin_lock_irq(&log->stripe_in_journal_lock);
 	list_del_init(&sh->r5c);
-	spin_unlock_irq(&conf->log->stripe_in_journal_lock);
+	spin_unlock_irq(&log->stripe_in_journal_lock);
 	sh->log_start = MaxSector;
-	atomic_dec(&conf->log->stripe_in_journal_count);
-	r5c_update_log_state(conf->log);
+
+	atomic_dec(&log->stripe_in_journal_count);
+	r5c_update_log_state(log);
+
+	/* stop counting this stripe in big_stripe_tree */
+	if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
+	    test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+		tree_index = r5c_tree_index(conf, sh->sector);
+		spin_lock(&log->tree_lock);
+		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+					       tree_index);
+		BUG_ON(pslot == NULL);
+		refcount = (uintptr_t)radix_tree_deref_slot_protected(
+			pslot, &log->tree_lock) >>
+			R5C_RADIX_COUNT_SHIFT;
+		if (refcount == 1)
+			radix_tree_delete(&log->big_stripe_tree, tree_index);
+		else
+			radix_tree_replace_slot(
+				&log->big_stripe_tree, pslot,
+				(void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
+		spin_unlock(&log->tree_lock);
+	}
+
+	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
+		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+		atomic_dec(&conf->r5c_cached_partial_stripes);
+	}
+
+	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
+		atomic_dec(&conf->r5c_cached_full_stripes);
+	}
 }
 
 int
@@ -2535,6 +2653,22 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
 	return 0;
 }
 
+/* check whether this big stripe is in write back cache. */
+bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
+{
+	struct r5l_log *log = conf->log;
+	sector_t tree_index;
+	void *slot;
+
+	if (!log)
+		return false;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	tree_index = r5c_tree_index(conf, sect);
+	slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
+	return slot != NULL;
+}
+
 static int r5l_load_log(struct r5l_log *log)
 {
 	struct md_rdev *rdev = log->rdev;
@@ -2681,6 +2815,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->meta_pool)
 		goto out_mempool;
 
+	spin_lock_init(&log->tree_lock);
+	INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
+
 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9d744a8961d1..b62f671a93ab 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -281,13 +281,13 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 						atomic_dec(&conf->r5c_cached_partial_stripes);
 					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
 					r5c_check_cached_full_stripe(conf);
-				} else {
-					/* partial stripe */
-					if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
-							      &sh->state))
-						atomic_inc(&conf->r5c_cached_partial_stripes);
+				} else
+					/*
+					 * STRIPE_R5C_PARTIAL_STRIPE is set in
+					 * r5c_try_caching_write(). No need to
+					 * set it again.
+					 */
 					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
-				}
 			}
 		}
 	}
@@ -5062,6 +5062,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 		      rdev->recovery_offset >= end_sector)))
 			rdev = NULL;
 	}
+
+	if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
+		rcu_read_unlock();
+		bio_put(align_bi);
+		return 0;
+	}
+
 	if (rdev) {
 		sector_t first_bad;
 		int bad_sectors;
@@ -5418,7 +5425,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	 * data on failed drives.
 	 */
 	if (rw == READ && mddev->degraded == 0 &&
-	    !r5c_is_writeback(conf->log) &&
 	    mddev->reshape_position == MaxSector) {
 		bi = chunk_aligned_read(mddev, bi);
 		if (!bi)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ebb89bda88f1..c0687df5ba06 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -792,4 +792,5 @@ extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
 extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 39b99586b321a9cbd4fe9abb5ea9346d2c4ca9c6 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Tue, 24 Jan 2017 14:08:23 -0800
Subject: md/r5cache: improve journal device efficiency

It is important to be able to flush all stripes in raid5-cache.
Therefore, we need reserve some space on the journal device for
these flushes. If flush operation includes pending writes to the
stripe, we need to reserve (conf->raid_disk + 1) pages per stripe
for the flush out. This reduces the efficiency of journal space.
If we exclude these pending writes from flush operation, we only
need (conf->max_degraded + 1) pages per stripe.

With this patch, when log space is critical (R5C_LOG_CRITICAL=1),
pending writes will be excluded from stripe flush out. Therefore,
we can reduce reserved space for flush out and thus improve journal
device efficiency.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 36 +++++++++++++++++++++++++-----------
 drivers/md/raid5.c       | 42 +++++++++++++++++++++++++++++++++---------
 2 files changed, 58 insertions(+), 20 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 76c0e5063f1b..caae85331a02 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -389,17 +389,30 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
 /*
  * Total log space (in sectors) needed to flush all data in cache
  *
- * Currently, writing-out phase automatically includes all pending writes
- * to the same sector. So the reclaim of each stripe takes up to
- * (conf->raid_disks + 1) pages of log space.
+ * To avoid deadlock due to log space, it is necessary to reserve log
+ * space to flush critical stripes (stripes that occupying log space near
+ * last_checkpoint). This function helps check how much log space is
+ * required to flush all cached stripes.
  *
- * To totally avoid deadlock due to log space, the code reserves
- * (conf->raid_disks + 1) pages for each stripe in cache, which is not
- * necessary in most cases.
+ * To reduce log space requirements, two mechanisms are used to give cache
+ * flush higher priorities:
+ *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
+ *       stripes ALREADY in journal can be flushed w/o pending writes;
+ *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
+ *       can be delayed (r5l_add_no_space_stripe).
  *
- * To improve this, we will need writing-out phase to be able to NOT include
- * pending writes, which will reduce the requirement to
- * (conf->max_degraded + 1) pages per stripe in cache.
+ * In cache flush, the stripe goes through 1 and then 2. For a stripe that
+ * already passed 1, flushing it requires at most (conf->max_degraded + 1)
+ * pages of journal space. For stripes that has not passed 1, flushing it
+ * requires (conf->raid_disks + 1) pages of journal space. There are at
+ * most (conf->group_cnt + 1) stripe that passed 1. So total journal space
+ * required to flush all cached stripes (in pages) is:
+ *
+ *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
+ *     (group_cnt + 1) * (raid_disks + 1)
+ * or
+ *     (stripe_in_journal_count) * (max_degraded + 1) +
+ *     (group_cnt + 1) * (raid_disks - max_degraded)
  */
 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
 {
@@ -408,8 +421,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
 	if (!r5c_is_writeback(log))
 		return 0;
 
-	return BLOCK_SECTORS * (conf->raid_disks + 1) *
-		atomic_read(&log->stripe_in_journal_count);
+	return BLOCK_SECTORS *
+		((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
+		 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
 }
 
 /*
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b62f671a93ab..b0d1345c832c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2951,12 +2951,36 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
  *      like to flush data in journal to RAID disks first, so complex rmw
  *      is handled in the write patch (handle_stripe_dirtying).
  *
+ *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
+ *
+ *      It is important to be able to flush all stripes in raid5-cache.
+ *      Therefore, we need reserve some space on the journal device for
+ *      these flushes. If flush operation includes pending writes to the
+ *      stripe, we need to reserve (conf->raid_disk + 1) pages per stripe
+ *      for the flush out. If we exclude these pending writes from flush
+ *      operation, we only need (conf->max_degraded + 1) pages per stripe.
+ *      Therefore, excluding pending writes in these cases enables more
+ *      efficient use of the journal device.
+ *
+ *      Note: To make sure the stripe makes progress, we only delay
+ *      towrite for stripes with data already in journal (injournal > 0).
+ *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
+ *      no_space_stripes list.
+ *
  */
-static inline bool delay_towrite(struct r5dev *dev,
-				   struct stripe_head_state *s)
+static inline bool delay_towrite(struct r5conf *conf,
+				 struct r5dev *dev,
+				 struct stripe_head_state *s)
 {
-	return !test_bit(R5_OVERWRITE, &dev->flags) &&
-		!test_bit(R5_Insync, &dev->flags) && s->injournal;
+	/* case 1 above */
+	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+	    !test_bit(R5_Insync, &dev->flags) && s->injournal)
+		return true;
+	/* case 2 above */
+	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
+	    s->injournal > 0)
+		return true;
+	return false;
 }
 
 static void
@@ -2979,7 +3003,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 
-			if (dev->towrite && !delay_towrite(dev, s)) {
+			if (dev->towrite && !delay_towrite(conf, dev, s)) {
 				set_bit(R5_LOCKED, &dev->flags);
 				set_bit(R5_Wantdrain, &dev->flags);
 				if (!expand)
@@ -3731,7 +3755,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if (((dev->towrite && !delay_towrite(dev, s)) ||
+		if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
 		     i == sh->pd_idx || i == sh->qd_idx ||
 		     test_bit(R5_InJournal, &dev->flags)) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
@@ -3755,8 +3779,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 		}
 	}
 
-	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
-		(unsigned long long)sh->sector, rmw, rcw);
+	pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
+		 (unsigned long long)sh->sector, sh->state, rmw, rcw);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
 		/* prefer read-modify-write, but need to get some data */
@@ -3796,7 +3820,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (((dev->towrite && !delay_towrite(dev, s)) ||
+			if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
 			     i == sh->pd_idx || i == sh->qd_idx ||
 			     test_bit(R5_InJournal, &dev->flags)) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-- 
cgit v1.2.3-59-g8ed1b


From 9356863c9409efc79029c01a85d015efae977e69 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 6 Feb 2017 13:41:39 +1100
Subject: md: ensure md devices are freed before module is unloaded.

Commit: cbd199837750 ("md: Fix unfortunate interaction with evms")
change mddev_put() so that it would not destroy an md device while
->ctime was non-zero.

Unfortunately, we didn't make sure to clear ->ctime when unloading
the module, so it is possible for an md device to remain after
module unload.  An attempt to open such a device will trigger
an invalid memory reference in:
  get_gendisk -> kobj_lookup -> exact_lock -> get_disk

when tring to access disk->fops, which was in the module that has
been removed.

So ensure we clear ->ctime in md_exit(), and explain how that is useful,
as it isn't immediately obvious when looking at the code.

Fixes: cbd199837750 ("md: Fix unfortunate interaction with evms")
Tested-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 01175dac0db6..13020e5c8cc0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8980,7 +8980,14 @@ static __exit void md_exit(void)
 
 	for_each_mddev(mddev, tmp) {
 		export_array(mddev);
+		mddev->ctime = 0;
 		mddev->hold_active = 0;
+		/*
+		 * for_each_mddev() will call mddev_put() at the end of each
+		 * iteration.  As the mddev is now fully clear, this will
+		 * schedule the mddev for destruction by a workqueue, and the
+		 * destroy_workqueue() below will wait for that to complete.
+		 */
 	}
 	destroy_workqueue(md_misc_wq);
 	destroy_workqueue(md_wq);
-- 
cgit v1.2.3-59-g8ed1b


From e8fd52eec2cd25b917983b3f3aa738b722522376 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 10 Feb 2017 16:18:08 -0800
Subject: md/raid5-cache: stripe reclaim only counts valid stripes

When log space is tight, we try to reclaim stripes from log head. There
are stripes which can't be reclaimed right now if some conditions are
met. We skip such stripes but accidentally count them, which might cause
no stripes are claimed. Fixing this by only counting valid stripes.

Cc: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index caae85331a02..a01f4daeb390 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1418,9 +1418,9 @@ static void r5c_do_reclaim(struct r5conf *conf)
 			    !test_bit(STRIPE_HANDLE, &sh->state) &&
 			    atomic_read(&sh->count) == 0) {
 				r5c_flush_stripe(conf, sh);
+				if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
+					break;
 			}
-			if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
-				break;
 		}
 		spin_unlock(&conf->device_lock);
 		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
-- 
cgit v1.2.3-59-g8ed1b


From e33fbb9cc73d6502e69eaf1c178e0c39059763ea Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 10 Feb 2017 16:18:09 -0800
Subject: md/raid5-cache: exclude reclaiming stripes in reclaim check

stripes which are being reclaimed are still accounted into cached
stripes. The reclaim takes time. r5c_do_reclaim isn't aware of the
stripes and does unnecessary stripe reclaim. In practice, I saw one
stripe is reclaimed one time. This will cause bad IO pattern. Fixing
this by excluding the reclaing stripes in the check.

Cc: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 14 ++++++++++++--
 drivers/md/raid5.c       |  2 ++
 drivers/md/raid5.h       |  2 ++
 3 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index a01f4daeb390..3f307be01b10 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1327,6 +1327,10 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
 	atomic_inc(&conf->active_stripes);
 	r5c_make_stripe_write_out(sh);
 
+	if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+		atomic_inc(&conf->r5c_flushing_partial_stripes);
+	else
+		atomic_inc(&conf->r5c_flushing_full_stripes);
 	raid5_release_stripe(sh);
 }
 
@@ -1369,12 +1373,16 @@ static void r5c_do_reclaim(struct r5conf *conf)
 	unsigned long flags;
 	int total_cached;
 	int stripes_to_flush;
+	int flushing_partial, flushing_full;
 
 	if (!r5c_is_writeback(log))
 		return;
 
+	flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
+	flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
 	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
-		atomic_read(&conf->r5c_cached_full_stripes);
+		atomic_read(&conf->r5c_cached_full_stripes) -
+		flushing_full - flushing_partial;
 
 	if (total_cached > conf->min_nr_stripes * 3 / 4 ||
 	    atomic_read(&conf->empty_inactive_list_nr) > 0)
@@ -1384,7 +1392,7 @@ static void r5c_do_reclaim(struct r5conf *conf)
 		 */
 		stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
 	else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
-		 atomic_read(&conf->r5c_cached_full_stripes) >
+		 atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
 		 R5C_FULL_STRIPE_FLUSH_BATCH)
 		/*
 		 * if stripe cache pressure moderate, or if there is many full
@@ -2601,11 +2609,13 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 
 	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
 		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+		atomic_dec(&conf->r5c_flushing_partial_stripes);
 		atomic_dec(&conf->r5c_cached_partial_stripes);
 	}
 
 	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
 		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
+		atomic_dec(&conf->r5c_flushing_full_stripes);
 		atomic_dec(&conf->r5c_cached_full_stripes);
 	}
 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b0d1345c832c..e2fa88fc6a1b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6838,6 +6838,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
 	atomic_set(&conf->r5c_cached_partial_stripes, 0);
 	INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+	atomic_set(&conf->r5c_flushing_full_stripes, 0);
+	atomic_set(&conf->r5c_flushing_partial_stripes, 0);
 
 	conf->level = mddev->new_level;
 	conf->chunk_sectors = mddev->new_chunk_sectors;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index c0687df5ba06..4bb27b97bf6b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -663,6 +663,8 @@ struct r5conf {
 	struct list_head	r5c_full_stripe_list;
 	atomic_t		r5c_cached_partial_stripes;
 	struct list_head	r5c_partial_stripe_list;
+	atomic_t		r5c_flushing_full_stripes;
+	atomic_t		r5c_flushing_partial_stripes;
 
 	atomic_t		empty_inactive_list_nr;
 	struct llist_head	released_stripes;
-- 
cgit v1.2.3-59-g8ed1b


From 26483819f89c5cf9d27620d70c95afeeeb9bece5 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 13 Feb 2017 16:21:49 -0800
Subject: md: disable WRITE SAME if it fails in underlayer disks

This makes md do the same thing as dm for write same IO failure. Please
see 7eee4ae(dm: disable WRITE SAME if it fails) for details why we need
this.

We did a little bit different than dm. Instead of disabling writesame in
the first IO error, we disable it till next writesame IO coming after
the first IO error. This way we don't need to clone a bio.

Also reported here: https://bugzilla.kernel.org/show_bug.cgi?id=118581

Suggested-by: NeilBrown <neilb@suse.com>
Acked-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/linear.c    | 1 +
 drivers/md/md.h        | 7 +++++++
 drivers/md/multipath.c | 1 +
 drivers/md/raid0.c     | 1 +
 4 files changed, 10 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 26a73b2002cf..789008bc94ff 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -291,6 +291,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 				trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
 						      split, disk_devt(mddev->gendisk),
 						      bio_sector);
+			mddev_check_writesame(mddev, split);
 			generic_make_request(split);
 		}
 	} while (split != bio);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2a514036a83d..42f8398181a8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -710,4 +710,11 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
 {
 	mddev->flags &= ~unsupported_flags;
 }
+
+static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
+{
+	if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+	    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+		mddev->queue->limits.max_write_same_sectors = 0;
+}
 #endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index aa8c4e5c1ee2..065fe2823285 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -138,6 +138,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 	mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
 	mp_bh->bio.bi_end_io = multipath_end_request;
 	mp_bh->bio.bi_private = mp_bh;
+	mddev_check_writesame(mddev, &mp_bh->bio);
 	generic_make_request(&mp_bh->bio);
 	return;
 }
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 848365d474f3..b3d264452fd5 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -503,6 +503,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 				trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
 						      split, disk_devt(mddev->gendisk),
 						      bio_sector);
+			mddev_check_writesame(mddev, split);
 			generic_make_request(split);
 		}
 	} while (split != bio);
-- 
cgit v1.2.3-59-g8ed1b


From 10273170fd5602d8090b1312e66ad4746ab02c94 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Tue, 14 Feb 2017 23:29:00 +0800
Subject: md: fail if mddev->bio_set can't be created

The current behaviour is to fall back to allocate
bio from 'fs_bio_set', that isn't a correct way
because it might cause deadlock.

So this patch simply return failure if mddev->bio_set
can't be created.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 13020e5c8cc0..137a1fe124e8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5228,8 +5228,11 @@ int md_run(struct mddev *mddev)
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
 	}
 
-	if (mddev->bio_set == NULL)
+	if (mddev->bio_set == NULL) {
 		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
+		if (!mddev->bio_set)
+			return -ENOMEM;
+	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
-- 
cgit v1.2.3-59-g8ed1b


From 8e58e327e25c7fffc3fb79a24c76637bdda37716 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Tue, 14 Feb 2017 23:29:01 +0800
Subject: md/raid1: use bio_clone_bioset_partial() in case of write behind

Write behind need to replace pages in bio's bvecs, and we have
to clone a fresh bio with new bvec table, so use the introduced
bio_clone_bioset_partial() for it.

For other bio_clone_mddev() cases, we will use fast clone since
they don't need to touch bvec table.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7b0f647bcccb..691d6d9bf740 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1345,13 +1345,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 	first_clone = 1;
 	for (i = 0; i < disks; i++) {
-		struct bio *mbio;
+		struct bio *mbio = NULL;
+		sector_t offset;
 		if (!r1_bio->bios[i])
 			continue;
 
-		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
-			 max_sectors);
+		offset = r1_bio->sector - bio->bi_iter.bi_sector;
 
 		if (first_clone) {
 			/* do behind I/O ?
@@ -1361,8 +1360,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			if (bitmap &&
 			    (atomic_read(&bitmap->behind_writes)
 			     < mddev->bitmap_info.max_write_behind) &&
-			    !waitqueue_active(&bitmap->behind_wait))
+			    !waitqueue_active(&bitmap->behind_wait)) {
+				mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
+								mddev->bio_set,
+								offset,
+								max_sectors);
 				alloc_behind_pages(mbio, r1_bio);
+			}
 
 			bitmap_startwrite(bitmap, r1_bio->sector,
 					  r1_bio->sectors,
@@ -1370,6 +1374,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 						   &r1_bio->state));
 			first_clone = 0;
 		}
+
+		if (!mbio) {
+			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			bio_trim(mbio, offset, max_sectors);
+		}
+
 		if (r1_bio->behind_bvecs) {
 			struct bio_vec *bvec;
 			int j;
-- 
cgit v1.2.3-59-g8ed1b


From ed7ef732ca9f7d6f42be8df6cc7bf4ace3534af3 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Tue, 14 Feb 2017 23:29:02 +0800
Subject: md: remove unnecessary check on mddev

mddev is never NULL and neither is ->bio_set, so
remove the check.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 137a1fe124e8..0e408bc67904 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -193,9 +193,6 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 			    struct mddev *mddev)
 {
-	if (!mddev || !mddev->bio_set)
-		return bio_clone(bio, gfp_mask);
-
 	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
 }
 EXPORT_SYMBOL_GPL(bio_clone_mddev);
-- 
cgit v1.2.3-59-g8ed1b


From d7a1030839d35c04a620e841f406b9b2a8600041 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Tue, 14 Feb 2017 23:29:03 +0800
Subject: md: fast clone bio in bio_clone_mddev()

Firstly bio_clone_mddev() is used in raid normal I/O and isn't
in resync I/O path.

Secondly all the direct access to bvec table in raid happens on
resync I/O except for write behind of raid1, in which we still
use bio_clone() for allocating new bvec table.

So this patch replaces bio_clone() with bio_clone_fast()
in bio_clone_mddev().

Also kill bio_clone_mddev() and call bio_clone_fast() directly, as
suggested by Christoph Hellwig.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/faulty.c |  2 +-
 drivers/md/md.c     |  7 -------
 drivers/md/md.h     |  2 --
 drivers/md/raid1.c  | 10 ++++++----
 drivers/md/raid10.c | 11 +++++------
 drivers/md/raid5.c  |  4 ++--
 6 files changed, 14 insertions(+), 22 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 685aa2d77e25..b0536cfd8e17 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
 		}
 	}
 	if (failit) {
-		struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 
 		b->bi_bdev = conf->rdev->bdev;
 		b->bi_private = bio;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0e408bc67904..55e7e7a8714e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -190,13 +190,6 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 }
 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 
-struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
-			    struct mddev *mddev)
-{
-	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
-}
-EXPORT_SYMBOL_GPL(bio_clone_mddev);
-
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 42f8398181a8..b8859cbf84b6 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -673,8 +673,6 @@ extern void md_rdev_clear(struct md_rdev *rdev);
 
 extern void mddev_suspend(struct mddev *mddev);
 extern void mddev_resume(struct mddev *mddev);
-extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
-				   struct mddev *mddev);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   struct mddev *mddev);
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 691d6d9bf740..ad5c9483bd50 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1108,7 +1108,7 @@ read_again:
 	r1_bio->read_disk = rdisk;
 	r1_bio->start_next_window = 0;
 
-	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+	read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 	bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
 		 max_sectors);
 
@@ -1376,7 +1376,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		}
 
 		if (!mbio) {
-			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 			bio_trim(mbio, offset, max_sectors);
 		}
 
@@ -2286,7 +2286,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 
 			wbio->bi_vcnt = vcnt;
 		} else {
-			wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+			wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
+					      mddev->bio_set);
 		}
 
 		bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2424,7 +2425,8 @@ read_more:
 		const unsigned long do_sync
 			= r1_bio->master_bio->bi_opf & REQ_SYNC;
 		r1_bio->read_disk = disk;
-		bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+		bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
+				     mddev->bio_set);
 		bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
 			 max_sectors);
 		r1_bio->bios[r1_bio->read_disk] = bio;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1920756828df..ade7d69234d5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1132,7 +1132,7 @@ read_again:
 	}
 	slot = r10_bio->read_slot;
 
-	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+	read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 	bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
 		 max_sectors);
 
@@ -1406,7 +1406,7 @@ retry_write:
 		int d = r10_bio->devs[i].devnum;
 		if (r10_bio->devs[i].bio) {
 			struct md_rdev *rdev = conf->mirrors[d].rdev;
-			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 			bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
 				 max_sectors);
 			r10_bio->devs[i].bio = mbio;
@@ -1457,7 +1457,7 @@ retry_write:
 				smp_mb();
 				rdev = conf->mirrors[d].rdev;
 			}
-			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 			bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
 				 max_sectors);
 			r10_bio->devs[i].repl_bio = mbio;
@@ -2565,7 +2565,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
 		if (sectors > sect_to_write)
 			sectors = sect_to_write;
 		/* Write at 'sector' for 'sectors' */
-		wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 		bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
 		wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
 		wbio->bi_iter.bi_sector = wsector +
@@ -2641,8 +2641,7 @@ read_more:
 			   mdname(mddev),
 			   bdevname(rdev->bdev, b),
 			   (unsigned long long)r10_bio->sector);
-	bio = bio_clone_mddev(r10_bio->master_bio,
-			      GFP_NOIO, mddev);
+	bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set);
 	bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
 	r10_bio->devs[slot].bio = bio;
 	r10_bio->devs[slot].rdev = rdev;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e2fa88fc6a1b..b193316804d4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5056,9 +5056,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 		return 0;
 	}
 	/*
-	 * use bio_clone_mddev to make a copy of the bio
+	 * use bio_clone_fast to make a copy of the bio
 	 */
-	align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
+	align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set);
 	if (!align_bi)
 		return 0;
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From eae8263fb1f4256460270dd8f42334604dcdfac6 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Tue, 14 Feb 2017 16:26:24 +0900
Subject: md/raid5: Don't reinvent the wheel but use existing llist API

Although llist provides proper APIs, they are not used. Make them used.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b193316804d4..7b7722bb2e8d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -353,17 +353,15 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 static int release_stripe_list(struct r5conf *conf,
 			       struct list_head *temp_inactive_list)
 {
-	struct stripe_head *sh;
+	struct stripe_head *sh, *t;
 	int count = 0;
 	struct llist_node *head;
 
 	head = llist_del_all(&conf->released_stripes);
 	head = llist_reverse_order(head);
-	while (head) {
+	llist_for_each_entry_safe(sh, t, head, release_list) {
 		int hash;
 
-		sh = llist_entry(head, struct stripe_head, release_list);
-		head = llist_next(head);
 		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
 		smp_mb();
 		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
-- 
cgit v1.2.3-59-g8ed1b


From fd76863e37fef26fe05547fddfa6e3d05e1682e6 Mon Sep 17 00:00:00 2001
From: "colyli@suse.de" <colyli@suse.de>
Date: Sat, 18 Feb 2017 03:05:56 +0800
Subject: RAID1: a new I/O barrier implementation to remove resync window

'Commit 79ef3a8aa1cb ("raid1: Rewrite the implementation of iobarrier.")'
introduces a sliding resync window for raid1 I/O barrier, this idea limits
I/O barriers to happen only inside a slidingresync window, for regular
I/Os out of this resync window they don't need to wait for barrier any
more. On large raid1 device, it helps a lot to improve parallel writing
I/O throughput when there are background resync I/Os performing at
same time.

The idea of sliding resync widow is awesome, but code complexity is a
challenge. Sliding resync window requires several variables to work
collectively, this is complexed and very hard to make it work correctly.
Just grep "Fixes: 79ef3a8aa1" in kernel git log, there are 8 more patches
to fix the original resync window patch. This is not the end, any further
related modification may easily introduce more regreassion.

Therefore I decide to implement a much simpler raid1 I/O barrier, by
removing resync window code, I believe life will be much easier.

The brief idea of the simpler barrier is,
 - Do not maintain a global unique resync window
 - Use multiple hash buckets to reduce I/O barrier conflicts, regular
   I/O only has to wait for a resync I/O when both them have same barrier
   bucket index, vice versa.
 - I/O barrier can be reduced to an acceptable number if there are enough
   barrier buckets

Here I explain how the barrier buckets are designed,
 - BARRIER_UNIT_SECTOR_SIZE
   The whole LBA address space of a raid1 device is divided into multiple
   barrier units, by the size of BARRIER_UNIT_SECTOR_SIZE.
   Bio requests won't go across border of barrier unit size, that means
   maximum bio size is BARRIER_UNIT_SECTOR_SIZE<<9 (64MB) in bytes.
   For random I/O 64MB is large enough for both read and write requests,
   for sequential I/O considering underlying block layer may merge them
   into larger requests, 64MB is still good enough.
   Neil also points out that for resync operation, "we want the resync to
   move from region to region fairly quickly so that the slowness caused
   by having to synchronize with the resync is averaged out over a fairly
   small time frame". For full speed resync, 64MB should take less then 1
   second. When resync is competing with other I/O, it could take up a few
   minutes. Therefore 64MB size is fairly good range for resync.

 - BARRIER_BUCKETS_NR
   There are BARRIER_BUCKETS_NR buckets in total, which is defined by,
        #define BARRIER_BUCKETS_NR_BITS   (PAGE_SHIFT - 2)
        #define BARRIER_BUCKETS_NR        (1<<BARRIER_BUCKETS_NR_BITS)
   this patch makes the bellowed members of struct r1conf from integer
   to array of integers,
        -       int                     nr_pending;
        -       int                     nr_waiting;
        -       int                     nr_queued;
        -       int                     barrier;
        +       int                     *nr_pending;
        +       int                     *nr_waiting;
        +       int                     *nr_queued;
        +       int                     *barrier;
   number of the array elements is defined as BARRIER_BUCKETS_NR. For 4KB
   kernel space page size, (PAGE_SHIFT - 2) indecates there are 1024 I/O
   barrier buckets, and each array of integers occupies single memory page.
   1024 means for a request which is smaller than the I/O barrier unit size
   has ~0.1% chance to wait for resync to pause, which is quite a small
   enough fraction. Also requesting single memory page is more friendly to
   kernel page allocator than larger memory size.

 - I/O barrier bucket is indexed by bio start sector
   If multiple I/O requests hit different I/O barrier units, they only need
   to compete I/O barrier with other I/Os which hit the same I/O barrier
   bucket index with each other. The index of a barrier bucket which a
   bio should look for is calculated by sector_to_idx() which is defined
   in raid1.h as an inline function,
        static inline int sector_to_idx(sector_t sector)
        {
                return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
                                BARRIER_BUCKETS_NR_BITS);
        }
   Here sector_nr is the start sector number of a bio.

 - Single bio won't go across boundary of a I/O barrier unit
   If a request goes across boundary of barrier unit, it will be split. A
   bio may be split in raid1_make_request() or raid1_sync_request(), if
   sectors returned by align_to_barrier_unit_end() is smaller than
   original bio size.

Comparing to single sliding resync window,
 - Currently resync I/O grows linearly, therefore regular and resync I/O
   will conflict within a single barrier units. So the I/O behavior is
   similar to single sliding resync window.
 - But a barrier unit bucket is shared by all barrier units with identical
   barrier uinit index, the probability of conflict might be higher
   than single sliding resync window, in condition that writing I/Os
   always hit barrier units which have identical barrier bucket indexs with
   the resync I/Os. This is a very rare condition in real I/O work loads,
   I cannot imagine how it could happen in practice.
 - Therefore we can achieve a good enough low conflict rate with much
   simpler barrier algorithm and implementation.

There are two changes should be noticed,
 - In raid1d(), I change the code to decrease conf->nr_pending[idx] into
   single loop, it looks like this,
        spin_lock_irqsave(&conf->device_lock, flags);
        conf->nr_queued[idx]--;
        spin_unlock_irqrestore(&conf->device_lock, flags);
   This change generates more spin lock operations, but in next patch of
   this patch set, it will be replaced by a single line code,
        atomic_dec(&conf->nr_queueud[idx]);
   So we don't need to worry about spin lock cost here.
 - Mainline raid1 code split original raid1_make_request() into
   raid1_read_request() and raid1_write_request(). If the original bio
   goes across an I/O barrier unit size, this bio will be split before
   calling raid1_read_request() or raid1_write_request(),  this change
   the code logic more simple and clear.
 - In this patch wait_barrier() is moved from raid1_make_request() to
   raid1_write_request(). In raid_read_request(), original wait_barrier()
   is replaced by raid1_read_request().
   The differnece is wait_read_barrier() only waits if array is frozen,
   using different barrier function in different code path makes the code
   more clean and easy to read.
Changelog
V4:
- Add alloc_r1bio() to remove redundant r1bio memory allocation code.
- Fix many typos in patch comments.
- Use (PAGE_SHIFT - ilog2(sizeof(int))) to define BARRIER_BUCKETS_NR_BITS.
V3:
- Rebase the patch against latest upstream kernel code.
- Many fixes by review comments from Neil,
  - Back to use pointers to replace arraries in struct r1conf
  - Remove total_barriers from struct r1conf
  - Add more patch comments to explain how/why the values of
    BARRIER_UNIT_SECTOR_SIZE and BARRIER_BUCKETS_NR are decided.
  - Use get_unqueued_pending() to replace get_all_pendings() and
    get_all_queued()
  - Increase bucket number from 512 to 1024
- Change code comments format by review from Shaohua.
V2:
- Use bio_split() to split the orignal bio if it goes across barrier unit
  bounday, to make the code more simple, by suggestion from Shaohua and
  Neil.
- Use hash_long() to replace original linear hash, to avoid a possible
  confilict between resync I/O and sequential write I/O, by suggestion from
  Shaohua.
- Add conf->total_barriers to record barrier depth, which is used to
  control number of parallel sync I/O barriers, by suggestion from Shaohua.
- In V1 patch the bellowed barrier buckets related members in r1conf are
  allocated in memory page. To make the code more simple, V2 patch moves
  the memory space into struct r1conf, like this,
        -       int                     nr_pending;
        -       int                     nr_waiting;
        -       int                     nr_queued;
        -       int                     barrier;
        +       int                     nr_pending[BARRIER_BUCKETS_NR];
        +       int                     nr_waiting[BARRIER_BUCKETS_NR];
        +       int                     nr_queued[BARRIER_BUCKETS_NR];
        +       int                     barrier[BARRIER_BUCKETS_NR];
  This change is by the suggestion from Shaohua.
- Remove some inrelavent code comments, by suggestion from Guoqing.
- Add a missing wait_barrier() before jumping to retry_write, in
  raid1_make_write_request().
V1:
- Original RFC patch for comments

Signed-off-by: Coly Li <colyli@suse.de>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Guoqing Jiang <gqjiang@suse.com>
Reviewed-by: Neil Brown <neilb@suse.de>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 473 +++++++++++++++++++++++++++++------------------------
 drivers/md/raid1.h |  57 ++++---
 2 files changed, 294 insertions(+), 236 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ad5c9483bd50..40297fd17f7e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -71,9 +71,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
-			  sector_t bi_sector);
-static void lower_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
+static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 
 #define raid1_log(md, fmt, args...)				\
 	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
@@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
 #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
 #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
-#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -215,7 +213,7 @@ static void put_buf(struct r1bio *r1_bio)
 
 	mempool_free(r1_bio, conf->r1buf_pool);
 
-	lower_barrier(conf);
+	lower_barrier(conf, r1_bio->sector);
 }
 
 static void reschedule_retry(struct r1bio *r1_bio)
@@ -223,10 +221,12 @@ static void reschedule_retry(struct r1bio *r1_bio)
 	unsigned long flags;
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
+	int idx;
 
+	idx = sector_to_idx(r1_bio->sector);
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r1_bio->retry_list, &conf->retry_list);
-	conf->nr_queued ++;
+	conf->nr_queued[idx]++;
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	wake_up(&conf->wait_barrier);
@@ -243,7 +243,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct bio *bio = r1_bio->master_bio;
 	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
-	sector_t start_next_window = r1_bio->start_next_window;
 	sector_t bi_sector = bio->bi_iter.bi_sector;
 
 	if (bio->bi_phys_segments) {
@@ -269,7 +268,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
 		 */
-		allow_barrier(conf, start_next_window, bi_sector);
+		allow_barrier(conf, bi_sector);
 	}
 }
 
@@ -517,6 +516,25 @@ static void raid1_end_write_request(struct bio *bio)
 		bio_put(to_put);
 }
 
+static sector_t align_to_barrier_unit_end(sector_t start_sector,
+					  sector_t sectors)
+{
+	sector_t len;
+
+	WARN_ON(sectors == 0);
+	/*
+	 * len is the number of sectors from start_sector to end of the
+	 * barrier unit which start_sector belongs to.
+	 */
+	len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
+	      start_sector;
+
+	if (len > sectors)
+		len = sectors;
+
+	return len;
+}
+
 /*
  * This routine returns the disk from which the requested read should
  * be done. There is a per-array 'next expected sequential IO' sector
@@ -813,168 +831,168 @@ static void flush_pending_writes(struct r1conf *conf)
  */
 static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
 {
+	int idx = sector_to_idx(sector_nr);
+
 	spin_lock_irq(&conf->resync_lock);
 
 	/* Wait until no block IO is waiting */
-	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting[idx],
 			    conf->resync_lock);
 
 	/* block any new IO from starting */
-	conf->barrier++;
-	conf->next_resync = sector_nr;
+	conf->barrier[idx]++;
 
 	/* For these conditions we must wait:
 	 * A: while the array is in frozen state
-	 * B: while barrier >= RESYNC_DEPTH, meaning resync reach
-	 *    the max count which allowed.
-	 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
-	 *    next resync will reach to the window which normal bios are
-	 *    handling.
-	 * D: while there are any active requests in the current window.
+	 * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
+	 *    existing in corresponding I/O barrier bucket.
+	 * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
+	 *    max resync count which allowed on current I/O barrier bucket.
 	 */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->array_frozen &&
-			    conf->barrier < RESYNC_DEPTH &&
-			    conf->current_window_requests == 0 &&
-			    (conf->start_next_window >=
-			     conf->next_resync + RESYNC_SECTORS),
+			     !conf->nr_pending[idx] &&
+			     conf->barrier[idx] < RESYNC_DEPTH,
 			    conf->resync_lock);
 
-	conf->nr_pending++;
+	conf->nr_pending[idx]++;
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static void lower_barrier(struct r1conf *conf)
+static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
 {
 	unsigned long flags;
-	BUG_ON(conf->barrier <= 0);
+	int idx = sector_to_idx(sector_nr);
+
+	BUG_ON(conf->barrier[idx] <= 0);
+
 	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->barrier--;
-	conf->nr_pending--;
+	conf->barrier[idx]--;
+	conf->nr_pending[idx]--;
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
 
-static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
+static void _wait_barrier(struct r1conf *conf, int idx)
 {
-	bool wait = false;
-
-	if (conf->array_frozen || !bio)
-		wait = true;
-	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
-		if ((conf->mddev->curr_resync_completed
-		     >= bio_end_sector(bio)) ||
-		    (conf->start_next_window + NEXT_NORMALIO_DISTANCE
-		     <= bio->bi_iter.bi_sector))
-			wait = false;
-		else
-			wait = true;
+	spin_lock_irq(&conf->resync_lock);
+	if (conf->array_frozen || conf->barrier[idx]) {
+		conf->nr_waiting[idx]++;
+		/* Wait for the barrier to drop. */
+		wait_event_lock_irq(
+			conf->wait_barrier,
+			!conf->array_frozen && !conf->barrier[idx],
+			conf->resync_lock);
+		conf->nr_waiting[idx]--;
 	}
 
-	return wait;
+	conf->nr_pending[idx]++;
+	spin_unlock_irq(&conf->resync_lock);
 }
 
-static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 {
-	sector_t sector = 0;
+	int idx = sector_to_idx(sector_nr);
 
 	spin_lock_irq(&conf->resync_lock);
-	if (need_to_wait_for_sync(conf, bio)) {
-		conf->nr_waiting++;
-		/* Wait for the barrier to drop.
-		 * However if there are already pending
-		 * requests (preventing the barrier from
-		 * rising completely), and the
-		 * per-process bio queue isn't empty,
-		 * then don't wait, as we need to empty
-		 * that queue to allow conf->start_next_window
-		 * to increase.
-		 */
-		raid1_log(conf->mddev, "wait barrier");
-		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->array_frozen &&
-				    (!conf->barrier ||
-				     ((conf->start_next_window <
-				       conf->next_resync + RESYNC_SECTORS) &&
-				      current->bio_list &&
-				      !bio_list_empty(current->bio_list))),
-				    conf->resync_lock);
-		conf->nr_waiting--;
-	}
-
-	if (bio && bio_data_dir(bio) == WRITE) {
-		if (bio->bi_iter.bi_sector >= conf->next_resync) {
-			if (conf->start_next_window == MaxSector)
-				conf->start_next_window =
-					conf->next_resync +
-					NEXT_NORMALIO_DISTANCE;
-
-			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
-			    <= bio->bi_iter.bi_sector)
-				conf->next_window_requests++;
-			else
-				conf->current_window_requests++;
-			sector = conf->start_next_window;
-		}
+	if (conf->array_frozen) {
+		conf->nr_waiting[idx]++;
+		/* Wait for array to unfreeze */
+		wait_event_lock_irq(
+			conf->wait_barrier,
+			!conf->array_frozen,
+			conf->resync_lock);
+		conf->nr_waiting[idx]--;
 	}
 
-	conf->nr_pending++;
+	conf->nr_pending[idx]++;
 	spin_unlock_irq(&conf->resync_lock);
-	return sector;
 }
 
-static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
-			  sector_t bi_sector)
+static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
+{
+	int idx = sector_to_idx(sector_nr);
+
+	_wait_barrier(conf, idx);
+}
+
+static void wait_all_barriers(struct r1conf *conf)
+{
+	int idx;
+
+	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+		_wait_barrier(conf, idx);
+}
+
+static void _allow_barrier(struct r1conf *conf, int idx)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->nr_pending--;
-	if (start_next_window) {
-		if (start_next_window == conf->start_next_window) {
-			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
-			    <= bi_sector)
-				conf->next_window_requests--;
-			else
-				conf->current_window_requests--;
-		} else
-			conf->current_window_requests--;
-
-		if (!conf->current_window_requests) {
-			if (conf->next_window_requests) {
-				conf->current_window_requests =
-					conf->next_window_requests;
-				conf->next_window_requests = 0;
-				conf->start_next_window +=
-					NEXT_NORMALIO_DISTANCE;
-			} else
-				conf->start_next_window = MaxSector;
-		}
-	}
+	conf->nr_pending[idx]--;
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
 
+static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
+{
+	int idx = sector_to_idx(sector_nr);
+
+	_allow_barrier(conf, idx);
+}
+
+static void allow_all_barriers(struct r1conf *conf)
+{
+	int idx;
+
+	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+		_allow_barrier(conf, idx);
+}
+
+/* conf->resync_lock should be held */
+static int get_unqueued_pending(struct r1conf *conf)
+{
+	int idx, ret;
+
+	for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+		ret += conf->nr_pending[idx] - conf->nr_queued[idx];
+
+	return ret;
+}
+
 static void freeze_array(struct r1conf *conf, int extra)
 {
-	/* stop syncio and normal IO and wait for everything to
+	/* Stop sync I/O and normal I/O and wait for everything to
 	 * go quite.
-	 * We wait until nr_pending match nr_queued+extra
-	 * This is called in the context of one normal IO request
-	 * that has failed. Thus any sync request that might be pending
-	 * will be blocked by nr_pending, and we need to wait for
-	 * pending IO requests to complete or be queued for re-try.
-	 * Thus the number queued (nr_queued) plus this request (extra)
-	 * must match the number of pending IOs (nr_pending) before
-	 * we continue.
+	 * This is called in two situations:
+	 * 1) management command handlers (reshape, remove disk, quiesce).
+	 * 2) one normal I/O request failed.
+
+	 * After array_frozen is set to 1, new sync IO will be blocked at
+	 * raise_barrier(), and new normal I/O will blocked at _wait_barrier()
+	 * or wait_read_barrier(). The flying I/Os will either complete or be
+	 * queued. When everything goes quite, there are only queued I/Os left.
+
+	 * Every flying I/O contributes to a conf->nr_pending[idx], idx is the
+	 * barrier bucket index which this I/O request hits. When all sync and
+	 * normal I/O are queued, sum of all conf->nr_pending[] will match sum
+	 * of all conf->nr_queued[]. But normal I/O failure is an exception,
+	 * in handle_read_error(), we may call freeze_array() before trying to
+	 * fix the read error. In this case, the error read I/O is not queued,
+	 * so get_unqueued_pending() == 1.
+	 *
+	 * Therefore before this function returns, we need to wait until
+	 * get_unqueued_pendings(conf) gets equal to extra. For
+	 * normal I/O context, extra is 1, in rested situations extra is 0.
 	 */
 	spin_lock_irq(&conf->resync_lock);
 	conf->array_frozen = 1;
 	raid1_log(conf->mddev, "wait freeze");
-	wait_event_lock_irq_cmd(conf->wait_barrier,
-				conf->nr_pending == conf->nr_queued+extra,
-				conf->resync_lock,
-				flush_pending_writes(conf));
+	wait_event_lock_irq_cmd(
+		conf->wait_barrier,
+		get_unqueued_pending(conf) == extra,
+		conf->resync_lock,
+		flush_pending_writes(conf));
 	spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(struct r1conf *conf)
@@ -1070,11 +1088,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	kfree(plug);
 }
 
-static void raid1_read_request(struct mddev *mddev, struct bio *bio,
-				 struct r1bio *r1_bio)
+static inline struct r1bio *
+alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
+{
+	struct r1conf *conf = mddev->private;
+	struct r1bio *r1_bio;
+
+	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+	r1_bio->master_bio = bio;
+	r1_bio->sectors = bio_sectors(bio) - sectors_handled;
+	r1_bio->state = 0;
+	r1_bio->mddev = mddev;
+	r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+
+	return r1_bio;
+}
+
+static void raid1_read_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r1conf *conf = mddev->private;
 	struct raid1_info *mirror;
+	struct r1bio *r1_bio;
 	struct bio *read_bio;
 	struct bitmap *bitmap = mddev->bitmap;
 	const int op = bio_op(bio);
@@ -1083,8 +1118,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	int max_sectors;
 	int rdisk;
 
-	wait_barrier(conf, bio);
+	/*
+	 * Still need barrier for READ in case that whole
+	 * array is frozen.
+	 */
+	wait_read_barrier(conf, bio->bi_iter.bi_sector);
+
+	r1_bio = alloc_r1bio(mddev, bio, 0);
 
+	/*
+	 * We might need to issue multiple reads to different
+	 * devices if there are bad blocks around, so we keep
+	 * track of the number of reads in bio->bi_phys_segments.
+	 * If this is 0, there is only one r1_bio and no locking
+	 * will be needed when requests complete.  If it is
+	 * non-zero, then it is the number of not-completed requests.
+	 */
+	bio->bi_phys_segments = 0;
+	bio_clear_flag(bio, BIO_SEG_VALID);
+
+	/*
+	 * make_request() can abort the operation when read-ahead is being
+	 * used and no empty request is available.
+	 */
 read_again:
 	rdisk = read_balance(conf, r1_bio, &max_sectors);
 
@@ -1106,7 +1162,6 @@ read_again:
 			   atomic_read(&bitmap->behind_writes) == 0);
 	}
 	r1_bio->read_disk = rdisk;
-	r1_bio->start_next_window = 0;
 
 	read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 	bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
@@ -1151,22 +1206,16 @@ read_again:
 		 */
 		reschedule_retry(r1_bio);
 
-		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-		r1_bio->master_bio = bio;
-		r1_bio->sectors = bio_sectors(bio) - sectors_handled;
-		r1_bio->state = 0;
-		r1_bio->mddev = mddev;
-		r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+		r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
 		goto read_again;
 	} else
 		generic_make_request(read_bio);
 }
 
-static void raid1_write_request(struct mddev *mddev, struct bio *bio,
-				struct r1bio *r1_bio)
+static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r1conf *conf = mddev->private;
+	struct r1bio *r1_bio;
 	int i, disks;
 	struct bitmap *bitmap = mddev->bitmap;
 	unsigned long flags;
@@ -1180,7 +1229,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
-	sector_t start_next_window;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1216,7 +1264,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
-	start_next_window = wait_barrier(conf, bio);
+	wait_barrier(conf, bio->bi_iter.bi_sector);
+
+	r1_bio = alloc_r1bio(mddev, bio, 0);
+
+	/* We might need to issue multiple writes to different
+	 * devices if there are bad blocks around, so we keep
+	 * track of the number of writes in bio->bi_phys_segments.
+	 * If this is 0, there is only one r1_bio and no locking
+	 * will be needed when requests complete.  If it is
+	 * non-zero, then it is the number of not-completed requests.
+	 */
+	bio->bi_phys_segments = 0;
+	bio_clear_flag(bio, BIO_SEG_VALID);
 
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
@@ -1237,7 +1297,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 	disks = conf->raid_disks * 2;
  retry_write:
-	r1_bio->start_next_window = start_next_window;
 	blocked_rdev = NULL;
 	rcu_read_lock();
 	max_sectors = r1_bio->sectors;
@@ -1304,25 +1363,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	if (unlikely(blocked_rdev)) {
 		/* Wait for this device to become unblocked */
 		int j;
-		sector_t old = start_next_window;
 
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
-		allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
+		allow_barrier(conf, bio->bi_iter.bi_sector);
 		raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		start_next_window = wait_barrier(conf, bio);
-		/*
-		 * We must make sure the multi r1bios of bio have
-		 * the same value of bi_phys_segments
-		 */
-		if (bio->bi_phys_segments && old &&
-		    old != start_next_window)
-			/* Wait for the former r1bio(s) to complete */
-			wait_event(conf->wait_barrier,
-				   bio->bi_phys_segments == 1);
+		wait_barrier(conf, bio->bi_iter.bi_sector);
 		goto retry_write;
 	}
 
@@ -1440,12 +1489,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		/* We need another r1_bio.  It has already been counted
 		 * in bio->bi_phys_segments
 		 */
-		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-		r1_bio->master_bio = bio;
-		r1_bio->sectors = bio_sectors(bio) - sectors_handled;
-		r1_bio->state = 0;
-		r1_bio->mddev = mddev;
-		r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+		r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
 		goto retry_write;
 	}
 
@@ -1457,36 +1501,25 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 {
-	struct r1conf *conf = mddev->private;
-	struct r1bio *r1_bio;
+	struct bio *split;
+	sector_t sectors;
 
-	/*
-	 * make_request() can abort the operation when read-ahead is being
-	 * used and no empty request is available.
-	 *
-	 */
-	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-	r1_bio->master_bio = bio;
-	r1_bio->sectors = bio_sectors(bio);
-	r1_bio->state = 0;
-	r1_bio->mddev = mddev;
-	r1_bio->sector = bio->bi_iter.bi_sector;
-
-	/*
-	 * We might need to issue multiple reads to different devices if there
-	 * are bad blocks around, so we keep track of the number of reads in
-	 * bio->bi_phys_segments.  If this is 0, there is only one r1_bio and
-	 * no locking will be needed when requests complete.  If it is
-	 * non-zero, then it is the number of not-completed requests.
-	 */
-	bio->bi_phys_segments = 0;
-	bio_clear_flag(bio, BIO_SEG_VALID);
+	/* if bio exceeds barrier unit boundary, split it */
+	do {
+		sectors = align_to_barrier_unit_end(
+				bio->bi_iter.bi_sector, bio_sectors(bio));
+		if (sectors < bio_sectors(bio)) {
+			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
+			bio_chain(split, bio);
+		} else {
+			split = bio;
+		}
 
-	if (bio_data_dir(bio) == READ)
-		raid1_read_request(mddev, bio, r1_bio);
-	else
-		raid1_write_request(mddev, bio, r1_bio);
+		if (bio_data_dir(split) == READ)
+			raid1_read_request(mddev, split);
+		else
+			raid1_write_request(mddev, split);
+	} while (split != bio);
 }
 
 static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -1577,19 +1610,11 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-	wait_barrier(conf, NULL);
-	allow_barrier(conf, 0, 0);
+	wait_all_barriers(conf);
+	allow_all_barriers(conf);
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
-
-	spin_lock_irq(&conf->resync_lock);
-	conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE;
-	conf->start_next_window = MaxSector;
-	conf->current_window_requests +=
-		conf->next_window_requests;
-	conf->next_window_requests = 0;
-	spin_unlock_irq(&conf->resync_lock);
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -2337,8 +2362,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 
 static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
-	int m;
+	int m, idx;
 	bool fail = false;
+
 	for (m = 0; m < conf->raid_disks * 2 ; m++)
 		if (r1_bio->bios[m] == IO_MADE_GOOD) {
 			struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2364,7 +2390,8 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 	if (fail) {
 		spin_lock_irq(&conf->device_lock);
 		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
-		conf->nr_queued++;
+		idx = sector_to_idx(r1_bio->sector);
+		conf->nr_queued[idx]++;
 		spin_unlock_irq(&conf->device_lock);
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
@@ -2460,15 +2487,8 @@ read_more:
 			generic_make_request(bio);
 			bio = NULL;
 
-			r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-			r1_bio->master_bio = mbio;
-			r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
-			r1_bio->state = 0;
+			r1_bio = alloc_r1bio(mddev, mbio, sectors_handled);
 			set_bit(R1BIO_ReadError, &r1_bio->state);
-			r1_bio->mddev = mddev;
-			r1_bio->sector = mbio->bi_iter.bi_sector +
-				sectors_handled;
 
 			goto read_more;
 		} else {
@@ -2487,6 +2507,7 @@ static void raid1d(struct md_thread *thread)
 	struct r1conf *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	struct blk_plug plug;
+	int idx;
 
 	md_check_recovery(mddev);
 
@@ -2494,17 +2515,17 @@ static void raid1d(struct md_thread *thread)
 	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 		LIST_HEAD(tmp);
 		spin_lock_irqsave(&conf->device_lock, flags);
-		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-			while (!list_empty(&conf->bio_end_io_list)) {
-				list_move(conf->bio_end_io_list.prev, &tmp);
-				conf->nr_queued--;
-			}
-		}
+		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+			list_splice_init(&conf->bio_end_io_list, &tmp);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 		while (!list_empty(&tmp)) {
 			r1_bio = list_first_entry(&tmp, struct r1bio,
 						  retry_list);
 			list_del(&r1_bio->retry_list);
+			idx = sector_to_idx(r1_bio->sector);
+			spin_lock_irqsave(&conf->device_lock, flags);
+			conf->nr_queued[idx]--;
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			if (mddev->degraded)
 				set_bit(R1BIO_Degraded, &r1_bio->state);
 			if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2525,7 +2546,8 @@ static void raid1d(struct md_thread *thread)
 		}
 		r1_bio = list_entry(head->prev, struct r1bio, retry_list);
 		list_del(head->prev);
-		conf->nr_queued--;
+		idx = sector_to_idx(r1_bio->sector);
+		conf->nr_queued[idx]--;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r1_bio->mddev;
@@ -2564,7 +2586,6 @@ static int init_resync(struct r1conf *conf)
 					  conf->poolinfo);
 	if (!conf->r1buf_pool)
 		return -ENOMEM;
-	conf->next_resync = 0;
 	return 0;
 }
 
@@ -2593,6 +2614,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	int still_degraded = 0;
 	int good_sectors = RESYNC_SECTORS;
 	int min_bad = 0; /* number of sectors that are bad in all devices */
+	int idx = sector_to_idx(sector_nr);
 
 	if (!conf->r1buf_pool)
 		if (init_resync(conf))
@@ -2642,7 +2664,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	 * If there is non-resync activity waiting for a turn, then let it
 	 * though before starting on this new sync request.
 	 */
-	if (conf->nr_waiting)
+	if (conf->nr_waiting[idx])
 		schedule_timeout_uninterruptible(1);
 
 	/* we are incrementing sector_nr below. To be safe, we check against
@@ -2669,6 +2691,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	r1_bio->sector = sector_nr;
 	r1_bio->state = 0;
 	set_bit(R1BIO_IsSync, &r1_bio->state);
+	/* make sure good_sectors won't go across barrier unit boundary */
+	good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
 
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct md_rdev *rdev;
@@ -2899,6 +2923,26 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto abort;
 
+	conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
+				   sizeof(int), GFP_KERNEL);
+	if (!conf->nr_pending)
+		goto abort;
+
+	conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
+				   sizeof(int), GFP_KERNEL);
+	if (!conf->nr_waiting)
+		goto abort;
+
+	conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
+				  sizeof(int), GFP_KERNEL);
+	if (!conf->nr_queued)
+		goto abort;
+
+	conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
+				sizeof(int), GFP_KERNEL);
+	if (!conf->barrier)
+		goto abort;
+
 	conf->mirrors = kzalloc(sizeof(struct raid1_info)
 				* mddev->raid_disks * 2,
 				 GFP_KERNEL);
@@ -2954,9 +2998,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->pending_count = 0;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
-	conf->start_next_window = MaxSector;
-	conf->current_window_requests = conf->next_window_requests = 0;
-
 	err = -EIO;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
@@ -2999,6 +3040,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		kfree(conf->mirrors);
 		safe_put_page(conf->tmppage);
 		kfree(conf->poolinfo);
+		kfree(conf->nr_pending);
+		kfree(conf->nr_waiting);
+		kfree(conf->nr_queued);
+		kfree(conf->barrier);
 		kfree(conf);
 	}
 	return ERR_PTR(err);
@@ -3100,6 +3145,10 @@ static void raid1_free(struct mddev *mddev, void *priv)
 	kfree(conf->mirrors);
 	safe_put_page(conf->tmppage);
 	kfree(conf->poolinfo);
+	kfree(conf->nr_pending);
+	kfree(conf->nr_waiting);
+	kfree(conf->nr_queued);
+	kfree(conf->barrier);
 	kfree(conf);
 }
 
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index c52ef424a24b..3442e8fe3fcd 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,6 +1,29 @@
 #ifndef _RAID1_H
 #define _RAID1_H
 
+/*
+ * each barrier unit size is 64MB fow now
+ * note: it must be larger than RESYNC_DEPTH
+ */
+#define BARRIER_UNIT_SECTOR_BITS	17
+#define BARRIER_UNIT_SECTOR_SIZE	(1<<17)
+/*
+ * In struct r1conf, the following members are related to I/O barrier
+ * buckets,
+ *	int	*nr_pending;
+ *	int	*nr_waiting;
+ *	int	*nr_queued;
+ *	int	*barrier;
+ * Each of them points to array of integers, each array is designed to
+ * have BARRIER_BUCKETS_NR elements and occupy a single memory page. The
+ * data width of integer variables is 4, equal to 1<<(ilog2(sizeof(int))),
+ * BARRIER_BUCKETS_NR_BITS is defined as (PAGE_SHIFT - ilog2(sizeof(int)))
+ * to make sure an array of integers with BARRIER_BUCKETS_NR elements just
+ * exactly occupies a single memory page.
+ */
+#define BARRIER_BUCKETS_NR_BITS		(PAGE_SHIFT - ilog2(sizeof(int)))
+#define BARRIER_BUCKETS_NR		(1<<BARRIER_BUCKETS_NR_BITS)
+
 struct raid1_info {
 	struct md_rdev	*rdev;
 	sector_t	head_position;
@@ -35,25 +58,6 @@ struct r1conf {
 						 */
 	int			raid_disks;
 
-	/* During resync, read_balancing is only allowed on the part
-	 * of the array that has been resynced.  'next_resync' tells us
-	 * where that is.
-	 */
-	sector_t		next_resync;
-
-	/* When raid1 starts resync, we divide array into four partitions
-	 * |---------|--------------|---------------------|-------------|
-	 *        next_resync   start_next_window       end_window
-	 * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
-	 * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
-	 * current_window_requests means the count of normalIO between
-	 *   start_next_window and end_window.
-	 * next_window_requests means the count of normalIO after end_window.
-	 * */
-	sector_t		start_next_window;
-	int			current_window_requests;
-	int			next_window_requests;
-
 	spinlock_t		device_lock;
 
 	/* list of 'struct r1bio' that need to be processed by raid1d,
@@ -79,10 +83,10 @@ struct r1conf {
 	 */
 	wait_queue_head_t	wait_barrier;
 	spinlock_t		resync_lock;
-	int			nr_pending;
-	int			nr_waiting;
-	int			nr_queued;
-	int			barrier;
+	int			*nr_pending;
+	int			*nr_waiting;
+	int			*nr_queued;
+	int			*barrier;
 	int			array_frozen;
 
 	/* Set to 1 if a full sync is needed, (fresh device added).
@@ -135,7 +139,6 @@ struct r1bio {
 						 * in this BehindIO request
 						 */
 	sector_t		sector;
-	sector_t		start_next_window;
 	int			sectors;
 	unsigned long		state;
 	struct mddev		*mddev;
@@ -185,4 +188,10 @@ enum r1bio_state {
 	R1BIO_WriteError,
 	R1BIO_FailFast,
 };
+
+static inline int sector_to_idx(sector_t sector)
+{
+	return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
+			 BARRIER_BUCKETS_NR_BITS);
+}
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 824e47daddbfc6ebe1006b8659f080620472a136 Mon Sep 17 00:00:00 2001
From: "colyli@suse.de" <colyli@suse.de>
Date: Sat, 18 Feb 2017 03:05:57 +0800
Subject: RAID1: avoid unnecessary spin locks in I/O barrier code

When I run a parallel reading performan testing on a md raid1 device with
two NVMe SSDs, I observe very bad throughput in supprise: by fio with 64KB
block size, 40 seq read I/O jobs, 128 iodepth, overall throughput is
only 2.7GB/s, this is around 50% of the idea performance number.

The perf reports locking contention happens at allow_barrier() and
wait_barrier() code,
 - 41.41%  fio [kernel.kallsyms]     [k] _raw_spin_lock_irqsave
   - _raw_spin_lock_irqsave
         + 89.92% allow_barrier
         + 9.34% __wake_up
 - 37.30%  fio [kernel.kallsyms]     [k] _raw_spin_lock_irq
   - _raw_spin_lock_irq
         - 100.00% wait_barrier

The reason is, in these I/O barrier related functions,
 - raise_barrier()
 - lower_barrier()
 - wait_barrier()
 - allow_barrier()
They always hold conf->resync_lock firstly, even there are only regular
reading I/Os and no resync I/O at all. This is a huge performance penalty.

The solution is a lockless-like algorithm in I/O barrier code, and only
holding conf->resync_lock when it has to.

The original idea is from Hannes Reinecke, and Neil Brown provides
comments to improve it. I continue to work on it, and make the patch into
current form.

In the new simpler raid1 I/O barrier implementation, there are two
wait barrier functions,
 - wait_barrier()
   Which calls _wait_barrier(), is used for regular write I/O. If there is
   resync I/O happening on the same I/O barrier bucket, or the whole
   array is frozen, task will wait until no barrier on same barrier bucket,
   or the whold array is unfreezed.
 - wait_read_barrier()
   Since regular read I/O won't interfere with resync I/O (read_balance()
   will make sure only uptodate data will be read out), it is unnecessary
   to wait for barrier in regular read I/Os, waiting in only necessary
   when the whole array is frozen.

The operations on conf->nr_pending[idx], conf->nr_waiting[idx], conf->
barrier[idx] are very carefully designed in raise_barrier(),
lower_barrier(), _wait_barrier() and wait_read_barrier(), in order to
avoid unnecessary spin locks in these functions. Once conf->
nr_pengding[idx] is increased, a resync I/O with same barrier bucket index
has to wait in raise_barrier(). Then in _wait_barrier() if no barrier
raised in same barrier bucket index and array is not frozen, the regular
I/O doesn't need to hold conf->resync_lock, it can just increase
conf->nr_pending[idx], and return to its caller. wait_read_barrier() is
very similar to _wait_barrier(), the only difference is it only waits when
array is frozen. For heavy parallel reading I/Os, the lockless I/O barrier
code almostly gets rid of all spin lock cost.

This patch significantly improves raid1 reading peroformance. From my
testing, a raid1 device built by two NVMe SSD, runs fio with 64KB
blocksize, 40 seq read I/O jobs, 128 iodepth, overall throughput
increases from 2.7GB/s to 4.6GB/s (+70%).

Changelog
V4:
- Change conf->nr_queued[] to atomic_t.
- Define BARRIER_BUCKETS_NR_BITS by (PAGE_SHIFT - ilog2(sizeof(atomic_t)))
V3:
- Add smp_mb__after_atomic() as Shaohua and Neil suggested.
- Change conf->nr_queued[] from atomic_t to int.
- Change conf->array_frozen from atomic_t back to int, and use
  READ_ONCE(conf->array_frozen) to check value of conf->array_frozen
  in _wait_barrier() and wait_read_barrier().
- In _wait_barrier() and wait_read_barrier(), add a call to
  wake_up(&conf->wait_barrier) after atomic_dec(&conf->nr_pending[idx]),
  to fix a deadlock between  _wait_barrier()/wait_read_barrier and
  freeze_array().
V2:
- Remove a spin_lock/unlock pair in raid1d().
- Add more code comments to explain why there is no racy when checking two
  atomic_t variables at same time.
V1:
- Original RFC patch for comments.

Signed-off-by: Coly Li <colyli@suse.de>
Cc: Shaohua Li <shli@fb.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Guoqing Jiang <gqjiang@suse.com>
Reviewed-by: Neil Brown <neilb@suse.de>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 165 ++++++++++++++++++++++++++++++++++++-----------------
 drivers/md/raid1.h |  31 +++++-----
 2 files changed, 130 insertions(+), 66 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 40297fd17f7e..fefbbfdb440b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -226,7 +226,7 @@ static void reschedule_retry(struct r1bio *r1_bio)
 	idx = sector_to_idx(r1_bio->sector);
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r1_bio->retry_list, &conf->retry_list);
-	conf->nr_queued[idx]++;
+	atomic_inc(&conf->nr_queued[idx]);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	wake_up(&conf->wait_barrier);
@@ -836,11 +836,21 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
 	spin_lock_irq(&conf->resync_lock);
 
 	/* Wait until no block IO is waiting */
-	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting[idx],
+	wait_event_lock_irq(conf->wait_barrier,
+			    !atomic_read(&conf->nr_waiting[idx]),
 			    conf->resync_lock);
 
 	/* block any new IO from starting */
-	conf->barrier[idx]++;
+	atomic_inc(&conf->barrier[idx]);
+	/*
+	 * In raise_barrier() we firstly increase conf->barrier[idx] then
+	 * check conf->nr_pending[idx]. In _wait_barrier() we firstly
+	 * increase conf->nr_pending[idx] then check conf->barrier[idx].
+	 * A memory barrier here to make sure conf->nr_pending[idx] won't
+	 * be fetched before conf->barrier[idx] is increased. Otherwise
+	 * there will be a race between raise_barrier() and _wait_barrier().
+	 */
+	smp_mb__after_atomic();
 
 	/* For these conditions we must wait:
 	 * A: while the array is in frozen state
@@ -851,42 +861,81 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
 	 */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->array_frozen &&
-			     !conf->nr_pending[idx] &&
-			     conf->barrier[idx] < RESYNC_DEPTH,
+			     !atomic_read(&conf->nr_pending[idx]) &&
+			     atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
 			    conf->resync_lock);
 
-	conf->nr_pending[idx]++;
+	atomic_inc(&conf->nr_pending[idx]);
 	spin_unlock_irq(&conf->resync_lock);
 }
 
 static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
 {
-	unsigned long flags;
 	int idx = sector_to_idx(sector_nr);
 
-	BUG_ON(conf->barrier[idx] <= 0);
+	BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
 
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->barrier[idx]--;
-	conf->nr_pending[idx]--;
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	atomic_dec(&conf->barrier[idx]);
+	atomic_dec(&conf->nr_pending[idx]);
 	wake_up(&conf->wait_barrier);
 }
 
 static void _wait_barrier(struct r1conf *conf, int idx)
 {
-	spin_lock_irq(&conf->resync_lock);
-	if (conf->array_frozen || conf->barrier[idx]) {
-		conf->nr_waiting[idx]++;
-		/* Wait for the barrier to drop. */
-		wait_event_lock_irq(
-			conf->wait_barrier,
-			!conf->array_frozen && !conf->barrier[idx],
-			conf->resync_lock);
-		conf->nr_waiting[idx]--;
-	}
+	/*
+	 * We need to increase conf->nr_pending[idx] very early here,
+	 * then raise_barrier() can be blocked when it waits for
+	 * conf->nr_pending[idx] to be 0. Then we can avoid holding
+	 * conf->resync_lock when there is no barrier raised in same
+	 * barrier unit bucket. Also if the array is frozen, I/O
+	 * should be blocked until array is unfrozen.
+	 */
+	atomic_inc(&conf->nr_pending[idx]);
+	/*
+	 * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
+	 * check conf->barrier[idx]. In raise_barrier() we firstly increase
+	 * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
+	 * barrier is necessary here to make sure conf->barrier[idx] won't be
+	 * fetched before conf->nr_pending[idx] is increased. Otherwise there
+	 * will be a race between _wait_barrier() and raise_barrier().
+	 */
+	smp_mb__after_atomic();
+
+	/*
+	 * Don't worry about checking two atomic_t variables at same time
+	 * here. If during we check conf->barrier[idx], the array is
+	 * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is
+	 * 0, it is safe to return and make the I/O continue. Because the
+	 * array is frozen, all I/O returned here will eventually complete
+	 * or be queued, no race will happen. See code comment in
+	 * frozen_array().
+	 */
+	if (!READ_ONCE(conf->array_frozen) &&
+	    !atomic_read(&conf->barrier[idx]))
+		return;
 
-	conf->nr_pending[idx]++;
+	/*
+	 * After holding conf->resync_lock, conf->nr_pending[idx]
+	 * should be decreased before waiting for barrier to drop.
+	 * Otherwise, we may encounter a race condition because
+	 * raise_barrer() might be waiting for conf->nr_pending[idx]
+	 * to be 0 at same time.
+	 */
+	spin_lock_irq(&conf->resync_lock);
+	atomic_inc(&conf->nr_waiting[idx]);
+	atomic_dec(&conf->nr_pending[idx]);
+	/*
+	 * In case freeze_array() is waiting for
+	 * get_unqueued_pending() == extra
+	 */
+	wake_up(&conf->wait_barrier);
+	/* Wait for the barrier in same barrier unit bucket to drop. */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !conf->array_frozen &&
+			     !atomic_read(&conf->barrier[idx]),
+			    conf->resync_lock);
+	atomic_inc(&conf->nr_pending[idx]);
+	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -894,18 +943,32 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 {
 	int idx = sector_to_idx(sector_nr);
 
-	spin_lock_irq(&conf->resync_lock);
-	if (conf->array_frozen) {
-		conf->nr_waiting[idx]++;
-		/* Wait for array to unfreeze */
-		wait_event_lock_irq(
-			conf->wait_barrier,
-			!conf->array_frozen,
-			conf->resync_lock);
-		conf->nr_waiting[idx]--;
-	}
+	/*
+	 * Very similar to _wait_barrier(). The difference is, for read
+	 * I/O we don't need wait for sync I/O, but if the whole array
+	 * is frozen, the read I/O still has to wait until the array is
+	 * unfrozen. Since there is no ordering requirement with
+	 * conf->barrier[idx] here, memory barrier is unnecessary as well.
+	 */
+	atomic_inc(&conf->nr_pending[idx]);
 
-	conf->nr_pending[idx]++;
+	if (!READ_ONCE(conf->array_frozen))
+		return;
+
+	spin_lock_irq(&conf->resync_lock);
+	atomic_inc(&conf->nr_waiting[idx]);
+	atomic_dec(&conf->nr_pending[idx]);
+	/*
+	 * In case freeze_array() is waiting for
+	 * get_unqueued_pending() == extra
+	 */
+	wake_up(&conf->wait_barrier);
+	/* Wait for array to be unfrozen */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !conf->array_frozen,
+			    conf->resync_lock);
+	atomic_inc(&conf->nr_pending[idx]);
+	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -926,11 +989,7 @@ static void wait_all_barriers(struct r1conf *conf)
 
 static void _allow_barrier(struct r1conf *conf, int idx)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->nr_pending[idx]--;
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	atomic_dec(&conf->nr_pending[idx]);
 	wake_up(&conf->wait_barrier);
 }
 
@@ -955,7 +1014,8 @@ static int get_unqueued_pending(struct r1conf *conf)
 	int idx, ret;
 
 	for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
-		ret += conf->nr_pending[idx] - conf->nr_queued[idx];
+		ret += atomic_read(&conf->nr_pending[idx]) -
+			atomic_read(&conf->nr_queued[idx]);
 
 	return ret;
 }
@@ -1000,8 +1060,8 @@ static void unfreeze_array(struct r1conf *conf)
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
 	conf->array_frozen = 0;
-	wake_up(&conf->wait_barrier);
 	spin_unlock_irq(&conf->resync_lock);
+	wake_up(&conf->wait_barrier);
 }
 
 /* duplicate the data pages for behind I/O
@@ -2391,8 +2451,13 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 		spin_lock_irq(&conf->device_lock);
 		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
 		idx = sector_to_idx(r1_bio->sector);
-		conf->nr_queued[idx]++;
+		atomic_inc(&conf->nr_queued[idx]);
 		spin_unlock_irq(&conf->device_lock);
+		/*
+		 * In case freeze_array() is waiting for condition
+		 * get_unqueued_pending() == extra to be true.
+		 */
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
 		if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2523,9 +2588,7 @@ static void raid1d(struct md_thread *thread)
 						  retry_list);
 			list_del(&r1_bio->retry_list);
 			idx = sector_to_idx(r1_bio->sector);
-			spin_lock_irqsave(&conf->device_lock, flags);
-			conf->nr_queued[idx]--;
-			spin_unlock_irqrestore(&conf->device_lock, flags);
+			atomic_dec(&conf->nr_queued[idx]);
 			if (mddev->degraded)
 				set_bit(R1BIO_Degraded, &r1_bio->state);
 			if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2547,7 +2610,7 @@ static void raid1d(struct md_thread *thread)
 		r1_bio = list_entry(head->prev, struct r1bio, retry_list);
 		list_del(head->prev);
 		idx = sector_to_idx(r1_bio->sector);
-		conf->nr_queued[idx]--;
+		atomic_dec(&conf->nr_queued[idx]);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r1_bio->mddev;
@@ -2664,7 +2727,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	 * If there is non-resync activity waiting for a turn, then let it
 	 * though before starting on this new sync request.
 	 */
-	if (conf->nr_waiting[idx])
+	if (atomic_read(&conf->nr_waiting[idx]))
 		schedule_timeout_uninterruptible(1);
 
 	/* we are incrementing sector_nr below. To be safe, we check against
@@ -2924,22 +2987,22 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		goto abort;
 
 	conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
-				   sizeof(int), GFP_KERNEL);
+				   sizeof(atomic_t), GFP_KERNEL);
 	if (!conf->nr_pending)
 		goto abort;
 
 	conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
-				   sizeof(int), GFP_KERNEL);
+				   sizeof(atomic_t), GFP_KERNEL);
 	if (!conf->nr_waiting)
 		goto abort;
 
 	conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
-				  sizeof(int), GFP_KERNEL);
+				  sizeof(atomic_t), GFP_KERNEL);
 	if (!conf->nr_queued)
 		goto abort;
 
 	conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
-				sizeof(int), GFP_KERNEL);
+				sizeof(atomic_t), GFP_KERNEL);
 	if (!conf->barrier)
 		goto abort;
 
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 3442e8fe3fcd..dd22a37d0d83 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -10,18 +10,19 @@
 /*
  * In struct r1conf, the following members are related to I/O barrier
  * buckets,
- *	int	*nr_pending;
- *	int	*nr_waiting;
- *	int	*nr_queued;
- *	int	*barrier;
- * Each of them points to array of integers, each array is designed to
- * have BARRIER_BUCKETS_NR elements and occupy a single memory page. The
- * data width of integer variables is 4, equal to 1<<(ilog2(sizeof(int))),
- * BARRIER_BUCKETS_NR_BITS is defined as (PAGE_SHIFT - ilog2(sizeof(int)))
- * to make sure an array of integers with BARRIER_BUCKETS_NR elements just
- * exactly occupies a single memory page.
+ *	atomic_t	*nr_pending;
+ *	atomic_t	*nr_waiting;
+ *	atomic_t	*nr_queued;
+ *	atomic_t	*barrier;
+ * Each of them points to array of atomic_t variables, each array is
+ * designed to have BARRIER_BUCKETS_NR elements and occupy a single
+ * memory page. The data width of atomic_t variables is 4 bytes, equal
+ * to 1<<(ilog2(sizeof(atomic_t))), BARRIER_BUCKETS_NR_BITS is defined
+ * as (PAGE_SHIFT - ilog2(sizeof(int))) to make sure an array of
+ * atomic_t variables with BARRIER_BUCKETS_NR elements just exactly
+ * occupies a single memory page.
  */
-#define BARRIER_BUCKETS_NR_BITS		(PAGE_SHIFT - ilog2(sizeof(int)))
+#define BARRIER_BUCKETS_NR_BITS		(PAGE_SHIFT - ilog2(sizeof(atomic_t)))
 #define BARRIER_BUCKETS_NR		(1<<BARRIER_BUCKETS_NR_BITS)
 
 struct raid1_info {
@@ -83,10 +84,10 @@ struct r1conf {
 	 */
 	wait_queue_head_t	wait_barrier;
 	spinlock_t		resync_lock;
-	int			*nr_pending;
-	int			*nr_waiting;
-	int			*nr_queued;
-	int			*barrier;
+	atomic_t		*nr_pending;
+	atomic_t		*nr_waiting;
+	atomic_t		*nr_queued;
+	atomic_t		*barrier;
 	int			array_frozen;
 
 	/* Set to 1 if a full sync is needed, (fresh device added).
-- 
cgit v1.2.3-59-g8ed1b


From af5f42a7e426a87bfa69adc9b9d8930385a1ddf6 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Sun, 19 Feb 2017 22:41:27 -0800
Subject: md/raid1: fix a use-after-free bug

Commit fd76863 (RAID1: a new I/O barrier implementation to remove resync
window) introduces a user-after-free bug.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fefbbfdb440b..2e5e4805cbe1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -203,6 +203,7 @@ static void free_r1bio(struct r1bio *r1_bio)
 static void put_buf(struct r1bio *r1_bio)
 {
 	struct r1conf *conf = r1_bio->mddev->private;
+	sector_t sect = r1_bio->sector;
 	int i;
 
 	for (i = 0; i < conf->raid_disks * 2; i++) {
@@ -213,7 +214,7 @@ static void put_buf(struct r1bio *r1_bio)
 
 	mempool_free(r1_bio, conf->r1buf_pool);
 
-	lower_barrier(conf, r1_bio->sector);
+	lower_barrier(conf, sect);
 }
 
 static void reschedule_retry(struct r1bio *r1_bio)
-- 
cgit v1.2.3-59-g8ed1b


From d939cdfde34f50b95254b375f498447c82190b3e Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 21 Feb 2017 11:57:01 -0800
Subject: md/linear: shutup lockdep warnning

Commit 03a9e24(md linear: fix a race between linear_add() and
linear_congested()) introduces the warnning.

Acked-by: Coly Li <colyli@suse.de>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/linear.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 789008bc94ff..5b06b0ddf7a3 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -224,7 +224,8 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
 	 * oldconf until no one uses it anymore.
 	 */
 	mddev_suspend(mddev);
-	oldconf = rcu_dereference(mddev->private);
+	oldconf = rcu_dereference_protected(mddev->private,
+			lockdep_is_held(&mddev->reconfig_mutex));
 	mddev->raid_disks++;
 	WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
 		"copied raid_disks doesn't match mddev->raid_disks");
-- 
cgit v1.2.3-59-g8ed1b


From aff8da09f2381f0869faaf6637b0d892a3ee99ed Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 21 Feb 2017 10:56:19 -0800
Subject: md/raid1: handle flush request correctly

I got a warning triggered in align_to_barrier_unit_end. It's a flush
request so sectors == 0. The flush request happens to work well without
the new barrier patch, but we'd better handle it explictly.

Cc: NeilBrown <neilb@suse.com>
Acked-by: Coly Li <colyli@suse.de>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2e5e4805cbe1..8901f0c8c775 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1282,8 +1282,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 	unsigned long flags;
 	const int op = bio_op(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
-	const unsigned long do_flush_fua = (bio->bi_opf &
-						(REQ_PREFLUSH | REQ_FUA));
+	const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
 	struct raid1_plug_cb *plug = NULL;
@@ -1509,7 +1508,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 				   conf->mirrors[i].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		bio_set_op_attrs(mbio, op, do_flush_fua | do_sync);
+		bio_set_op_attrs(mbio, op, do_fua | do_sync);
 		if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
 		    !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
 		    conf->raid_disks - mddev->degraded > 1)
@@ -1565,6 +1564,11 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 	struct bio *split;
 	sector_t sectors;
 
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+		md_flush_request(mddev, bio);
+		return;
+	}
+
 	/* if bio exceeds barrier unit boundary, split it */
 	do {
 		sectors = align_to_barrier_unit_end(
-- 
cgit v1.2.3-59-g8ed1b


From 1ec492232ed659acde8cc00b9ecc7529778e03e1 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 21 Feb 2017 14:27:57 -0800
Subject: md/raid1: fix write behind issues introduced by
 bio_clone_bioset_partial

There are two issues, introduced by commit 8e58e32(md/raid1: use
bio_clone_bioset_partial() in case of write behind):
- bio_clone_bioset_partial() uses bytes instead of sectors as parameters
- in writebehind mode, we return bio if all !writemostly disk bios finish,
  which could happen before writemostly disk bios run. So all
  writemostly disk bios should have their bvec. Here we just make sure
  all bios are cloned instead of fast cloned.

Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8901f0c8c775..d4e8796b4569 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1472,8 +1472,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 			    !waitqueue_active(&bitmap->behind_wait)) {
 				mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
 								mddev->bio_set,
-								offset,
-								max_sectors);
+								offset << 9,
+								max_sectors << 9);
 				alloc_behind_pages(mbio, r1_bio);
 			}
 
@@ -1485,8 +1485,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 		}
 
 		if (!mbio) {
-			mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-			bio_trim(mbio, offset, max_sectors);
+			if (r1_bio->behind_bvecs)
+				mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
+								mddev->bio_set,
+								offset << 9,
+								max_sectors << 9);
+			else {
+				mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+				bio_trim(mbio, offset, max_sectors);
+			}
 		}
 
 		if (r1_bio->behind_bvecs) {
-- 
cgit v1.2.3-59-g8ed1b