From bbfa57c0f2243a7c31fd248d22e9861a2802cad5 Mon Sep 17 00:00:00 2001
From: Sebastian Riemer <sebastian.riemer@profitbricks.com>
Date: Thu, 21 Feb 2013 13:28:09 +1100
Subject: md: protect against crash upon fsync on ro array

If an fsync occurs on a read-only array, we need to send a
completion for the IO and may not increment the active IO count.
Otherwise, we hit a bug trace and can't stop the MD array anymore.

By advice of Christoph Hellwig we return success upon a flush
request but we return -EROFS for other writes.
We detect flush requests by checking if the bio has zero sectors.

This patch is suitable to any -stable kernel to which it applies.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: NeilBrown <neilb@suse.de>
Cc: stable@vger.kernel.org
Signed-off-by: Sebastian Riemer <sebastian.riemer@profitbricks.com>
Reported-by: Ben Hutchings <ben@decadent.org.uk>
Acked-by: Paul Menzel <paulepanter@users.sourceforge.net>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3db3d1b271f7..1e634a68541e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
 		bio_io_error(bio);
 		return;
 	}
+	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
+		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
+		return;
+	}
 	smp_rmb(); /* Ensure implications of  'active' are visible */
 	rcu_read_lock();
 	if (mddev->suspended) {
-- 
cgit v1.2.3-59-g8ed1b


From c8dc9c654794a765ca61baed07f84ed8aaa7ca8c Mon Sep 17 00:00:00 2001
From: Joe Lawrence <Joe.Lawrence@stratus.com>
Date: Thu, 21 Feb 2013 13:28:09 +1100
Subject: md: raid1,10: Handle REQ_WRITE_SAME flag in write bios

Set mddev queue's max_write_same_sectors to its chunk_sector value (before
disk_stack_limits merges the underlying disk limits.)  With that in place,
be sure to handle writes coming down from the block layer that have the
REQ_WRITE_SAME flag set.  That flag needs to be copied into any newly cloned
write bio.

Signed-off-by: Joe Lawrence <joe.lawrence@stratus.com>
Acked-by: "Martin K. Petersen" <martin.petersen@oracle.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c  | 7 ++++++-
 drivers/md/raid10.c | 9 +++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d5bddfc4010e..6e5d5a5f9cb4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1000,6 +1000,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	const unsigned long do_discard = (bio->bi_rw
 					  & (REQ_DISCARD | REQ_SECURE));
+	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
 	struct raid1_plug_cb *plug = NULL;
@@ -1301,7 +1302,8 @@ read_again:
 				   conf->mirrors[i].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
+		mbio->bi_rw =
+			WRITE | do_flush_fua | do_sync | do_discard | do_same;
 		mbio->bi_private = r1_bio;
 
 		atomic_inc(&r1_bio->remaining);
@@ -2818,6 +2820,9 @@ static int run(struct mddev *mddev)
 	if (IS_ERR(conf))
 		return PTR_ERR(conf);
 
+	if (mddev->queue)
+		blk_queue_max_write_same_sectors(mddev->queue,
+						 mddev->chunk_sectors);
 	rdev_for_each(rdev, mddev) {
 		if (!mddev->gendisk)
 			continue;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03b..1a74c12f0a6e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1105,6 +1105,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	const unsigned long do_discard = (bio->bi_rw
 					  & (REQ_DISCARD | REQ_SECURE));
+	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
@@ -1460,7 +1461,8 @@ retry_write:
 							      rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1504,8 @@ retry_write:
 						   r10_bio, rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
@@ -3569,6 +3572,8 @@ static int run(struct mddev *mddev)
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
+		blk_queue_max_write_same_sectors(mddev->queue,
+						 mddev->chunk_sectors);
 		blk_queue_io_min(mddev->queue, chunk_size);
 		if (conf->geo.raid_disks % conf->geo.near_copies)
 			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
-- 
cgit v1.2.3-59-g8ed1b


From 4c0ca26bd260dddf3b9781758cb5e2df3f74d4a3 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Thu, 21 Feb 2013 13:28:09 +1100
Subject: MD RAID10: Minor non-functional code changes

Changes include assigning 'addr' from 's' instead of 'sector' to be
consistent with the way the code does it just a few lines later and
using '%=' vs a conditional and subtraction.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1a74c12f0a6e..de174ad6f8bd 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -552,14 +552,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	for (n = 0; n < geo->near_copies; n++) {
 		int d = dev;
 		sector_t s = sector;
-		r10bio->devs[slot].addr = sector;
 		r10bio->devs[slot].devnum = d;
+		r10bio->devs[slot].addr = s;
 		slot++;
 
 		for (f = 1; f < geo->far_copies; f++) {
 			d += geo->near_copies;
-			if (d >= geo->raid_disks)
-				d -= geo->raid_disks;
+			d %= geo->raid_disks;
 			s += geo->stride;
 			r10bio->devs[slot].devnum = d;
 			r10bio->devs[slot].addr = s;
-- 
cgit v1.2.3-59-g8ed1b


From 475901aff15841fb0a81e7546517407779a9b061 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Thu, 21 Feb 2013 13:28:10 +1100
Subject: MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part
 1)

The MD RAID10 'far' and 'offset' algorithms make copies of entire stripe
widths - copying them to a different location on the same devices after
shifting the stripe.  An example layout of each follows below:

	        "far" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 G    H    I    J    K    L
	            ...
	 F    A    B    C    D    E  --> Copy of stripe0, but shifted by 1
	 L    G    H    I    J    K
	            ...

		"offset" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 F    A    B    C    D    E  --> Copy of stripe0, but shifted by 1
	 G    H    I    J    K    L
	 L    G    H    I    J    K
	            ...

Redundancy for these algorithms is gained by shifting the copied stripes
one device to the right.  This patch proposes that the array be divided into
sets of adjacent devices and when the stripe copies are shifted, they wrap
on set boundaries rather than the array size boundary.  That is, for the
purposes of shifting, the copies are confined to their sets within the
array.  The sets are 'near_copies * far_copies' in size.

The above "far" algorithm example would change to:
	        "far" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 G    H    I    J    K    L
	            ...
	 B    A    D    C    F    E  --> Copy of stripe0, shifted 1, 2-dev sets
	 H    G    J    I    L    K      Dev sets are 1-2, 3-4, 5-6
	            ...

This has the effect of improving the redundancy of the array.  We can
always sustain at least one failure, but sometimes more than one can
be handled.  In the first examples, the pairs of devices that CANNOT fail
together are:
	(1,2) (2,3) (3,4) (4,5) (5,6) (1, 6) [40% of possible pairs]
In the example where the copies are confined to sets, the pairs of
devices that cannot fail together are:
	(1,2) (3,4) (5,6)                    [20% of possible pairs]

We cannot simply replace the old algorithms, so the 17th bit of the 'layout'
variable is used to indicate whether we use the old or new method of computing
the shift.  (This is similar to the way the 16th bit indicates whether the
"far" algorithm or the "offset" algorithm is being used.)

This patch only handles the cases where the number of total raid disks is
a multiple of 'far_copies'.  A follow-on patch addresses the condition where
this is not true.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 58 ++++++++++++++++++++++++++++++++++++-----------------
 drivers/md/raid10.h |  5 +++++
 2 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index de174ad6f8bd..70b58b4bcf89 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
  *    near_copies (stored in low byte of layout)
  *    far_copies (stored in second byte of layout)
  *    far_offset (stored in bit 16 of layout )
+ *    use_far_sets (stored in bit 17 of layout )
  *
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize.  Each device
+ * is divided into far_copies sections.   In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk is stored
+ * (each on a different drive).  The starting device for each section is offset
+ * near_copies from the starting device of the previous section.  Thus there
+ * are (near_copies * far_copies) of each chunk, and each is on a different
+ * drive.  near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
  *
  * If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, there are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true.  In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size.  The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array.  This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ *    A B C D    A B C D E
+ *      ...         ...
+ *    D A B C    E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ *    [A B] [C D]    [A B] [C D E]
+ *    |...| |...|    |...| | ... |
+ *    [B A] [D C]    [B A] [E C D]
  */
 
 /*
@@ -551,14 +566,18 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	/* and calculate all the others */
 	for (n = 0; n < geo->near_copies; n++) {
 		int d = dev;
+		int set;
 		sector_t s = sector;
 		r10bio->devs[slot].devnum = d;
 		r10bio->devs[slot].addr = s;
 		slot++;
 
 		for (f = 1; f < geo->far_copies; f++) {
+			set = d / geo->far_set_size;
 			d += geo->near_copies;
-			d %= geo->raid_disks;
+			d %= geo->far_set_size;
+			d += geo->far_set_size * set;
+
 			s += geo->stride;
 			r10bio->devs[slot].devnum = d;
 			r10bio->devs[slot].addr = s;
@@ -594,6 +613,8 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 	 * or recovery, so reshape isn't happening
 	 */
 	struct geom *geo = &conf->geo;
+	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+	int far_set_size = geo->far_set_size;
 
 	offset = sector & geo->chunk_mask;
 	if (geo->far_offset) {
@@ -601,13 +622,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 		chunk = sector >> geo->chunk_shift;
 		fc = sector_div(chunk, geo->far_copies);
 		dev -= fc * geo->near_copies;
-		if (dev < 0)
-			dev += geo->raid_disks;
+		if (dev < far_set_start)
+			dev += far_set_size;
 	} else {
 		while (sector >= geo->stride) {
 			sector -= geo->stride;
-			if (dev < geo->near_copies)
-				dev += geo->raid_disks - geo->near_copies;
+			if (dev < (geo->near_copies + far_set_start))
+				dev += far_set_size - geo->near_copies;
 			else
 				dev -= geo->near_copies;
 		}
@@ -3438,7 +3459,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 		disks = mddev->raid_disks + mddev->delta_disks;
 		break;
 	}
-	if (layout >> 17)
+	if (layout >> 18)
 		return -1;
 	if (chunk < (PAGE_SIZE >> 9) ||
 	    !is_power_of_2(chunk))
@@ -3450,6 +3471,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 	geo->near_copies = nc;
 	geo->far_copies = fc;
 	geo->far_offset = fo;
+	geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
 	geo->chunk_mask = chunk - 1;
 	geo->chunk_shift = ffz(~chunk);
 	return nc*fc;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf602345..157d69e83ff4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,6 +33,11 @@ struct r10conf {
 					       * far_offset, in which case it is
 					       * 1 stripe.
 					       */
+		int             far_set_size; /* The number of devices in a set,
+					       * where a 'set' are devices that
+					       * contain far/offset copies of
+					       * each other.
+					       */
 		int		chunk_shift; /* shift from chunks to sectors */
 		sector_t	chunk_mask;
 	} prev, geo;
-- 
cgit v1.2.3-59-g8ed1b


From 9a3152ab024867100f2f50d124b998d05fb1c3f6 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Thu, 21 Feb 2013 13:28:10 +1100
Subject: MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part
 2)

MD RAID10:  Improve redundancy for 'far' and 'offset' algorithms (part 2)

This patch addresses raid arrays that have a number of devices that cannot
be evenly divided by 'far_copies'.  (E.g. 5 devices, far_copies = 2)  This
case must be handled differently because it causes the last set to be of
a different size than the rest of the sets.  We must compute a new modulo
for this last set so that copied chunks are properly wrapped around.

Example use_far_sets=1, far_copies=2, near_copies=1, devices=5:
                "far" algorithm
        dev1 dev2 dev3 dev4 dev5
	==== ==== ==== ==== ====
	[ A   B ] [ C    D   E ]
        [ G   H ] [ I    J   K ]
                    ...
        [ B   A ] [ E    C   D ] --> nominal set of 2 and last set of 3
        [ H   G ] [ K    I   J ]     []'s show far/offset sets

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 70b58b4bcf89..61ed150bd0cf 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -550,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	sector_t stripe;
 	int dev;
 	int slot = 0;
+	int last_far_set_start, last_far_set_size;
+
+	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+	last_far_set_start *= geo->far_set_size;
+
+	last_far_set_size = geo->far_set_size;
+	last_far_set_size += (geo->raid_disks % geo->far_set_size);
 
 	/* now calculate first sector/dev */
 	chunk = r10bio->sector >> geo->chunk_shift;
@@ -575,9 +582,16 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 		for (f = 1; f < geo->far_copies; f++) {
 			set = d / geo->far_set_size;
 			d += geo->near_copies;
-			d %= geo->far_set_size;
-			d += geo->far_set_size * set;
 
+			if ((geo->raid_disks % geo->far_set_size) &&
+			    (d > last_far_set_start)) {
+				d -= last_far_set_start;
+				d %= last_far_set_size;
+				d += last_far_set_start;
+			} else {
+				d %= geo->far_set_size;
+				d += geo->far_set_size * set;
+			}
 			s += geo->stride;
 			r10bio->devs[slot].devnum = d;
 			r10bio->devs[slot].addr = s;
@@ -615,6 +629,18 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 	struct geom *geo = &conf->geo;
 	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
 	int far_set_size = geo->far_set_size;
+	int last_far_set_start;
+
+	if (geo->raid_disks % geo->far_set_size) {
+		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+		last_far_set_start *= geo->far_set_size;
+
+		if (dev >= last_far_set_start) {
+			far_set_size = geo->far_set_size;
+			far_set_size += (geo->raid_disks % geo->far_set_size);
+			far_set_start = last_far_set_start;
+		}
+	}
 
 	offset = sector & geo->chunk_mask;
 	if (geo->far_offset) {
-- 
cgit v1.2.3-59-g8ed1b


From fe5d2f4a15967bbe907e7b3e31e49dae7af7cc6b Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Thu, 21 Feb 2013 13:28:10 +1100
Subject: DM RAID: Add support for MD's RAID10 "far" and "offset" algorithms

DM RAID:  Add support for MD's RAID10 "far" and "offset" algorithms

Until now, dm-raid.c only supported the "near" algorithm of MD's RAID10
implementation.  This patch adds support for the "far" and "offset"
algorithms, but only with the improved redundancy that is brought with
the introduction of the 'use_far_sets' bit, which shifts copied stripes
according to smaller sets vs the entire array.  That is, the 17th bit
of the 'layout' variable that defines the RAID10 implementation will
always be set.   (More information on how the 'layout' variable selects
the RAID10 algorithm can be found in the opening comments of
drivers/md/raid10.c.)

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 Documentation/device-mapper/dm-raid.txt |  44 ++++++++++--
 drivers/md/dm-raid.c                    | 123 ++++++++++++++++++++++++++------
 2 files changed, 140 insertions(+), 27 deletions(-)

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 56fb62b09fc5..b428556197c9 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -30,6 +30,7 @@ The target is named "raid" and it accepts the following parameters:
   raid10        Various RAID10 inspired algorithms chosen by additional params
 		- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
 		- RAID1E: Integrated Adjacent Stripe Mirroring
+		- RAID1E: Integrated Offset Stripe Mirroring
 		-  and other similar RAID10 variants
 
   Reference: Chapter 4 of
@@ -64,15 +65,15 @@ The target is named "raid" and it accepts the following parameters:
 		synchronisation state for each region.
 
         [raid10_copies   <# copies>]
-        [raid10_format   near]
+        [raid10_format   <near|far|offset>]
 		These two options are used to alter the default layout of
 		a RAID10 configuration.  The number of copies is can be
-		specified, but the default is 2.  There are other variations
-		to how the copies are laid down - the default and only current
-		option is "near".  Near copies are what most people think of
-		with respect to mirroring.  If these options are left
-		unspecified, or 'raid10_copies 2' and/or 'raid10_format near'
-		are given, then the layouts for 2, 3 and 4 devices are:
+		specified, but the default is 2.  There are also three
+		variations to how the copies are laid down - the default
+		is "near".  Near copies are what most people think of with
+		respect to mirroring.  If these options are left unspecified,
+		or 'raid10_copies 2' and/or 'raid10_format near' are given,
+		then the layouts for 2, 3 and 4 devices	are:
 		2 drives         3 drives          4 drives
 		--------         ----------        --------------
 		A1  A1           A1  A1  A2        A1  A1  A2  A2
@@ -85,6 +86,33 @@ The target is named "raid" and it accepts the following parameters:
 		3-device layout is what might be called a 'RAID1E - Integrated
 		Adjacent Stripe Mirroring'.
 
+		If 'raid10_copies 2' and 'raid10_format far', then the layouts
+		for 2, 3 and 4 devices are:
+		2 drives             3 drives             4 drives
+		--------             --------------       --------------------
+		A1  A2               A1   A2   A3         A1   A2   A3   A4
+		A3  A4               A4   A5   A6         A5   A6   A7   A8
+		A5  A6               A7   A8   A9         A9   A10  A11  A12
+		..  ..               ..   ..   ..         ..   ..   ..   ..
+		A2  A1               A3   A1   A2         A2   A1   A4   A3
+		A4  A3               A6   A4   A5         A6   A5   A8   A7
+		A6  A5               A9   A7   A8         A10  A9   A12  A11
+		..  ..               ..   ..   ..         ..   ..   ..   ..
+
+		If 'raid10_copies 2' and 'raid10_format offset', then the
+		layouts for 2, 3 and 4 devices are:
+		2 drives       3 drives           4 drives
+		--------       ------------       -----------------
+		A1  A2         A1  A2  A3         A1  A2  A3  A4
+		A2  A1         A3  A1  A2         A2  A1  A4  A3
+		A3  A4         A4  A5  A6         A5  A6  A7  A8
+		A4  A3         A6  A4  A5         A6  A5  A8  A7
+		A5  A6         A7  A8  A9         A9  A10 A11 A12
+		A6  A5         A9  A7  A8         A10 A9  A12 A11
+		..  ..         ..  ..  ..         ..  ..  ..  ..
+		Here we see layouts closely akin to 'RAID1E - Integrated
+		Offset Stripe Mirroring'.
+
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
 	containing the metadata (if any); the second is the one containing the
@@ -142,3 +170,5 @@ Version History
 1.3.0	Added support for RAID 10
 1.3.1	Allow device replacement/rebuild for RAID 10
 1.3.2   Fix/improve redundancy checking for RAID10
+1.4.0	Non-functional change.  Removes arg from mapping function.
+1.4.1   Add RAID10 "far" and "offset" algorithm support.
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9e58dbd8d8cb..22fd55993723 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -91,15 +91,44 @@ static struct raid_type {
 	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
 };
 
+static char *raid10_md_layout_to_format(int layout)
+{
+	/*
+	 * Bit 16 and 17 stand for "offset" and "use_far_sets"
+	 * Refer to MD's raid10.c for details
+	 */
+	if ((layout & 0x10000) && (layout & 0x20000))
+		return "offset";
+
+	if ((layout & 0xFF) > 1)
+		return "near";
+
+	return "far";
+}
+
 static unsigned raid10_md_layout_to_copies(int layout)
 {
-	return layout & 0xFF;
+	if ((layout & 0xFF) > 1)
+		return layout & 0xFF;
+	return (layout >> 8) & 0xFF;
 }
 
 static int raid10_format_to_md_layout(char *format, unsigned copies)
 {
-	/* 1 "far" copy, and 'copies' "near" copies */
-	return (1 << 8) | (copies & 0xFF);
+	unsigned n = 1, f = 1;
+
+	if (!strcmp("near", format))
+		n = copies;
+	else
+		f = copies;
+
+	if (!strcmp("offset", format))
+		return 0x30000 | (f << 8) | n;
+
+	if (!strcmp("far", format))
+		return 0x20000 | (f << 8) | n;
+
+	return (f << 8) | n;
 }
 
 static struct raid_type *get_raid_type(char *name)
@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 {
 	unsigned i, rebuild_cnt = 0;
 	unsigned rebuilds_per_group, copies, d;
+	unsigned group_size, last_group_start;
 
 	for (i = 0; i < rs->md.raid_disks; i++)
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 * as long as the failed devices occur in different mirror
 		 * groups (i.e. different stripes).
 		 *
-		 * Right now, we only allow for "near" copies.  When other
-		 * formats are added, we will have to check those too.
-		 *
 		 * When checking "near" format, make sure no adjacent devices
 		 * have failed beyond what can be handled.  In addition to the
 		 * simple case where the number of devices is a multiple of the
@@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 *          A    A    B    B    C
 		 *          C    D    D    E    E
 		 */
-		for (i = 0; i < rs->md.raid_disks * copies; i++) {
-			if (!(i % copies))
+		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
+			for (i = 0; i < rs->md.raid_disks * copies; i++) {
+				if (!(i % copies))
+					rebuilds_per_group = 0;
+				d = i % rs->md.raid_disks;
+				if ((!rs->dev[d].rdev.sb_page ||
+				     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+				    (++rebuilds_per_group >= copies))
+					goto too_many;
+			}
+			break;
+		}
+
+		/*
+		 * When checking "far" and "offset" formats, we need to ensure
+		 * that the device that holds its copy is not also dead or
+		 * being rebuilt.  (Note that "far" and "offset" formats only
+		 * support two copies right now.  These formats also only ever
+		 * use the 'use_far_sets' variant.)
+		 *
+		 * This check is somewhat complicated by the need to account
+		 * for arrays that are not a multiple of (far) copies.  This
+		 * results in the need to treat the last (potentially larger)
+		 * set differently.
+		 */
+		group_size = (rs->md.raid_disks / copies);
+		last_group_start = (rs->md.raid_disks / group_size) - 1;
+		last_group_start *= group_size;
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			if (!(i % copies) && !(i > last_group_start))
 				rebuilds_per_group = 0;
-			d = i % rs->md.raid_disks;
-			if ((!rs->dev[d].rdev.sb_page ||
-			     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+			if ((!rs->dev[i].rdev.sb_page ||
+			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
 			    (++rebuilds_per_group >= copies))
-				goto too_many;
+					goto too_many;
 		}
 		break;
 	default:
@@ -433,7 +487,7 @@ too_many:
  *
  * RAID10-only options:
  *    [raid10_copies <# copies>]        Number of copies.  (Default: 2)
- *    [raid10_format <near>]            Layout algorithm.  (Default: near)
+ *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
 				return -EINVAL;
 			}
-			if (strcmp("near", argv[i])) {
+			if (strcmp("near", argv[i]) &&
+			    strcmp("far", argv[i]) &&
+			    strcmp("offset", argv[i])) {
 				rs->ti->error = "Invalid 'raid10_format' value given";
 				return -EINVAL;
 			}
@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			return -EINVAL;
 		}
 
+		/*
+		 * If the format is not "near", we only support
+		 * two copies at the moment.
+		 */
+		if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
+			rs->ti->error = "Too many copies for given RAID10 format.";
+			return -EINVAL;
+		}
+
 		/* (Len * #mirrors) / #devices */
 		sectors_per_dev = rs->ti->len * raid10_copies;
 		sector_div(sectors_per_dev, rs->md.raid_disks);
@@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 	/*
 	 * Reshaping is not currently allowed
 	 */
-	if ((le32_to_cpu(sb->level) != mddev->level) ||
-	    (le32_to_cpu(sb->layout) != mddev->layout) ||
-	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
-		DMERR("Reshaping arrays not yet supported.");
+	if (le32_to_cpu(sb->level) != mddev->level) {
+		DMERR("Reshaping arrays not yet supported. (RAID level change)");
+		return -EINVAL;
+	}
+	if (le32_to_cpu(sb->layout) != mddev->layout) {
+		DMERR("Reshaping arrays not yet supported. (RAID layout change)");
+		DMERR("  0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
+		DMERR("  Old layout: %s w/ %d copies",
+		      raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
+		      raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
+		DMERR("  New layout: %s w/ %d copies",
+		      raid10_md_layout_to_format(mddev->layout),
+		      raid10_md_layout_to_copies(mddev->layout));
+		return -EINVAL;
+	}
+	if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
+		DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
 		return -EINVAL;
 	}
 
 	/* We can only change the number of devices in RAID1 right now */
 	if ((rs->raid_type->level != 1) &&
 	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
-		DMERR("Reshaping arrays not yet supported.");
+		DMERR("Reshaping arrays not yet supported. (device count change)");
 		return -EINVAL;
 	}
 
@@ -1329,7 +1407,8 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 			       raid10_md_layout_to_copies(rs->md.layout));
 
 		if (rs->print_flags & DMPF_RAID10_FORMAT)
-			DMEMIT(" raid10_format near");
+			DMEMIT(" raid10_format %s",
+			       raid10_md_layout_to_format(rs->md.layout));
 
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
@@ -1420,6 +1499,10 @@ static struct target_type raid_target = {
 
 static int __init dm_raid_init(void)
 {
+	DMINFO("Loading target version %u.%u.%u",
+	       raid_target.version[0],
+	       raid_target.version[1],
+	       raid_target.version[2]);
 	return dm_register_target(&raid_target);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From a64685399181780998281fe07309a94b25dd24c3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 Feb 2013 14:33:17 +1100
Subject: md: fix two bugs when attempting to resize RAID0 array.

You cannot resize a RAID0 array (in terms of making the devices
bigger), but the code doesn't entirely stop you.
So:

 disable setting of the available size on each device for
 RAID0 and Linear devices.  This must not change as doing so
 can change the effective layout of data.

 Make sure that the size that raid0_size() reports is accurate,
 by rounding device sizes to chunk sizes.  As the device sizes
 cannot change now, this isn't so important, but it is best to be
 safe.

Without this change:
  mdadm --grow /dev/md0 -z max
  mdadm --grow /dev/md0 -Z max
  then read to the end of the array

can cause a BUG in a RAID0 array.

These bugs have been present ever since it became possible
to resize any device, which is a long time.  So the fix is
suitable for any -stable kernel.

Cc: stable@vger.kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    | 3 +++
 drivers/md/raid0.c | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1e634a68541e..f363135144f6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2998,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 		} else if (!sectors)
 			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
 				rdev->data_offset;
+		if (!my_mddev->pers->resize)
+			/* Cannot change size for RAID0 or Linear etc */
+			return -EINVAL;
 	}
 	if (sectors < my_mddev->dev_sectors)
 		return -EINVAL; /* component must fit device */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 24b359717a7e..15c8d3505450 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -411,7 +411,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
 		  "%s does not support generic reshape\n", __func__);
 
 	rdev_for_each(rdev, mddev)
-		array_sectors += rdev->sectors;
+		array_sectors += (rdev->sectors &
+				  ~(sector_t)(mddev->chunk_sectors-1));
 
 	return array_sectors;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 58ebb34c49fcfcaa029e4b1c1453d92583900f9a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 Feb 2013 15:36:38 +1100
Subject: md: raid0: fix error return from create_stripe_zones.

create_strip_zones returns an error slightly differently to
raid0_run and to raid0_takeover_*.

The error value used by the second was wrong, and an error would
result in mddev->private being set to NULL and sooner or later a
crash.

So never return NULL from create_strip_zones; return ERR_PTR(err)
instead.

This bug has been present since 2.6.35 so the fix is suitable
for any kernel since then.

Cc: stable@vger.kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 15c8d3505450..d9babda582b9 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -289,7 +289,7 @@ abort:
 	kfree(conf->strip_zone);
 	kfree(conf->devlist);
 	kfree(conf);
-	*private_conf = NULL;
+	*private_conf = ERR_PTR(err);
 	return err;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From f96c9f305c24a0d4a075e2c75aa6b417aa238687 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 Feb 2013 15:50:07 +1100
Subject: md/raid0: improve error message when converting RAID4-with-spares to
 RAID0

Mentioning "bad disk number -1" exposes irrelevant internal detail.
Just say they are inactive and must be removed.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d9babda582b9..0505452de8d6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 			rdev1->new_raid_disk = j;
 		}
 
-		if (j < 0 || j >= mddev->raid_disks) {
+		if (j < 0) {
+			printk(KERN_ERR
+			       "md/raid0:%s: remove inactive devices before converting to RAID0\n",
+			       mdname(mddev));
+			goto abort;
+		}
+		if (j >= mddev->raid_disks) {
 			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
 			       "aborting!\n", mdname(mddev), j);
 			goto abort;
-- 
cgit v1.2.3-59-g8ed1b


From ee0b0244030434cdda26777bfb98962447e080cd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 25 Feb 2013 12:38:29 +1100
Subject: md/raid1,raid10: fix deadlock with freeze_array()

When raid1/raid10 needs to fix a read error, it first drains
all pending requests by calling freeze_array().
This calls flush_pending_writes() if it needs to sleep,
but some writes may be pending in a per-process plug rather
than in the per-array request queue.

When raid1{,0}_unplug() moves the request from the per-process
plug to the per-array request queue (from which
flush_pending_writes() can flush them), it needs to wake up
freeze_array(), or freeze_array() will never flush them and so
it will block forever.

So add the required wake_up() calls.

This bug was introduced by commit
   f54a9d0e59c4bea3db733921ca9147612a6f292c
for raid1 and a similar commit for RAID10, and so has been present
since linux-3.6.  As the bug causes a deadlock I believe this fix is
suitable for -stable.

Cc: stable@vger.kernel.org (3.6.y 3.7.y 3.8.y)
Reported-by: Tregaron Bayly <tbayly@bluehost.com>
Tested-by: Tregaron Bayly <tbayly@bluehost.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c  | 1 +
 drivers/md/raid10.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6e5d5a5f9cb4..fd86b372692d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 61ed150bd0cf..77b562d18a90 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1119,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
-- 
cgit v1.2.3-59-g8ed1b


From 51acbcec6c42b24482bac18e42befc822524535d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Feb 2013 09:08:34 +1100
Subject: md: remove CONFIG_MULTICORE_RAID456

This doesn't seem to actually help and we have an alternate
multi-threading approach waiting in the wings, so just get
rid of this config option and associated code.

As a bonus, we remove one use of CONFIG_EXPERIMENTAL

Cc: Dan Williams <djbw@fb.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/Kconfig | 11 -----------
 drivers/md/raid5.c | 38 +-------------------------------------
 2 files changed, 1 insertion(+), 48 deletions(-)

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 91a02eeeb319..9a10313d0670 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -154,17 +154,6 @@ config MD_RAID456
 
 	  If unsure, say Y.
 
-config MULTICORE_RAID456
-	bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
-	depends on MD_RAID456
-	depends on SMP
-	depends on EXPERIMENTAL
-	---help---
-	  Enable the raid456 module to dispatch per-stripe raid operations to a
-	  thread pool.
-
-	  If unsure, say N.
-
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 19d77a026639..35031c8b2d02 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1406,7 +1406,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
 
-static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
@@ -1471,36 +1471,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	put_cpu();
 }
 
-#ifdef CONFIG_MULTICORE_RAID456
-static void async_run_ops(void *param, async_cookie_t cookie)
-{
-	struct stripe_head *sh = param;
-	unsigned long ops_request = sh->ops.request;
-
-	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
-	wake_up(&sh->ops.wait_for_ops);
-
-	__raid_run_ops(sh, ops_request);
-	release_stripe(sh);
-}
-
-static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
-{
-	/* since handle_stripe can be called outside of raid5d context
-	 * we need to ensure sh->ops.request is de-staged before another
-	 * request arrives
-	 */
-	wait_event(sh->ops.wait_for_ops,
-		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
-	sh->ops.request = ops_request;
-
-	atomic_inc(&sh->count);
-	async_schedule(async_run_ops, sh);
-}
-#else
-#define raid_run_ops __raid_run_ops
-#endif
-
 static int grow_one_stripe(struct r5conf *conf)
 {
 	struct stripe_head *sh;
@@ -1509,9 +1479,6 @@ static int grow_one_stripe(struct r5conf *conf)
 		return 0;
 
 	sh->raid_conf = conf;
-	#ifdef CONFIG_MULTICORE_RAID456
-	init_waitqueue_head(&sh->ops.wait_for_ops);
-	#endif
 
 	spin_lock_init(&sh->stripe_lock);
 
@@ -1630,9 +1597,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			break;
 
 		nsh->raid_conf = conf;
-		#ifdef CONFIG_MULTICORE_RAID456
-		init_waitqueue_head(&nsh->ops.wait_for_ops);
-		#endif
 		spin_lock_init(&nsh->stripe_lock);
 
 		list_add(&nsh->lru, &newstripes);
-- 
cgit v1.2.3-59-g8ed1b


From f3378b48705154b9089affb2d2e939622aea68f1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Feb 2013 11:59:03 +1100
Subject: md: expedite metadata update when switching  read-auto -> active

If something has failed while the array was read-auto,
then when we switch to 'active' we need to update the metadata.
This will happen anyway but it is good to expedite it, and
also to ensure any failed device has been released by the
underlying device before we try to action the ioctl which
caused us to switch to 'active' mode.

Reported-by: Joe Lawrence <Joe.Lawrence@stratus.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f363135144f6..fcb878f88796 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6532,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			mddev->ro = 0;
 			sysfs_notify_dirent_safe(mddev->sysfs_state);
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			md_wakeup_thread(mddev->thread);
+			/* mddev_unlock will wake thread */
+			/* If a device failed while we were read-only, we
+			 * need to make sure the metadata is updated now.
+			 */
+			if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+				mddev_unlock(mddev);
+				wait_event(mddev->sb_wait,
+					   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
+					   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+				mddev_lock(mddev);
+			}
 		} else {
 			err = -EROFS;
 			goto abort_unlock;
-- 
cgit v1.2.3-59-g8ed1b