From 934d9c23b4c7e31840a895ba4b7e88d6413c81f3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 Oct 2008 17:01:23 +1100 Subject: md: destroy partitions and notify udev when md array is stopped. md arrays are not currently destroyed when they are stopped - they remain in /sys/block. Last time I tried this I tripped over locking too much. A consequence of this is that udev doesn't remove anything from /dev. This is rather ugly. As an interim measure until proper device removal can be achieved, make sure all partitions are removed using the BLKRRPART ioctl, and send a KOBJ_CHANGE when an md array is stopped. Signed-off-by: NeilBrown --- drivers/md/md.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index b4162f6f1b79..9abf6ed16535 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3884,6 +3884,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mode == 0) { mdk_rdev_t *rdev; struct list_head *tmp; + struct block_device *bdev; printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); @@ -3940,6 +3941,12 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->degraded = 0; mddev->barriers_work = 0; mddev->safemode = 0; + bdev = bdget_disk(mddev->gendisk, 0); + if (bdev) { + blkdev_ioctl(bdev, 0, BLKRRPART, 0); + bdput(bdev); + } + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", -- cgit v1.2.3-59-g8ed1b From b34578a48459ed1bd5396631aaa4a65d6bcc7726 Mon Sep 17 00:00:00 2001 From: Ilpo Jarvinen Date: Thu, 30 Oct 2008 13:33:07 +0000 Subject: dm raid1: fix do_failures Missing braces. Commit 1f965b1943 (dm raid1: separate region_hash interface part1) broke it. Signed-off-by: Ilpo Jarvinen Signed-off-by: Alasdair G Kergon Cc: Heinz Mauelshagen --- drivers/md/dm-raid1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 92dcc06832a4..9d7b53ed75b2 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -656,9 +656,10 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) return; if (!ms->log_failure) { - while ((bio = bio_list_pop(failures))) + while ((bio = bio_list_pop(failures))) { ms->in_sync = 0; dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); + } return; } -- cgit v1.2.3-59-g8ed1b From 60c856c8e2f57a3f69c505735ef66e3719ea0bd6 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 30 Oct 2008 13:33:12 +0000 Subject: dm snapshot: fix register_snapshot deadlock register_snapshot() performs a GFP_KERNEL allocation while holding _origins_lock for write, but that could write out dirty pages onto a device that attempts to acquire _origins_lock for read, resulting in deadlock. So move the allocation up before taking the lock. This path is not performance-critical, so it doesn't matter that we allocate memory and free it if we find that we won't need it. 
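The sign-off and diff follow below. The fix is a standard instance of the allocate-before-lock idiom: do the blocking GFP_KERNEL allocation first, then take the lock, and throw the allocation away if the lookup shows it was not needed. A minimal sketch of the idiom, with hypothetical names (my_origin, my_lookup and my_insert stand in for the dm-snap.c equivalents; this is not the actual driver code):

#include <linux/slab.h>
#include <linux/rwsem.h>
#include <linux/list.h>
#include <linux/fs.h>

static DECLARE_RWSEM(my_lock);		/* plays the role of _origins_lock */

struct my_origin {
	struct block_device *bdev;
	struct list_head snapshots;
};

/* hypothetical helpers, corresponding to __lookup_origin/__insert_origin */
static struct my_origin *my_lookup(struct block_device *bdev);
static void my_insert(struct my_origin *o);

static int my_register(struct block_device *bdev)
{
	struct my_origin *o, *new_o;

	/*
	 * GFP_KERNEL may block and write out dirty pages, so it must not
	 * be used while holding a lock that the writeout path also takes.
	 */
	new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
	if (!new_o)
		return -ENOMEM;

	down_write(&my_lock);
	o = my_lookup(bdev);
	if (o) {
		/* someone beat us to it; discarding the spare is cheap */
		kfree(new_o);
	} else {
		o = new_o;
		o->bdev = bdev;
		INIT_LIST_HEAD(&o->snapshots);
		my_insert(o);
	}
	up_write(&my_lock);

	return 0;
}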
Signed-off-by: Mikulas Patocka
Signed-off-by: Alasdair G Kergon
---
 drivers/md/dm-snap.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index b2d9d1ac28ad..746603b42f86 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -229,19 +229,21 @@ static void __insert_origin(struct origin *o)
  */
 static int register_snapshot(struct dm_snapshot *snap)
 {
-	struct origin *o;
+	struct origin *o, *new_o;
 	struct block_device *bdev = snap->origin->bdev;

+	new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
+	if (!new_o)
+		return -ENOMEM;
+
 	down_write(&_origins_lock);
 	o = __lookup_origin(bdev);

-	if (!o) {
+	if (o)
+		kfree(new_o);
+	else {
 		/* New origin */
-		o = kmalloc(sizeof(*o), GFP_KERNEL);
-		if (!o) {
-			up_write(&_origins_lock);
-			return -ENOMEM;
-		}
+		o = new_o;

 		/* Initialise the struct */
 		INIT_LIST_HEAD(&o->snapshots);
-- cgit v1.2.3-59-g8ed1b

From 879129d208f725267366296b631aef31409cf304 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Thu, 30 Oct 2008 13:33:16 +0000
Subject: dm snapshot: wait for chunks in destructor

If there are several snapshots sharing an origin and one is removed
while the origin is being written to, the snapshot's mempool may get
deleted while elements are still referenced.

Prior to dm-snapshot-use-per-device-mempools.patch the pending
exceptions may still have been referenced after the snapshot was
destroyed, but this was not a problem because the shared mempool
was still there.

This patch fixes the problem by tracking the number of mempool
elements in use.

The scenario:
- You have an origin and two snapshots 1 and 2.
- Someone writes to the origin.
- It creates two exceptions in the snapshots; snapshot 1's will be the
  primary exception, and snapshot 2's pending_exception->primary_pe
  will point to the exception in snapshot 1.
- The exceptions are being relocated; relocation of exception 1
  finishes (but its pending_exception is still allocated, because it
  is referenced by an exception from snapshot 2).
- The user lvremoves snapshot 1 --- this calls just suspend (which
  does nothing) and the destructor. md->pending is zero (there is no
  I/O submitted to the snapshot by the md layer), so it won't help us.
- The destructor waits for kcopyd jobs to finish on snapshot 1 --- but
  there are none.
- The destructor on snapshot 1 cleans up everything.
- The relocation of the exception on snapshot 2 finishes and drops its
  reference on primary_pe. primary_pe points to the pending exception
  created for snapshot 1, so this frees memory into a non-existent
  mempool.
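Condensed to its essentials, the pattern the diff below introduces pairs an atomic in-use counter with a drain loop in the destructor. This is only an illustrative sketch, not the dm-snap.c code itself (names are hypothetical, and the barrier placement mirrors the patch):

#include <asm/atomic.h>
#include <linux/mempool.h>
#include <linux/sched.h>

static atomic_t in_use = ATOMIC_INIT(0);	/* pending_exceptions_count analogue */

static void *my_alloc(mempool_t *pool)
{
	void *e = mempool_alloc(pool, GFP_NOIO);

	atomic_inc(&in_use);		/* element is now outstanding */
	return e;
}

static void my_free(mempool_t *pool, void *e)
{
	mempool_free(e, pool);
	smp_mb__before_atomic_dec();	/* the free must be visible before the count drops */
	atomic_dec(&in_use);
}

static void my_destroy(mempool_t *pool)
{
	while (atomic_read(&in_use))	/* no element may outlive its pool */
		yield();
	smp_mb();			/* keep mempool_destroy() after the final read */
	mempool_destroy(pool);
}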
Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 16 +++++++++++++++- drivers/md/dm-snap.h | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 746603b42f86..6c96db26b87c 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -370,6 +370,7 @@ static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snaps struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool, GFP_NOIO); + atomic_inc(&s->pending_exceptions_count); pe->snap = s; return pe; @@ -377,7 +378,11 @@ static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snaps static void free_pending_exception(struct dm_snap_pending_exception *pe) { - mempool_free(pe, pe->snap->pending_pool); + struct dm_snapshot *s = pe->snap; + + mempool_free(pe, s->pending_pool); + smp_mb__before_atomic_dec(); + atomic_dec(&s->pending_exceptions_count); } static void insert_completed_exception(struct dm_snapshot *s, @@ -602,6 +607,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->valid = 1; s->active = 0; + atomic_set(&s->pending_exceptions_count, 0); init_rwsem(&s->lock); spin_lock_init(&s->pe_lock); s->ti = ti; @@ -728,6 +734,14 @@ static void snapshot_dtr(struct dm_target *ti) /* After this returns there can be no new kcopyd jobs. */ unregister_snapshot(s); + while (atomic_read(&s->pending_exceptions_count)) + yield(); + /* + * Ensure instructions in mempool_destroy aren't reordered + * before atomic_read. + */ + smp_mb(); + #ifdef CONFIG_DM_DEBUG for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i])); diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h index f07315fe2362..99c0106ede2d 100644 --- a/drivers/md/dm-snap.h +++ b/drivers/md/dm-snap.h @@ -160,6 +160,8 @@ struct dm_snapshot { mempool_t *pending_pool; + atomic_t pending_exceptions_count; + struct exception_table pending; struct exception_table complete; -- cgit v1.2.3-59-g8ed1b From cb3ac42b8af357fdd9ad838234245b39e5bdb7fe Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 6 Nov 2008 17:28:01 +1100 Subject: md: revert the recent addition of a call to the BLKRRPART ioctl. It turns out that it is only safe to call blkdev_ioctl when the device is actually open (as ->bd_disk is set to NULL on last close). And it is quite possible for do_md_stop to be called when the device is not open. 
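As the next paragraph notes, the partition rescan is easy to do from userspace, where the device is necessarily open while the ioctl is issued. A minimal userspace sketch (the /dev/md0 path is hypothetical):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* BLKRRPART */

int main(void)
{
	int fd = open("/dev/md0", O_RDONLY);	/* hypothetical array node */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* ask the kernel to re-read the (now empty) partition table */
	if (ioctl(fd, BLKRRPART) < 0)
		perror("BLKRRPART");
	close(fd);
	return 0;
}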
So discard the call to blkdev_ioctl(BLKRRPART) which was added in
commit 934d9c23b4c7e31840a895ba4b7e88d6413c81f3.

It is just as easy to call this ioctl from userspace when needed (on
mdadm -S), so leave it out of the kernel.

Signed-off-by: NeilBrown
---
 drivers/md/md.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9abf6ed16535..1b1d32694f6f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3884,7 +3884,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	if (mode == 0) {
 		mdk_rdev_t *rdev;
 		struct list_head *tmp;
-		struct block_device *bdev;

 		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));

@@ -3941,11 +3940,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		mddev->degraded = 0;
 		mddev->barriers_work = 0;
 		mddev->safemode = 0;
-		bdev = bdget_disk(mddev->gendisk, 0);
-		if (bdev) {
-			blkdev_ioctl(bdev, 0, BLKRRPART, 0);
-			bdput(bdev);
-		}
 		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);

 	} else if (mddev->pers)
-- cgit v1.2.3-59-g8ed1b

From a53a6c85756339f82ff19e001e90cfba2d6299a8 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 6 Nov 2008 17:28:20 +1100
Subject: md: fix bug in raid10 recovery.

Adding a spare to a raid10 doesn't cause recovery to start.
This is due to a silly typo in commit
6c2fce2ef6b4821c21b5c42c7207cb9cf8c87eda and so is a bug in 2.6.27
and .28-rc.

Thanks to Thomas Backlund for bisecting to find this.

Cc: Thomas Backlund
Cc: stable@kernel.org
Signed-off-by: NeilBrown
---
 drivers/md/raid10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index da5129a24b18..970a96ef9b18 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1137,7 +1137,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!enough(conf))
 		return -EINVAL;

-	if (rdev->raid_disk)
+	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;

 	if (rdev->saved_raid_disk >= 0 &&
-- cgit v1.2.3-59-g8ed1b

From f1cd14ae52985634d0389e934eba25b5ecf24565 Mon Sep 17 00:00:00 2001
From: Andre Noll
Date: Thu, 6 Nov 2008 19:41:24 +1100
Subject: md: linear: Fix a division by zero bug for very small arrays.

We currently oops with a divide error on starting a linear software
raid array consisting of at least two very small (< 500K) devices.

The bug is caused by the calculation of the hash table size, which
tries to compute sector_div(sz, base) with "base" being zero due to
the small size of the component devices of the array.

Fix this by requiring the hash spacing to be at least one, which
implies that "base" is also non-zero. This bug has existed since
about 2.6.14.

Cc: stable@kernel.org
Signed-off-by: Andre Noll
Signed-off-by: NeilBrown
---
 drivers/md/linear.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 190147c79e79..3b90c5c924ec 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -148,6 +148,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)

 	min_sectors = conf->array_sectors;
 	sector_div(min_sectors, PAGE_SIZE/sizeof(struct dev_info *));
+	if (min_sectors == 0)
+		min_sectors = 1;

 	/* min_sectors is the minimum spacing that will fit the hash
 	 * table in one PAGE. This may be much smaller than needed.
-- cgit v1.2.3-59-g8ed1b

From 18776c7316545482a02bfaa2629a2aa1afc48357 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Thu, 13 Nov 2008 23:38:52 +0000
Subject: dm raid1: flush workqueue before destruction

We queue work on the keventd queue, so that queue must be flushed in
the destructor. Otherwise, keventd could access mirror_set after it
has been freed.

Signed-off-by: Mikulas Patocka
Signed-off-by: Alasdair G Kergon
Cc: stable@kernel.org
---
 drivers/md/dm-raid1.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9d7b53ed75b2..ec43f9fa4b2a 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1032,6 +1032,7 @@ static void mirror_dtr(struct dm_target *ti)

 	del_timer_sync(&ms->timer);
 	flush_workqueue(ms->kmirrord_wq);
+	flush_scheduled_work();
 	dm_kcopyd_client_destroy(ms->kcopyd_client);
 	destroy_workqueue(ms->kmirrord_wq);
 	free_context(ms, ti, ms->nr_mirrors);
-- cgit v1.2.3-59-g8ed1b

From 6edebdee48729ab4ba564bbfcb8dbf6a6cd68a39 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen
Date: Thu, 13 Nov 2008 23:38:56 +0000
Subject: dm stripe: fix init failure

Don't proceed if dm_stripe_init() fails to register itself as a dm
target.

Signed-off-by: Heinz Mauelshagen
Signed-off-by: Alasdair G Kergon
---
 drivers/md/dm-stripe.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index a2d068dbe9e2..9e4ef88d421e 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -320,8 +320,10 @@ int __init dm_stripe_init(void)
 	int r;

 	r = dm_register_target(&stripe_target);
-	if (r < 0)
+	if (r < 0) {
 		DMWARN("target registration failed");
+		return r;
+	}

 	kstriped = create_singlethread_workqueue("kstriped");
 	if (!kstriped) {
-- cgit v1.2.3-59-g8ed1b

From b81aa1c79201cb424114fd198607951900babe18 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman
Date: Thu, 13 Nov 2008 23:39:00 +0000
Subject: dm mpath: avoid attempting to activate null path

Path activation code is called even when the pgpath is NULL. This
could lead to a panic in activate_path(). Such a panic has been seen
in the -rt kernel.

This problem existed before pg_init() was moved to a workqueue.

Signed-off-by: Chandra Seetharaman
Signed-off-by: Alasdair G Kergon
---
 drivers/md/dm-mpath.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 4840733cd903..58b1015260fa 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -441,13 +441,13 @@ static void process_queued_ios(struct work_struct *work)
 		__choose_pgpath(m);

 	pgpath = m->current_pgpath;
-	m->pgpath_to_activate = m->current_pgpath;

 	if ((pgpath && !m->queue_io) ||
 	    (!pgpath && !m->queue_if_no_path))
 		must_queue = 0;

-	if (m->pg_init_required && !m->pg_init_in_progress) {
+	if (m->pg_init_required && !m->pg_init_in_progress && pgpath) {
+		m->pgpath_to_activate = pgpath;
 		m->pg_init_count++;
 		m->pg_init_required = 0;
 		m->pg_init_in_progress = 1;
-- cgit v1.2.3-59-g8ed1b

From 14e98c5ca8bed825f65cbf11cb0ffd2c09dac2f4 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman
Date: Thu, 13 Nov 2008 23:39:06 +0000
Subject: dm mpath: warn if args ignored

Currently dm ignores the parameters provided to hardware handlers
without any notification to the user. This patch prints a warning
message so that the user knows the arguments are ignored.
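Returning to the raid1 destructor fix earlier in this series: the rule it applies is that any deferred work that can dereference an object must be flushed before the object is freed, including work placed on the shared keventd queue by schedule_work(). A sketch of the teardown ordering, with a hypothetical context structure (not the actual mirror_dtr() code):

#include <linux/workqueue.h>
#include <linux/timer.h>
#include <linux/slab.h>

struct my_mirror_set {
	struct timer_list timer;	/* may re-arm deferred work */
	struct workqueue_struct *wq;	/* private queue, like kmirrord_wq */
};

static void my_dtr(struct my_mirror_set *ms)
{
	del_timer_sync(&ms->timer);	/* stop anything that could requeue work */
	flush_workqueue(ms->wq);	/* drain the private queue */
	flush_scheduled_work();		/* drain keventd: schedule_work() items
					 * could otherwise still reference ms */
	destroy_workqueue(ms->wq);
	kfree(ms);
}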
Signed-off-by: Chandra Seetharaman
Signed-off-by: Alasdair G Kergon
---
 drivers/md/dm-mpath.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 58b1015260fa..3d7f4923cd13 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -708,6 +708,10 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
 		m->hw_handler_name = NULL;
 		return -EINVAL;
 	}
+
+	if (hw_argc > 1)
+		DMWARN("Ignoring user-specified arguments for "
+		       "hardware handler \"%s\"", m->hw_handler_name);
 	consume(as, hw_argc - 1);

 	return 0;
-- cgit v1.2.3-59-g8ed1b

From d221d2e77696e70e94b13989ea15db2ba5b34f8e Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Thu, 13 Nov 2008 23:39:10 +0000
Subject: dm: move pending queue wake_up end_io_acct

This doesn't fix any bug; it just moves the wake_up to immediately
after the decrement of md->pending, for better code readability.

Anyone manipulating md->pending must wake up the queue when the count
reaches zero, so move the wakeup as close to the decrement as
possible.

Signed-off-by: Mikulas Patocka
Signed-off-by: Alasdair G Kergon
---
 drivers/md/dm.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6963ad148408..dc25d8a07bc7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -375,7 +375,7 @@ static void start_io_acct(struct dm_io *io)
 	dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending);
 }

-static int end_io_acct(struct dm_io *io)
+static void end_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
 	struct bio *bio = io->bio;
@@ -391,7 +391,9 @@ static int end_io_acct(struct dm_io *io)
 	dm_disk(md)->part0.in_flight = pending =
 		atomic_dec_return(&md->pending);

-	return !pending;
+	/* nudge anyone waiting on suspend queue */
+	if (!pending)
+		wake_up(&md->wait);
 }

 /*
@@ -499,9 +501,7 @@ static void dec_pending(struct dm_io *io, int error)
 			spin_unlock_irqrestore(&io->md->pushback_lock, flags);
 		}

-		if (end_io_acct(io))
-			/* nudge anyone waiting on suspend queue */
-			wake_up(&io->md->wait);
+		end_io_acct(io);

 		if (io->error != DM_ENDIO_REQUEUE) {
 			blk_add_trace_bio(io->md->queue, io->bio,
-- cgit v1.2.3-59-g8ed1b

From 8a57dfc6f943c92b861c9a19b0c86ddcb2aba768 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman
Date: Thu, 13 Nov 2008 23:39:14 +0000
Subject: dm: avoid destroying table in dm_any_congested

dm_any_congested() just checks for DMF_BLOCK_IO and has no code to
make sure that suspend waits for dm_any_congested() to complete. This
patch adds such a check.

Without it, a race can occur with dm_table_put() attempting to
destroy the table in the wrong thread, the one running
dm_any_congested(), which is meant to be quick and return
immediately.

Two examples of problems:
1. Sleeping functions called from congested code, the caller of which
   holds a spin lock.
2. An ABBA deadlock between pdflush and multipathd. The two locks in
   contention are inode lock and kernel lock.
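The fix below brackets the congestion check with the same md->pending counter that the suspend path already waits on, so the table cannot be torn down while a check is in flight. Reduced to a sketch with hypothetical names (the actual dm.c change follows):

#include <asm/atomic.h>
#include <linux/wait.h>

static atomic_t pending = ATOMIC_INIT(0);	/* md->pending analogue */
static DECLARE_WAIT_QUEUE_HEAD(waiters);	/* md->wait analogue */

static int my_congested(int bdi_bits)
{
	int r = bdi_bits;

	atomic_inc(&pending);		/* hold off suspend while we look */
	/* ... briefly inspect state that suspend would otherwise tear down ... */
	if (!atomic_dec_return(&pending))
		wake_up(&waiters);	/* last holder out nudges any waiter */

	return r;
}

static void my_wait_until_idle(void)
{
	/* suspend side: sleep until all in-flight checks have drained */
	wait_event(waiters, atomic_read(&pending) == 0);
}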
Signed-off-by: Chandra Seetharaman
Signed-off-by: Mikulas Patocka
Signed-off-by: Alasdair G Kergon
---
 drivers/md/dm.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dc25d8a07bc7..c99e4728ff41 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -937,16 +937,24 @@ static void dm_unplug_all(struct request_queue *q)

 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
-	int r;
-	struct mapped_device *md = (struct mapped_device *) congested_data;
-	struct dm_table *map = dm_get_table(md);
+	int r = bdi_bits;
+	struct mapped_device *md = congested_data;
+	struct dm_table *map;

-	if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
-		r = bdi_bits;
-	else
-		r = dm_table_any_congested(map, bdi_bits);
+	atomic_inc(&md->pending);
+
+	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
+		map = dm_get_table(md);
+		if (map) {
+			r = dm_table_any_congested(map, bdi_bits);
+			dm_table_put(map);
+		}
+	}
+
+	if (!atomic_dec_return(&md->pending))
+		/* nudge anyone waiting on suspend queue */
+		wake_up(&md->wait);

-	dm_table_put(map);
 	return r;
 }
-- cgit v1.2.3-59-g8ed1b

From 0e435ac26e3f951d83338ed3d4ab7dc0fe0055bc Mon Sep 17 00:00:00 2001
From: Milan Broz
Date: Wed, 3 Dec 2008 12:55:08 +0100
Subject: block: fix setting of max_segment_size and seg_boundary mask

Fix the setting of max_segment_size and seg_boundary mask for stacked
md/dm devices.

When stacking devices (LVM over MD over SCSI), some of the request
queue parameters are not set up correctly in some cases by default,
namely max_segment_size and seg_boundary mask.

If you create an MD device over SCSI, these attributes are zeroed.

The problem arises when another device-mapper mapping is stacked over
this one - queue attributes are then set in DM this way:

request_queue   max_segment_size   seg_boundary_mask
SCSI            65536              0xffffffff
MD RAID1        0                  0
LVM             65536              -1 (64bit)

Unfortunately, bio_add_page() (resp. bio_phys_segments()) calculates
the number of physical segments according to these parameters. During
generic_make_request() the segment count is recalculated and can
increase the bio->bi_phys_segments count over the allowed limit
(after bio_clone() in a stacking operation).

This is especially a problem in the CCISS driver, where it produces
an OOPS here:

BUG_ON(creq->nr_phys_segments > MAXSGENTRIES);

(MAXSGENTRIES is 31 by default.)

Sometimes even this command is enough to cause the oops:

dd iflag=direct if=/dev// of=/dev/null bs=128000 count=10

This command generates bios with 250 sectors, allocated in 32
4k-pages (the last page uses only 1024 bytes).

At the LVM layer, it allocates a bio with 31 segments (still OK for
CCISS); unfortunately, at a lower layer it is recalculated to 32
segments, which violates the CCISS restriction and triggers the
BUG_ON().

The patch tries to fix it by:

* initializing the attributes above in the request queue constructor
  blk_queue_make_request()

* making sure that blk_queue_stack_limits() inherits the setting
  (DM uses its own function to set the limits because
  blk_queue_stack_limits() was introduced later. It should probably
  switch to the generic stack limit function too.)
* setting the default seg_boundary value in one place (blkdev.h)

* using this mask as the default in DM (instead of -1, which differs
  on 64-bit)

Bugs related to this:
https://bugzilla.redhat.com/show_bug.cgi?id=471639
http://bugzilla.kernel.org/show_bug.cgi?id=8672

Signed-off-by: Milan Broz
Reviewed-by: Alasdair G Kergon
Cc: Neil Brown
Cc: FUJITA Tomonori
Cc: Tejun Heo
Cc: Mike Miller
Signed-off-by: Jens Axboe
---
 block/blk-core.c       | 2 +-
 block/blk-settings.c   | 4 ++++
 drivers/md/dm-table.c  | 2 +-
 include/linux/blkdev.h | 2 ++
 4 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/block/blk-core.c b/block/blk-core.c
index 7a779d7c69c9..c36aa98fafa3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -592,7 +592,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 			   1 << QUEUE_FLAG_STACKABLE);
 	q->queue_lock		= lock;

-	blk_queue_segment_boundary(q, 0xffffffff);
+	blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK);

 	blk_queue_make_request(q, __make_request);
 	blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 41392fbe19ff..afa55e14e278 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -125,6 +125,9 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 	q->nr_requests = BLKDEV_MAX_RQ;
 	blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
 	blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
+	blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK);
+	blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
+
 	q->make_request_fn = mfn;
 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -314,6 +317,7 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 	/* zero is "infinity" */
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask);

 	t->max_phys_segments = min(t->max_phys_segments, b->max_phys_segments);
 	t->max_hw_segments = min(t->max_hw_segments, b->max_hw_segments);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a63161aec487..04e5fd742c2c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -668,7 +668,7 @@ static void check_for_valid_limits(struct io_restrictions *rs)
 	if (!rs->max_segment_size)
 		rs->max_segment_size = MAX_SEGMENT_SIZE;
 	if (!rs->seg_boundary_mask)
-		rs->seg_boundary_mask = -1;
+		rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
 	if (!rs->bounce_pfn)
 		rs->bounce_pfn = -1;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9cc7cc5fdce1..6dcd30d806cd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -918,6 +918,8 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);

 #define MAX_SEGMENT_SIZE	65536

+#define BLK_SEG_BOUNDARY_MASK	0xFFFFFFFFUL
+
 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)

 static inline int queue_hardsect_size(struct request_queue *q)
-- cgit v1.2.3-59-g8ed1b
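A final note on the stacking rule these hunks rely on: a limit of zero means "unset/infinity", so combining two queue levels must take the strictest non-zero value. A self-contained sketch of the idiom (my_limits and the locally defined my_min_not_zero() mirror, but are not, the kernel's own definitions):

#include <linux/kernel.h>	/* min() */

/* mirrors the kernel's min_not_zero(): zero means "no limit" */
#define my_min_not_zero(l, r) \
	((l) == 0 ? (r) : ((r) == 0 ? (l) : min(l, r)))

struct my_limits {			/* hypothetical subset of request_queue limits */
	unsigned int max_segment_size;
	unsigned long seg_boundary_mask;
};

/* stack bottom device b's limits into top device t, keeping the stricter value */
static void my_stack_limits(struct my_limits *t, const struct my_limits *b)
{
	t->max_segment_size =
		my_min_not_zero(t->max_segment_size, b->max_segment_size);
	t->seg_boundary_mask =
		my_min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask);
}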