From c1e44756fdb7b363fd22cb5514dced40752e36c5 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 8 Nov 2010 15:01:02 +0100
Subject: cfq-iosched: do cleanup

Some functions should return boolean.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9eba291eb6fd..b8174bb4a6a1 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -637,11 +637,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
 {
 	if (cfq_cfqq_slice_new(cfqq))
-		return 0;
+		return false;
 	if (time_before(jiffies, cfqq->slice_end))
-		return 0;
+		return false;
 
-	return 1;
+	return true;
 }
 
 /*
@@ -1892,10 +1892,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	 * in their service tree.
 	 */
 	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
-		return 1;
+		return true;
 	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
 			service_tree->count);
-	return 0;
+	return false;
 }
 
 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
@@ -2359,12 +2359,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
 {
 	/* the queue hasn't finished any request, can't estimate */
 	if (cfq_cfqq_slice_new(cfqq))
-		return 1;
+		return true;
 	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
 		cfqq->slice_end))
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 
 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-- 
cgit v1.2.3-59-g8ed1b


From d2d59e18a1ea8ecdd1c0a52af320e9a7f5391cc4 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 8 Nov 2010 15:01:03 +0100
Subject: cfq-iosched: schedule dispatch for noidle queue

A queue is idle at cfq_dispatch_requests(), but it gets noidle later. Unless
other task explictly does unplug or all requests are drained, we will not
deliever requests to the disk even cfq_arm_slice_timer doesn't make the
queue idle. For example, cfq_should_idle() returns true because of
service_tree->count == 1, and then other queues are added. Note, I didn't
see obvious performance impacts so far with the patch, but just thought
this could be a problem.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index b8174bb4a6a1..986865e3fbc5 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3255,6 +3255,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
 		return true;
 
+	/* An idle queue should not be idle now for some reason */
+	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
+		return true;
+
 	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
 		return false;
 
@@ -3508,8 +3512,25 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 		}
 	}
 
-	if (!cfqd->rq_in_driver)
+	if (!cfqd->rq_in_driver) {
+		cfq_schedule_dispatch(cfqd);
+		return;
+	}
+	/*
+	 * A queue is idle at cfq_dispatch_requests(), but it gets noidle
+	 * later. We schedule a dispatch if the queue has no requests,
+	 * otherwise the disk is actually in idle till all requests
+	 * are finished even cfq_arm_slice_timer doesn't make the queue idle
+	 * */
+	cfqq = cfqd->active_queue;
+	if (!cfqq)
+		return;
+
+	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq) &&
+	    (!cfqd->cfq_group_idle || cfqq->cfqg->nr_cfqq > 1)) {
+		cfq_del_timer(cfqd, cfqq);
 		cfq_schedule_dispatch(cfqd);
+	}
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 8e1ac6655104bc6e1e79d67e2df88cc8fa9b6e07 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 8 Nov 2010 15:01:04 +0100
Subject: cfq-iosched: don't idle if a deep seek queue is slow

If a deep seek queue slowly deliver requests but disk is much faster, idle
for the queue just wastes disk throughput. If the queue delevers all requests
before half its slice is used, the patch disable idle for it.
In my test, application delivers 32 requests one time, the disk can accept
128 requests at maxium and disk is fast. without the patch, the throughput
is just around 30m/s, while with it, the speed is about 80m/s. The disk is
a SSD, but is detected as a rotational disk. I can configure it as SSD, but
I thought the deep seek queue logic should be fixed too, for example,
considering a fast raid.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 986865e3fbc5..ca4d19907243 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2285,6 +2285,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 		goto keep_queue;
 	}
 
+	/*
+	 * This is a deep seek queue, but the device is much faster than
+	 * the queue can deliver, don't idle
+	 **/
+	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
+	    (cfq_cfqq_slice_new(cfqq) ||
+	    (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
+		cfq_clear_cfqq_deep(cfqq);
+		cfq_clear_cfqq_idle_window(cfqq);
+	}
+
 	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
 		cfqq = NULL;
 		goto keep_queue;
-- 
cgit v1.2.3-59-g8ed1b


From 2b9408a45978dcda77407859148deeccf403c372 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 9 Nov 2010 14:51:13 +0100
Subject: cfq-iosched: don't schedule a dispatch for a non-idle queue

Vivek suggests we don't need schedule a dispatch when an idle queue
becomes nonidle. And he is right, cfq_should_preempt already covers
the logic.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ca4d19907243..f90519430be6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3523,25 +3523,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 		}
 	}
 
-	if (!cfqd->rq_in_driver) {
+	if (!cfqd->rq_in_driver)
 		cfq_schedule_dispatch(cfqd);
-		return;
-	}
-	/*
-	 * A queue is idle at cfq_dispatch_requests(), but it gets noidle
-	 * later. We schedule a dispatch if the queue has no requests,
-	 * otherwise the disk is actually in idle till all requests
-	 * are finished even cfq_arm_slice_timer doesn't make the queue idle
-	 * */
-	cfqq = cfqd->active_queue;
-	if (!cfqq)
-		return;
-
-	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq) &&
-	    (!cfqd->cfq_group_idle || cfqq->cfqg->nr_cfqq > 1)) {
-		cfq_del_timer(cfqd, cfqq);
-		cfq_schedule_dispatch(cfqd);
-	}
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 34db1d595ef6f183fbc1e42cda45a3dfa0035258 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Thu, 11 Nov 2010 09:58:57 +0100
Subject: block: export 'ro' sysfs attribute for partitions

We already export 'ro' for the disk. This adds the same attribute
for partitions.

Cc: Karel Zak <kzak@redhat.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/partitions/check.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 79fbf3f390f0..861ae84fcee5 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -237,6 +237,13 @@ ssize_t part_size_show(struct device *dev,
 	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
 
+ssize_t part_ro_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	return sprintf(buf, "%d\n", p->policy ? 1 : 0);
+}
+
 ssize_t part_alignment_offset_show(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
@@ -312,6 +319,7 @@ ssize_t part_fail_store(struct device *dev,
 static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
 		   NULL);
@@ -326,6 +334,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_partition.attr,
 	&dev_attr_start.attr,
 	&dev_attr_size.attr,
+	&dev_attr_ro.attr,
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_discard_alignment.attr,
 	&dev_attr_stat.attr,
-- 
cgit v1.2.3-59-g8ed1b


From 731edacb7567c15f25b1fccac18a203bd432ae5e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 13 Nov 2010 11:55:16 +0100
Subject: mtd: fix bdev exclusive open bugs in block2mtd::add_device()

There are two bdev exclusive open bugs.

* open_bdev_exclusive() must not be called with NULL holder.  Use dev
  as the holder.

* open_by_devnum() doesn't open the bdev exclusively but
  block2mtd_free_device() always assumes it.  Explicitly claim the
  bdev.

The latter is rather clumsy but will be simplified with future
blkdev_get/put() cleanups.

- Updated to use local variable @mode to cache FMODE_* masks as
  suggested by Artem Bityutskiy.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: linux-mtd@lists.infradead.org
Cc: Artem Bityutskiy <dedekind1@gmail.com>
---
 drivers/mtd/devices/block2mtd.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index 2cf0cc6a4189..a9e2d3b38aeb 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -234,6 +234,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev)
 /* FIXME: ensure that mtd->size % erase_size == 0 */
 static struct block2mtd_dev *add_device(char *devname, int erase_size)
 {
+	const fmode_t mode = FMODE_READ | FMODE_WRITE;
 	struct block_device *bdev;
 	struct block2mtd_dev *dev;
 	char *name;
@@ -246,7 +247,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size)
 		return NULL;
 
 	/* Get a handle on the device */
-	bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, NULL);
+	bdev = open_bdev_exclusive(devname, mode, dev);
 #ifndef MODULE
 	if (IS_ERR(bdev)) {
 
@@ -255,7 +256,15 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size)
 
 		dev_t devt = name_to_dev_t(devname);
 		if (devt) {
-			bdev = open_by_devnum(devt, FMODE_WRITE | FMODE_READ);
+			bdev = open_by_devnum(devt, mode);
+			if (!IS_ERR(bdev)) {
+				int ret;
+				ret = bd_claim(bdev, dev);
+				if (ret) {
+					blkdev_put(bdev, mode);
+					bdev = ERR_PTR(ret);
+				}
+			}
 		}
 	}
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 37004c42f7240035bc2726c340c4efa726b4818e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 13 Nov 2010 11:55:17 +0100
Subject: btrfs: close_bdev_exclusive() should use the same @flags as the
 matching open_bdev_exclusive()

In the failure path of __btrfs_open_devices(), close_bdev_exclusive()
is called with @flags which doesn't match the one used during
open_bdev_exclusive().  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cc04dc1445d6..d39596224d21 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -638,7 +638,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 error_brelse:
 		brelse(bh);
 error_close:
-		close_bdev_exclusive(bdev, FMODE_READ);
+		close_bdev_exclusive(bdev, flags);
 error:
 		continue;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From e09b457bdb7e8d23fc54dcef0930ac697d8de895 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 13 Nov 2010 11:55:17 +0100
Subject: block: simplify holder symlink handling

Code to manage symlinks in /sys/block/*/{holders|slaves} are overly
complex with multiple holder considerations, redundant extra
references to all involved kobjects, unused generic kobject holder
support and unnecessary mixup with bd_claim/release functionalities.

Strip it down to what's necessary (single gendisk holder) and make it
use a separate interface.  This is a step for cleaning up
bd_claim/release.  This patch makes dm-table slightly more complex but
it will be simplified again with further changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Cc: dm-devel@redhat.com
---
 drivers/md/dm-table.c |  23 +++-
 drivers/md/md.c       |   4 +-
 fs/block_dev.c        | 322 +++++++-------------------------------------------
 include/linux/fs.h    |  16 ++-
 4 files changed, 74 insertions(+), 291 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 90267f8d64ee..2c876ffc63df 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -328,12 +328,22 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev,
 	bdev = open_by_devnum(dev, d->dm_dev.mode);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
-	r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md));
-	if (r)
+
+	r = bd_claim(bdev, _claim_ptr);
+	if (r) {
 		blkdev_put(bdev, d->dm_dev.mode);
-	else
-		d->dm_dev.bdev = bdev;
-	return r;
+		return r;
+	}
+
+	r = bd_link_disk_holder(bdev, dm_disk(md));
+	if (r) {
+		bd_release(bdev);
+		blkdev_put(bdev, d->dm_dev.mode);
+		return r;
+	}
+
+	d->dm_dev.bdev = bdev;
+	return 0;
 }
 
 /*
@@ -344,7 +354,8 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 	if (!d->dm_dev.bdev)
 		return;
 
-	bd_release_from_disk(d->dm_dev.bdev, dm_disk(md));
+	bd_unlink_disk_holder(d->dm_dev.bdev);
+	bd_release(d->dm_dev.bdev);
 	blkdev_put(d->dm_dev.bdev, d->dm_dev.mode);
 	d->dm_dev.bdev = NULL;
 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4e957f3140a8..c47644fca1a1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1880,7 +1880,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
 
 	list_add_rcu(&rdev->same_set, &mddev->disks);
-	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
+	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
 
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled = 0;
@@ -1907,7 +1907,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 		MD_BUG();
 		return;
 	}
-	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
+	bd_unlink_disk_holder(rdev->bdev);
 	list_del_rcu(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 06e8ff12b97c..9329068684d2 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -426,9 +426,6 @@ static void init_once(void *foo)
 	mutex_init(&bdev->bd_mutex);
 	INIT_LIST_HEAD(&bdev->bd_inodes);
 	INIT_LIST_HEAD(&bdev->bd_list);
-#ifdef CONFIG_SYSFS
-	INIT_LIST_HEAD(&bdev->bd_holder_list);
-#endif
 	inode_init_once(&ei->vfs_inode);
 	/* Initialize mutex for freeze. */
 	mutex_init(&bdev->bd_fsfreeze_mutex);
@@ -881,314 +878,83 @@ void bd_release(struct block_device *bdev)
 EXPORT_SYMBOL(bd_release);
 
 #ifdef CONFIG_SYSFS
-/*
- * Functions for bd_claim_by_kobject / bd_release_from_kobject
- *
- *     If a kobject is passed to bd_claim_by_kobject()
- *     and the kobject has a parent directory,
- *     following symlinks are created:
- *        o from the kobject to the claimed bdev
- *        o from "holders" directory of the bdev to the parent of the kobject
- *     bd_release_from_kobject() removes these symlinks.
- *
- *     Example:
- *        If /dev/dm-0 maps to /dev/sda, kobject corresponding to
- *        /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
- *           /sys/block/dm-0/slaves/sda --> /sys/block/sda
- *           /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
- */
-
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
-	if (!from || !to)
-		return 0;
 	return sysfs_create_link(from, to, kobject_name(to));
 }
 
 static void del_symlink(struct kobject *from, struct kobject *to)
 {
-	if (!from || !to)
-		return;
 	sysfs_remove_link(from, kobject_name(to));
 }
 
-/*
- * 'struct bd_holder' contains pointers to kobjects symlinked by
- * bd_claim_by_kobject.
- * It's connected to bd_holder_list which is protected by bdev->bd_sem.
- */
-struct bd_holder {
-	struct list_head list;	/* chain of holders of the bdev */
-	int count;		/* references from the holder */
-	struct kobject *sdir;	/* holder object, e.g. "/block/dm-0/slaves" */
-	struct kobject *hdev;	/* e.g. "/block/dm-0" */
-	struct kobject *hdir;	/* e.g. "/block/sda/holders" */
-	struct kobject *sdev;	/* e.g. "/block/sda" */
-};
-
-/*
- * Get references of related kobjects at once.
- * Returns 1 on success. 0 on failure.
- *
- * Should call bd_holder_release_dirs() after successful use.
- */
-static int bd_holder_grab_dirs(struct block_device *bdev,
-			struct bd_holder *bo)
-{
-	if (!bdev || !bo)
-		return 0;
-
-	bo->sdir = kobject_get(bo->sdir);
-	if (!bo->sdir)
-		return 0;
-
-	bo->hdev = kobject_get(bo->sdir->parent);
-	if (!bo->hdev)
-		goto fail_put_sdir;
-
-	bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
-	if (!bo->sdev)
-		goto fail_put_hdev;
-
-	bo->hdir = kobject_get(bdev->bd_part->holder_dir);
-	if (!bo->hdir)
-		goto fail_put_sdev;
-
-	return 1;
-
-fail_put_sdev:
-	kobject_put(bo->sdev);
-fail_put_hdev:
-	kobject_put(bo->hdev);
-fail_put_sdir:
-	kobject_put(bo->sdir);
-
-	return 0;
-}
-
-/* Put references of related kobjects at once. */
-static void bd_holder_release_dirs(struct bd_holder *bo)
-{
-	kobject_put(bo->hdir);
-	kobject_put(bo->sdev);
-	kobject_put(bo->hdev);
-	kobject_put(bo->sdir);
-}
-
-static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
-{
-	struct bd_holder *bo;
-
-	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
-	if (!bo)
-		return NULL;
-
-	bo->count = 1;
-	bo->sdir = kobj;
-
-	return bo;
-}
-
-static void free_bd_holder(struct bd_holder *bo)
-{
-	kfree(bo);
-}
-
 /**
- * find_bd_holder - find matching struct bd_holder from the block device
+ * bd_link_disk_holder - create symlinks between holding disk and slave bdev
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
  *
- * @bdev:	struct block device to be searched
- * @bo:		target struct bd_holder
+ * This functions creates the following sysfs symlinks.
  *
- * Returns matching entry with @bo in @bdev->bd_holder_list.
- * If found, increment the reference count and return the pointer.
- * If not found, returns NULL.
- */
-static struct bd_holder *find_bd_holder(struct block_device *bdev,
-					struct bd_holder *bo)
-{
-	struct bd_holder *tmp;
-
-	list_for_each_entry(tmp, &bdev->bd_holder_list, list)
-		if (tmp->sdir == bo->sdir) {
-			tmp->count++;
-			return tmp;
-		}
-
-	return NULL;
-}
-
-/**
- * add_bd_holder - create sysfs symlinks for bd_claim() relationship
+ * - from "slaves" directory of the holder @disk to the claimed @bdev
+ * - from "holders" directory of the @bdev to the holder @disk
  *
- * @bdev:	block device to be bd_claimed
- * @bo:		preallocated and initialized by alloc_bd_holder()
+ * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
+ * passed to bd_link_disk_holder(), then:
  *
- * Add @bo to @bdev->bd_holder_list, create symlinks.
+ *   /sys/block/dm-0/slaves/sda --> /sys/block/sda
+ *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
  *
- * Returns 0 if symlinks are created.
- * Returns -ve if something fails.
- */
-static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
-{
-	int err;
-
-	if (!bo)
-		return -EINVAL;
-
-	if (!bd_holder_grab_dirs(bdev, bo))
-		return -EBUSY;
-
-	err = add_symlink(bo->sdir, bo->sdev);
-	if (err)
-		return err;
-
-	err = add_symlink(bo->hdir, bo->hdev);
-	if (err) {
-		del_symlink(bo->sdir, bo->sdev);
-		return err;
-	}
-
-	list_add_tail(&bo->list, &bdev->bd_holder_list);
-	return 0;
-}
-
-/**
- * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
- *
- * @bdev:	block device to be bd_claimed
- * @kobj:	holder's kobject
- *
- * If there is matching entry with @kobj in @bdev->bd_holder_list
- * and no other bd_claim() from the same kobject,
- * remove the struct bd_holder from the list, delete symlinks for it.
- *
- * Returns a pointer to the struct bd_holder when it's removed from the list
- * and ready to be freed.
- * Returns NULL if matching claim isn't found or there is other bd_claim()
- * by the same kobject.
- */
-static struct bd_holder *del_bd_holder(struct block_device *bdev,
-					struct kobject *kobj)
-{
-	struct bd_holder *bo;
-
-	list_for_each_entry(bo, &bdev->bd_holder_list, list) {
-		if (bo->sdir == kobj) {
-			bo->count--;
-			BUG_ON(bo->count < 0);
-			if (!bo->count) {
-				list_del(&bo->list);
-				del_symlink(bo->sdir, bo->sdev);
-				del_symlink(bo->hdir, bo->hdev);
-				bd_holder_release_dirs(bo);
-				return bo;
-			}
-			break;
-		}
-	}
-
-	return NULL;
-}
-
-/**
- * bd_claim_by_kobject - bd_claim() with additional kobject signature
- *
- * @bdev:	block device to be claimed
- * @holder:	holder's signature
- * @kobj:	holder's kobject
+ * The caller must have claimed @bdev before calling this function and
+ * ensure that both @bdev and @disk are valid during the creation and
+ * lifetime of these symlinks.
  *
- * Do bd_claim() and if it succeeds, create sysfs symlinks between
- * the bdev and the holder's kobject.
- * Use bd_release_from_kobject() when relesing the claimed bdev.
+ * CONTEXT:
+ * Might sleep.
  *
- * Returns 0 on success. (same as bd_claim())
- * Returns errno on failure.
+ * RETURNS:
+ * 0 on success, -errno on failure.
  */
-static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
-				struct kobject *kobj)
+int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 {
-	int err;
-	struct bd_holder *bo, *found;
-
-	if (!kobj)
-		return -EINVAL;
-
-	bo = alloc_bd_holder(kobj);
-	if (!bo)
-		return -ENOMEM;
+	int ret = 0;
 
 	mutex_lock(&bdev->bd_mutex);
 
-	err = bd_claim(bdev, holder);
-	if (err)
-		goto fail;
+	WARN_ON_ONCE(!bdev->bd_holder || bdev->bd_holder_disk);
 
-	found = find_bd_holder(bdev, bo);
-	if (found)
-		goto fail;
+	/* FIXME: remove the following once add_disk() handles errors */
+	if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
+		goto out_unlock;
 
-	err = add_bd_holder(bdev, bo);
-	if (err)
-		bd_release(bdev);
-	else
-		bo = NULL;
-fail:
-	mutex_unlock(&bdev->bd_mutex);
-	free_bd_holder(bo);
-	return err;
-}
+	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+	if (ret)
+		goto out_unlock;
 
-/**
- * bd_release_from_kobject - bd_release() with additional kobject signature
- *
- * @bdev:	block device to be released
- * @kobj:	holder's kobject
- *
- * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
- */
-static void bd_release_from_kobject(struct block_device *bdev,
-					struct kobject *kobj)
-{
-	if (!kobj)
-		return;
+	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
+	if (ret) {
+		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+		goto out_unlock;
+	}
 
-	mutex_lock(&bdev->bd_mutex);
-	bd_release(bdev);
-	free_bd_holder(del_bd_holder(bdev, kobj));
+	bdev->bd_holder_disk = disk;
+out_unlock:
 	mutex_unlock(&bdev->bd_mutex);
+	return ret;
 }
+EXPORT_SYMBOL_GPL(bd_link_disk_holder);
 
-/**
- * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
- *
- * @bdev:	block device to be claimed
- * @holder:	holder's signature
- * @disk:	holder's gendisk
- *
- * Call bd_claim_by_kobject() with getting @disk->slave_dir.
- */
-int bd_claim_by_disk(struct block_device *bdev, void *holder,
-			struct gendisk *disk)
+void bd_unlink_disk_holder(struct block_device *bdev)
 {
-	return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
-}
-EXPORT_SYMBOL_GPL(bd_claim_by_disk);
+	struct gendisk *disk = bdev->bd_holder_disk;
 
-/**
- * bd_release_from_disk - wrapper function for bd_release_from_kobject()
- *
- * @bdev:	block device to be claimed
- * @disk:	holder's gendisk
- *
- * Call bd_release_from_kobject() and put @disk->slave_dir.
- */
-void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
-{
-	bd_release_from_kobject(bdev, disk->slave_dir);
-	kobject_put(disk->slave_dir);
+	bdev->bd_holder_disk = NULL;
+	if (!disk)
+		return;
+
+	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+	del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
 }
-EXPORT_SYMBOL_GPL(bd_release_from_disk);
+EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
 #endif
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 334d68a17108..66b7f2c5d7e9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -663,7 +663,7 @@ struct block_device {
 	void *			bd_holder;
 	int			bd_holders;
 #ifdef CONFIG_SYSFS
-	struct list_head	bd_holder_list;
+	struct gendisk *	bd_holder_disk;	/* for sysfs slave linkng */
 #endif
 	struct block_device *	bd_contains;
 	unsigned		bd_block_size;
@@ -2042,11 +2042,17 @@ extern int blkdev_put(struct block_device *, fmode_t);
 extern int bd_claim(struct block_device *, void *);
 extern void bd_release(struct block_device *);
 #ifdef CONFIG_SYSFS
-extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *);
-extern void bd_release_from_disk(struct block_device *, struct gendisk *);
+extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
+extern void bd_unlink_disk_holder(struct block_device *bdev);
 #else
-#define bd_claim_by_disk(bdev, holder, disk)	bd_claim(bdev, holder)
-#define bd_release_from_disk(bdev, disk)	bd_release(bdev)
+static inline int bd_link_disk_holder(struct block_device *bdev,
+				      struct gendisk *disk)
+{
+	return 0;
+}
+static inline void bd_unlink_disk_holder(struct block_device *bdev)
+{
+}
 #endif
 #endif
 
-- 
cgit v1.2.3-59-g8ed1b


From e525fd89d380c4a94c0d63913a1dd1a593ed25e7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 13 Nov 2010 11:55:17 +0100
Subject: block: make blkdev_get/put() handle exclusive access

Over time, block layer has accumulated a set of APIs dealing with bdev
open, close, claim and release.

* blkdev_get/put() are the primary open and close functions.

* bd_claim/release() deal with exclusive open.

* open/close_bdev_exclusive() are combination of open and claim and
  the other way around, respectively.

* bd_link/unlink_disk_holder() to create and remove holder/slave
  symlinks.

* open_by_devnum() wraps bdget() + blkdev_get().

The interface is a bit confusing and the decoupling of open and claim
makes it impossible to properly guarantee exclusive access as
in-kernel open + claim sequence can disturb the existing exclusive
open even before the block layer knows the current open if for another
exclusive access.  Reorganize the interface such that,

* blkdev_get() is extended to include exclusive access management.
  @holder argument is added and, if is @FMODE_EXCL specified, it will
  gain exclusive access atomically w.r.t. other exclusive accesses.

* blkdev_put() is similarly extended.  It now takes @mode argument and
  if @FMODE_EXCL is set, it releases an exclusive access.  Also, when
  the last exclusive claim is released, the holder/slave symlinks are
  removed automatically.

* bd_claim/release() and close_bdev_exclusive() are no longer
  necessary and either made static or removed.

* bd_link_disk_holder() remains the same but bd_unlink_disk_holder()
  is no longer necessary and removed.

* open_bdev_exclusive() becomes a simple wrapper around lookup_bdev()
  and blkdev_get().  It also has an unexpected extra bdev_read_only()
  test which probably should be moved into blkdev_get().

* open_by_devnum() is modified to take @holder argument and pass it to
  blkdev_get().

Most of bdev open/close operations are unified into blkdev_get/put()
and most exclusive accesses are tested atomically at the open time (as
it should).  This cleans up code and removes some, both valid and
invalid, but unnecessary all the same, corner cases.

open_bdev_exclusive() and open_by_devnum() can use further cleanup -
rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop
special features.  Well, let's leave them for another day.

Most conversions are straight-forward.  drbd conversion is a bit more
involved as there was some reordering, but the logic should stay the
same.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Peter Osterlund <petero2@telia.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Alex Elder <aelder@sgi.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: dm-devel@redhat.com
Cc: drbd-dev@lists.linbit.com
Cc: Leo Chen <leochen@broadcom.com>
Cc: Scott Branden <sbranden@broadcom.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: Joern Engel <joern@logfs.org>
Cc: reiserfs-devel@vger.kernel.org
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
 block/ioctl.c                   |   5 +-
 drivers/block/drbd/drbd_int.h   |   2 -
 drivers/block/drbd/drbd_main.c  |   7 +-
 drivers/block/drbd/drbd_nl.c    | 103 ++++++++++-----------------
 drivers/block/pktcdvd.c         |  22 +++---
 drivers/char/raw.c              |  14 ++--
 drivers/md/dm-table.c           |  15 +---
 drivers/md/md.c                 |  14 +---
 drivers/mtd/devices/block2mtd.c |  17 ++---
 drivers/s390/block/dasd_genhd.c |   2 +-
 fs/block_dev.c                  | 149 ++++++++++++++--------------------------
 fs/btrfs/volumes.c              |  14 ++--
 fs/ext3/super.c                 |  12 +---
 fs/ext4/super.c                 |  12 +---
 fs/gfs2/ops_fstype.c            |   4 +-
 fs/jfs/jfs_logmgr.c             |  17 ++---
 fs/logfs/dev_bdev.c             |   4 +-
 fs/nilfs2/super.c               |   4 +-
 fs/ocfs2/cluster/heartbeat.c    |   2 +-
 fs/partitions/check.c           |   2 +-
 fs/reiserfs/journal.c           |  17 ++---
 fs/super.c                      |  14 ++--
 fs/xfs/linux-2.6/xfs_super.c    |   2 +-
 include/linux/fs.h              |  14 ++--
 kernel/power/swap.c             |   5 +-
 mm/swapfile.c                   |   7 +-
 26 files changed, 162 insertions(+), 318 deletions(-)

diff --git a/block/ioctl.c b/block/ioctl.c
index d724ceb1d465..cc46d499fd27 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -294,11 +294,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 			return -EINVAL;
 		if (get_user(n, (int __user *) arg))
 			return -EFAULT;
-		if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0)
+		if (!(mode & FMODE_EXCL) &&
+		    blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
 			return -EBUSY;
 		ret = set_blocksize(bdev, n);
 		if (!(mode & FMODE_EXCL))
-			bd_release(bdev);
+			blkdev_put(bdev, mode | FMODE_EXCL);
 		return ret;
 	case BLKPG:
 		ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 9bdcf4393c0a..0590b9f67ec6 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -923,8 +923,6 @@ struct drbd_md {
 struct drbd_backing_dev {
 	struct block_device *backing_bdev;
 	struct block_device *md_bdev;
-	struct file *lo_file;
-	struct file *md_file;
 	struct drbd_md md;
 	struct disk_conf dc; /* The user provided config... */
 	sector_t known_size; /* last known size of that backing device */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 25c7a73c5062..7ec1a82064a9 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -3361,11 +3361,8 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
 	if (ldev == NULL)
 		return;
 
-	bd_release(ldev->backing_bdev);
-	bd_release(ldev->md_bdev);
-
-	fput(ldev->lo_file);
-	fput(ldev->md_file);
+	blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+	blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 
 	kfree(ldev);
 }
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 87925e97e613..fd0346090289 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -855,7 +855,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	sector_t max_possible_sectors;
 	sector_t min_md_device_sectors;
 	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
-	struct inode *inode, *inode2;
+	struct block_device *bdev;
 	struct lru_cache *resync_lru = NULL;
 	union drbd_state ns, os;
 	unsigned int max_seg_s;
@@ -902,46 +902,40 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		}
 	}
 
-	nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
-	if (IS_ERR(nbc->lo_file)) {
+	bdev = open_bdev_exclusive(nbc->dc.backing_dev,
+				   FMODE_READ | FMODE_WRITE, mdev);
+	if (IS_ERR(bdev)) {
 		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
-		    PTR_ERR(nbc->lo_file));
-		nbc->lo_file = NULL;
+			PTR_ERR(bdev));
 		retcode = ERR_OPEN_DISK;
 		goto fail;
 	}
+	nbc->backing_bdev = bdev;
 
-	inode = nbc->lo_file->f_dentry->d_inode;
-
-	if (!S_ISBLK(inode->i_mode)) {
-		retcode = ERR_DISK_NOT_BDEV;
-		goto fail;
-	}
-
-	nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
-	if (IS_ERR(nbc->md_file)) {
+	/*
+	 * meta_dev_idx >= 0: external fixed size, possibly multiple
+	 * drbd sharing one meta device.  TODO in that case, paranoia
+	 * check that [md_bdev, meta_dev_idx] is not yet used by some
+	 * other drbd minor!  (if you use drbd.conf + drbdadm, that
+	 * should check it for you already; but if you don't, or
+	 * someone fooled it, we need to double check here)
+	 */
+	bdev = open_bdev_exclusive(nbc->dc.meta_dev,
+				   FMODE_READ | FMODE_WRITE,
+				   (nbc->dc.meta_dev_idx < 0) ?
+				   (void *)mdev : (void *)drbd_m_holder);
+	if (IS_ERR(bdev)) {
 		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
-		    PTR_ERR(nbc->md_file));
-		nbc->md_file = NULL;
+			PTR_ERR(bdev));
 		retcode = ERR_OPEN_MD_DISK;
 		goto fail;
 	}
+	nbc->md_bdev = bdev;
 
-	inode2 = nbc->md_file->f_dentry->d_inode;
-
-	if (!S_ISBLK(inode2->i_mode)) {
-		retcode = ERR_MD_NOT_BDEV;
-		goto fail;
-	}
-
-	nbc->backing_bdev = inode->i_bdev;
-	if (bd_claim(nbc->backing_bdev, mdev)) {
-		printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
-		       nbc->backing_bdev, mdev,
-		       nbc->backing_bdev->bd_holder,
-		       nbc->backing_bdev->bd_contains->bd_holder,
-		       nbc->backing_bdev->bd_holders);
-		retcode = ERR_BDCLAIM_DISK;
+	if ((nbc->backing_bdev == nbc->md_bdev) !=
+	    (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+	     nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+		retcode = ERR_MD_IDX_INVALID;
 		goto fail;
 	}
 
@@ -950,28 +944,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 			offsetof(struct bm_extent, lce));
 	if (!resync_lru) {
 		retcode = ERR_NOMEM;
-		goto release_bdev_fail;
-	}
-
-	/* meta_dev_idx >= 0: external fixed size,
-	 * possibly multiple drbd sharing one meta device.
-	 * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
-	 * not yet used by some other drbd minor!
-	 * (if you use drbd.conf + drbdadm,
-	 * that should check it for you already; but if you don't, or someone
-	 * fooled it, we need to double check here) */
-	nbc->md_bdev = inode2->i_bdev;
-	if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
-				: (void *) drbd_m_holder)) {
-		retcode = ERR_BDCLAIM_MD_DISK;
-		goto release_bdev_fail;
-	}
-
-	if ((nbc->backing_bdev == nbc->md_bdev) !=
-	    (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
-	     nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
-		retcode = ERR_MD_IDX_INVALID;
-		goto release_bdev2_fail;
+		goto fail;
 	}
 
 	/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
@@ -982,7 +955,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 			(unsigned long long) drbd_get_max_capacity(nbc),
 			(unsigned long long) nbc->dc.disk_size);
 		retcode = ERR_DISK_TO_SMALL;
-		goto release_bdev2_fail;
+		goto fail;
 	}
 
 	if (nbc->dc.meta_dev_idx < 0) {
@@ -999,7 +972,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		dev_warn(DEV, "refusing attach: md-device too small, "
 		     "at least %llu sectors needed for this meta-disk type\n",
 		     (unsigned long long) min_md_device_sectors);
-		goto release_bdev2_fail;
+		goto fail;
 	}
 
 	/* Make sure the new disk is big enough
@@ -1007,7 +980,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	if (drbd_get_max_capacity(nbc) <
 	    drbd_get_capacity(mdev->this_bdev)) {
 		retcode = ERR_DISK_TO_SMALL;
-		goto release_bdev2_fail;
+		goto fail;
 	}
 
 	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
@@ -1030,7 +1003,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
 	drbd_resume_io(mdev);
 	if (retcode < SS_SUCCESS)
-		goto release_bdev2_fail;
+		goto fail;
 
 	if (!get_ldev_if_state(mdev, D_ATTACHING))
 		goto force_diskless;
@@ -1264,18 +1237,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
  force_diskless:
 	drbd_force_state(mdev, NS(disk, D_DISKLESS));
 	drbd_md_sync(mdev);
- release_bdev2_fail:
-	if (nbc)
-		bd_release(nbc->md_bdev);
- release_bdev_fail:
-	if (nbc)
-		bd_release(nbc->backing_bdev);
  fail:
 	if (nbc) {
-		if (nbc->lo_file)
-			fput(nbc->lo_file);
-		if (nbc->md_file)
-			fput(nbc->md_file);
+		if (nbc->backing_bdev)
+			blkdev_put(nbc->backing_bdev,
+				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+		if (nbc->md_bdev)
+			blkdev_put(nbc->md_bdev,
+				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 		kfree(nbc);
 	}
 	lc_destroy(resync_lru);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 19b3568e9326..77d70eebb6b2 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2296,15 +2296,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 	 * so bdget() can't fail.
 	 */
 	bdget(pd->bdev->bd_dev);
-	if ((ret = blkdev_get(pd->bdev, FMODE_READ)))
+	if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd)))
 		goto out;
 
-	if ((ret = bd_claim(pd->bdev, pd)))
-		goto out_putdev;
-
 	if ((ret = pkt_get_last_written(pd, &lba))) {
 		printk(DRIVER_NAME": pkt_get_last_written failed\n");
-		goto out_unclaim;
+		goto out_putdev;
 	}
 
 	set_capacity(pd->disk, lba << 2);
@@ -2314,7 +2311,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 	q = bdev_get_queue(pd->bdev);
 	if (write) {
 		if ((ret = pkt_open_write(pd)))
-			goto out_unclaim;
+			goto out_putdev;
 		/*
 		 * Some CDRW drives can not handle writes larger than one packet,
 		 * even if the size is a multiple of the packet size.
@@ -2329,23 +2326,21 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 	}
 
 	if ((ret = pkt_set_segment_merging(pd, q)))
-		goto out_unclaim;
+		goto out_putdev;
 
 	if (write) {
 		if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
 			printk(DRIVER_NAME": not enough memory for buffers\n");
 			ret = -ENOMEM;
-			goto out_unclaim;
+			goto out_putdev;
 		}
 		printk(DRIVER_NAME": %lukB available on disc\n", lba << 1);
 	}
 
 	return 0;
 
-out_unclaim:
-	bd_release(pd->bdev);
 out_putdev:
-	blkdev_put(pd->bdev, FMODE_READ);
+	blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
 out:
 	return ret;
 }
@@ -2362,8 +2357,7 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
 	pkt_lock_door(pd, 0);
 
 	pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
-	bd_release(pd->bdev);
-	blkdev_put(pd->bdev, FMODE_READ);
+	blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
 
 	pkt_shrink_pktlist(pd);
 }
@@ -2733,7 +2727,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 	bdev = bdget(dev);
 	if (!bdev)
 		return -ENOMEM;
-	ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY);
+	ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
 	if (ret)
 		return ret;
 
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index bfe25ea9766b..b4b9d5a47885 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -65,15 +65,12 @@ static int raw_open(struct inode *inode, struct file *filp)
 	if (!bdev)
 		goto out;
 	igrab(bdev->bd_inode);
-	err = blkdev_get(bdev, filp->f_mode);
+	err = blkdev_get(bdev, filp->f_mode | FMODE_EXCL, raw_open);
 	if (err)
 		goto out;
-	err = bd_claim(bdev, raw_open);
-	if (err)
-		goto out1;
 	err = set_blocksize(bdev, bdev_logical_block_size(bdev));
 	if (err)
-		goto out2;
+		goto out1;
 	filp->f_flags |= O_DIRECT;
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 	if (++raw_devices[minor].inuse == 1)
@@ -83,10 +80,8 @@ static int raw_open(struct inode *inode, struct file *filp)
 	mutex_unlock(&raw_mutex);
 	return 0;
 
-out2:
-	bd_release(bdev);
 out1:
-	blkdev_put(bdev, filp->f_mode);
+	blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
 out:
 	mutex_unlock(&raw_mutex);
 	return err;
@@ -110,8 +105,7 @@ static int raw_release(struct inode *inode, struct file *filp)
 	}
 	mutex_unlock(&raw_mutex);
 
-	bd_release(bdev);
-	blkdev_put(bdev, filp->f_mode);
+	blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
 	return 0;
 }
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2c876ffc63df..9e88ca0c55e9 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -325,20 +325,13 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev,
 
 	BUG_ON(d->dm_dev.bdev);
 
-	bdev = open_by_devnum(dev, d->dm_dev.mode);
+	bdev = open_by_devnum(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
-	r = bd_claim(bdev, _claim_ptr);
-	if (r) {
-		blkdev_put(bdev, d->dm_dev.mode);
-		return r;
-	}
-
 	r = bd_link_disk_holder(bdev, dm_disk(md));
 	if (r) {
-		bd_release(bdev);
-		blkdev_put(bdev, d->dm_dev.mode);
+		blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL);
 		return r;
 	}
 
@@ -354,9 +347,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 	if (!d->dm_dev.bdev)
 		return;
 
-	bd_unlink_disk_holder(d->dm_dev.bdev);
-	bd_release(d->dm_dev.bdev);
-	blkdev_put(d->dm_dev.bdev, d->dm_dev.mode);
+	blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL);
 	d->dm_dev.bdev = NULL;
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c47644fca1a1..6af951ffe0bb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1907,7 +1907,6 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 		MD_BUG();
 		return;
 	}
-	bd_unlink_disk_holder(rdev->bdev);
 	list_del_rcu(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
@@ -1935,19 +1934,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+			      shared ? (mdk_rdev_t *)lock_rdev : rdev);
 	if (IS_ERR(bdev)) {
 		printk(KERN_ERR "md: could not open %s.\n",
 			__bdevname(dev, b));
 		return PTR_ERR(bdev);
 	}
-	err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
-	if (err) {
-		printk(KERN_ERR "md: could not bd_claim %s.\n",
-			bdevname(bdev, b));
-		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
-		return err;
-	}
 	if (!shared)
 		set_bit(AllReserved, &rdev->flags);
 	rdev->bdev = bdev;
@@ -1960,8 +1953,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
 	rdev->bdev = NULL;
 	if (!bdev)
 		MD_BUG();
-	bd_release(bdev);
-	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 void md_autodetect_dev(dev_t dev);
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index a9e2d3b38aeb..aa557beb8f51 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -224,7 +224,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev)
 	if (dev->blkdev) {
 		invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping,
 					0, -1);
-		close_bdev_exclusive(dev->blkdev, FMODE_READ|FMODE_WRITE);
+		blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 	}
 
 	kfree(dev);
@@ -234,7 +234,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev)
 /* FIXME: ensure that mtd->size % erase_size == 0 */
 static struct block2mtd_dev *add_device(char *devname, int erase_size)
 {
-	const fmode_t mode = FMODE_READ | FMODE_WRITE;
+	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
 	struct block_device *bdev;
 	struct block2mtd_dev *dev;
 	char *name;
@@ -255,17 +255,8 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size)
 		   to resolve the device name by other means. */
 
 		dev_t devt = name_to_dev_t(devname);
-		if (devt) {
-			bdev = open_by_devnum(devt, mode);
-			if (!IS_ERR(bdev)) {
-				int ret;
-				ret = bd_claim(bdev, dev);
-				if (ret) {
-					blkdev_put(bdev, mode);
-					bdev = ERR_PTR(ret);
-				}
-			}
-		}
+		if (devt)
+			bdev = open_by_devnum(devt, mode, dev);
 	}
 #endif
 
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index 30a1ca3d08b7..5505bc07e1e7 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -103,7 +103,7 @@ int dasd_scan_partitions(struct dasd_block *block)
 	struct block_device *bdev;
 
 	bdev = bdget_disk(block->gdp, 0);
-	if (!bdev || blkdev_get(bdev, FMODE_READ) < 0)
+	if (!bdev || blkdev_get(bdev, FMODE_READ, NULL) < 0)
 		return -ENODEV;
 	/*
 	 * See fs/partition/check.c:register_disk,rescan_partitions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9329068684d2..fc48912354d1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -660,7 +660,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
 	else if (bdev->bd_contains == bdev)
 		return true;  	 /* is a whole device which isn't held */
 
-	else if (whole->bd_holder == bd_claim)
+	else if (whole->bd_holder == bd_may_claim)
 		return true; 	 /* is a partition of a device that is being partitioned */
 	else if (whole->bd_holder != NULL)
 		return false;	 /* is a partition of a held device */
@@ -807,10 +807,10 @@ static void __bd_claim(struct block_device *bdev, struct block_device *whole,
 {
 	/* note that for a whole device bd_holders
 	 * will be incremented twice, and bd_holder will
-	 * be set to bd_claim before being set to holder
+	 * be set to bd_may_claim before being set to holder
 	 */
 	whole->bd_holders++;
-	whole->bd_holder = bd_claim;
+	whole->bd_holder = bd_may_claim;
 	bdev->bd_holders++;
 	bdev->bd_holder = holder;
 }
@@ -835,37 +835,7 @@ static void bd_finish_claiming(struct block_device *bdev,
 	__bd_abort_claiming(whole, holder); /* not actually an abort */
 }
 
-/**
- * bd_claim - claim a block device
- * @bdev: block device to claim
- * @holder: holder trying to claim @bdev
- *
- * Try to claim @bdev which must have been opened successfully.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * 0 if successful, -EBUSY if @bdev is already claimed.
- */
-int bd_claim(struct block_device *bdev, void *holder)
-{
-	struct block_device *whole = bdev->bd_contains;
-	int res;
-
-	might_sleep();
-
-	spin_lock(&bdev_lock);
-	res = bd_prepare_to_claim(bdev, whole, holder);
-	if (res == 0)
-		__bd_claim(bdev, whole, holder);
-	spin_unlock(&bdev_lock);
-
-	return res;
-}
-EXPORT_SYMBOL(bd_claim);
-
-void bd_release(struct block_device *bdev)
+static void bd_release(struct block_device *bdev)
 {
 	spin_lock(&bdev_lock);
 	if (!--bdev->bd_contains->bd_holders)
@@ -875,8 +845,6 @@ void bd_release(struct block_device *bdev)
 	spin_unlock(&bdev_lock);
 }
 
-EXPORT_SYMBOL(bd_release);
-
 #ifdef CONFIG_SYSFS
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
@@ -943,7 +911,7 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(bd_link_disk_holder);
 
-void bd_unlink_disk_holder(struct block_device *bdev)
+static void bd_unlink_disk_holder(struct block_device *bdev)
 {
 	struct gendisk *disk = bdev->bd_holder_disk;
 
@@ -954,7 +922,9 @@ void bd_unlink_disk_holder(struct block_device *bdev)
 	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
 	del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
 }
-EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
+#else
+static inline void bd_unlink_disk_holder(struct block_device *bdev)
+{ }
 #endif
 
 /*
@@ -964,12 +934,12 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
  * to be used for internal purposes.  If you ever need it - reconsider
  * your API.
  */
-struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
+struct block_device *open_by_devnum(dev_t dev, fmode_t mode, void *holder)
 {
 	struct block_device *bdev = bdget(dev);
 	int err = -ENOMEM;
 	if (bdev)
-		err = blkdev_get(bdev, mode);
+		err = blkdev_get(bdev, mode, holder);
 	return err ? ERR_PTR(err) : bdev;
 }
 
@@ -1235,17 +1205,37 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	return ret;
 }
 
-int blkdev_get(struct block_device *bdev, fmode_t mode)
+int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 {
-	return __blkdev_get(bdev, mode, 0);
+	struct block_device *whole = NULL;
+	int res;
+
+	WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
+
+	if ((mode & FMODE_EXCL) && holder) {
+		whole = bd_start_claiming(bdev, holder);
+		if (IS_ERR(whole)) {
+			bdput(bdev);
+			return PTR_ERR(whole);
+		}
+	}
+
+	res = __blkdev_get(bdev, mode, 0);
+
+	if (whole) {
+		if (res == 0)
+			bd_finish_claiming(bdev, whole, holder);
+		else
+			bd_abort_claiming(whole, holder);
+	}
+
+	return res;
 }
 EXPORT_SYMBOL(blkdev_get);
 
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
-	struct block_device *whole = NULL;
 	struct block_device *bdev;
-	int res;
 
 	/*
 	 * Preserve backwards compatibility and allow large file access
@@ -1266,26 +1256,9 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if (bdev == NULL)
 		return -ENOMEM;
 
-	if (filp->f_mode & FMODE_EXCL) {
-		whole = bd_start_claiming(bdev, filp);
-		if (IS_ERR(whole)) {
-			bdput(bdev);
-			return PTR_ERR(whole);
-		}
-	}
-
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 
-	res = blkdev_get(bdev, filp->f_mode);
-
-	if (whole) {
-		if (res == 0)
-			bd_finish_claiming(bdev, whole, filp);
-		else
-			bd_abort_claiming(whole, filp);
-	}
-
-	return res;
+	return blkdev_get(bdev, filp->f_mode, filp);
 }
 
 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -1329,6 +1302,13 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 
 int blkdev_put(struct block_device *bdev, fmode_t mode)
 {
+	if (mode & FMODE_EXCL) {
+		mutex_lock(&bdev->bd_mutex);
+		bd_release(bdev);
+		if (!bdev->bd_holders)
+			bd_unlink_disk_holder(bdev);
+		mutex_unlock(&bdev->bd_mutex);
+	}
 	return __blkdev_put(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_put);
@@ -1336,8 +1316,7 @@ EXPORT_SYMBOL(blkdev_put);
 static int blkdev_close(struct inode * inode, struct file * filp)
 {
 	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
-	if (bdev->bd_holder == filp)
-		bd_release(bdev);
+
 	return blkdev_put(bdev, filp->f_mode);
 }
 
@@ -1494,55 +1473,27 @@ EXPORT_SYMBOL(lookup_bdev);
  */
 struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 {
-	struct block_device *bdev, *whole;
+	struct block_device *bdev;
 	int error;
 
 	bdev = lookup_bdev(path);
 	if (IS_ERR(bdev))
 		return bdev;
 
-	whole = bd_start_claiming(bdev, holder);
-	if (IS_ERR(whole)) {
-		bdput(bdev);
-		return whole;
-	}
-
-	error = blkdev_get(bdev, mode);
+	error = blkdev_get(bdev, mode | FMODE_EXCL, holder);
 	if (error)
-		goto out_abort_claiming;
+		return ERR_PTR(error);
 
-	error = -EACCES;
-	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
-		goto out_blkdev_put;
+	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+		blkdev_put(bdev, mode);
+		return ERR_PTR(-EACCES);
+	}
 
-	bd_finish_claiming(bdev, whole, holder);
 	return bdev;
-
-out_blkdev_put:
-	blkdev_put(bdev, mode);
-out_abort_claiming:
-	bd_abort_claiming(whole, holder);
-	return ERR_PTR(error);
 }
 
 EXPORT_SYMBOL(open_bdev_exclusive);
 
-/**
- * close_bdev_exclusive  -  close a blockdevice opened by open_bdev_exclusive()
- *
- * @bdev:	blockdevice to close
- * @mode:	mode, must match that used to open.
- *
- * This is the counterpart to open_bdev_exclusive().
- */
-void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
-{
-	bd_release(bdev);
-	blkdev_put(bdev, mode);
-}
-
-EXPORT_SYMBOL(close_bdev_exclusive);
-
 int __invalidate_device(struct block_device *bdev)
 {
 	struct super_block *sb = get_super(bdev);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d39596224d21..f1b729d3b883 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -489,7 +489,7 @@ again:
 			continue;
 
 		if (device->bdev) {
-			close_bdev_exclusive(device->bdev, device->mode);
+			blkdev_put(device->bdev, device->mode | FMODE_EXCL);
 			device->bdev = NULL;
 			fs_devices->open_devices--;
 		}
@@ -523,7 +523,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 
 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		if (device->bdev) {
-			close_bdev_exclusive(device->bdev, device->mode);
+			blkdev_put(device->bdev, device->mode | FMODE_EXCL);
 			fs_devices->open_devices--;
 		}
 		if (device->writeable) {
@@ -638,7 +638,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 error_brelse:
 		brelse(bh);
 error_close:
-		close_bdev_exclusive(bdev, flags);
+		blkdev_put(bdev, flags | FMODE_EXCL);
 error:
 		continue;
 	}
@@ -716,7 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
 	brelse(bh);
 error_close:
-	close_bdev_exclusive(bdev, flags);
+	blkdev_put(bdev, flags | FMODE_EXCL);
 error:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -1244,7 +1244,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
 
 	if (device->bdev) {
-		close_bdev_exclusive(device->bdev, device->mode);
+		blkdev_put(device->bdev, device->mode | FMODE_EXCL);
 		device->bdev = NULL;
 		device->fs_devices->open_devices--;
 	}
@@ -1287,7 +1287,7 @@ error_brelse:
 	brelse(bh);
 error_close:
 	if (bdev)
-		close_bdev_exclusive(bdev, FMODE_READ);
+		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
@@ -1565,7 +1565,7 @@ out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	return ret;
 error:
-	close_bdev_exclusive(bdev, 0);
+	blkdev_put(bdev, FMODE_EXCL);
 	if (seeding_dev) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2fedaf8b5012..23e7513dba9c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -347,7 +347,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
 	if (IS_ERR(bdev))
 		goto fail;
 	return bdev;
@@ -364,8 +364,7 @@ fail:
  */
 static int ext3_blkdev_put(struct block_device *bdev)
 {
-	bd_release(bdev);
-	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
@@ -2136,13 +2135,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	if (bdev == NULL)
 		return NULL;
 
-	if (bd_claim(bdev, sb)) {
-		ext3_msg(sb, KERN_ERR,
-			"error: failed to claim external journal device");
-		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
-		return NULL;
-	}
-
 	blocksize = sb->s_blocksize;
 	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 61182fe6254e..5dd0b3e76fa8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -647,7 +647,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
 	if (IS_ERR(bdev))
 		goto fail;
 	return bdev;
@@ -663,8 +663,7 @@ fail:
  */
 static int ext4_blkdev_put(struct block_device *bdev)
 {
-	bd_release(bdev);
-	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -3758,13 +3757,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	if (bdev == NULL)
 		return NULL;
 
-	if (bd_claim(bdev, sb)) {
-		ext4_msg(sb, KERN_ERR,
-			"failed to claim external journal device");
-		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
-		return NULL;
-	}
-
 	blocksize = sb->s_blocksize;
 	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3eb1393f7b81..c1f0763a022b 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 		goto error_bdev;
 
 	if (s->s_root)
-		close_bdev_exclusive(bdev, mode);
+		blkdev_put(bdev, mode | FMODE_EXCL);
 
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
@@ -1342,7 +1342,7 @@ error_super:
 	deactivate_locked_super(s);
 	return ERR_PTR(error);
 error_bdev:
-	close_bdev_exclusive(bdev, mode);
+	blkdev_put(bdev, mode | FMODE_EXCL);
 	return ERR_PTR(error);
 }
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index e1b8493b9aaa..5a290f22dcc3 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb)
 	 * file systems to log may have n-to-1 relationship;
 	 */
 
-	bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE);
+	bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+			      log);
 	if (IS_ERR(bdev)) {
 		rc = -PTR_ERR(bdev);
 		goto free;
 	}
 
-	if ((rc = bd_claim(bdev, log))) {
-		goto close;
-	}
-
 	log->bdev = bdev;
 	memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
 
@@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb)
 	 * initialize log:
 	 */
 	if ((rc = lmLogInit(log)))
-		goto unclaim;
+		goto close;
 
 	list_add(&log->journal_list, &jfs_external_logs);
 
@@ -1163,11 +1160,8 @@ journal_found:
 	list_del(&log->journal_list);
 	lbmLogShutdown(log);
 
-      unclaim:
-	bd_release(bdev);
-
       close:		/* close external log device */
-	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 
       free:		/* free log descriptor */
 	mutex_unlock(&jfs_log_mutex);
@@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb)
 	bdev = log->bdev;
 	rc = lmLogShutdown(log);
 
-	bd_release(bdev);
-	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 
 	kfree(log);
 
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 92ca6fbe09bd..734b9025858e 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
 
 static void bdev_put_device(struct logfs_super *s)
 {
-	close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE);
+	blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -331,7 +331,7 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
 
 	if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
 		int mtdnr = MINOR(bdev->bd_dev);
-		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 		return logfs_get_sb_mtd(p, mtdnr);
 	}
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f804d41ec9d3..756a6798d7c8 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1233,7 +1233,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 	}
 
 	if (!s_new)
-		close_bdev_exclusive(sd.bdev, mode);
+		blkdev_put(sd.bdev, mode | FMODE_EXCL);
 
 	return root_dentry;
 
@@ -1242,7 +1242,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 
  failed:
 	if (!s_new)
-		close_bdev_exclusive(sd.bdev, mode);
+		blkdev_put(sd.bdev, mode | FMODE_EXCL);
 	return ERR_PTR(err);
 }
 
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 52c7557f3e25..d0a2721eaceb 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1674,7 +1674,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 		goto out;
 
 	reg->hr_bdev = I_BDEV(filp->f_mapping->host);
-	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ);
+	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
 	if (ret) {
 		reg->hr_bdev = NULL;
 		goto out;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0a8b0ad0c7e2..2e6501d034ab 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -549,7 +549,7 @@ void register_disk(struct gendisk *disk)
 		goto exit;
 
 	bdev->bd_invalidated = 1;
-	err = blkdev_get(bdev, FMODE_READ);
+	err = blkdev_get(bdev, FMODE_READ, NULL);
 	if (err < 0)
 		goto exit;
 	blkdev_put(bdev, FMODE_READ);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 076c8b194682..b488136f5ace 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2552,8 +2552,6 @@ static int release_journal_dev(struct super_block *super,
 	result = 0;
 
 	if (journal->j_dev_bd != NULL) {
-		if (journal->j_dev_bd->bd_dev != super->s_dev)
-			bd_release(journal->j_dev_bd);
 		result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
 		journal->j_dev_bd = NULL;
 	}
@@ -2571,7 +2569,7 @@ static int journal_init_dev(struct super_block *super,
 {
 	int result;
 	dev_t jdev;
-	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE;
+	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
 	char b[BDEVNAME_SIZE];
 
 	result = 0;
@@ -2585,7 +2583,9 @@ static int journal_init_dev(struct super_block *super,
 
 	/* there is no "jdev" option and journal is on separate device */
 	if ((!jdev_name || !jdev_name[0])) {
-		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
+		if (jdev == super->s_dev)
+			blkdev_mode &= ~FMODE_EXCL;
+		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode, journal);
 		journal->j_dev_mode = blkdev_mode;
 		if (IS_ERR(journal->j_dev_bd)) {
 			result = PTR_ERR(journal->j_dev_bd);
@@ -2594,15 +2594,8 @@ static int journal_init_dev(struct super_block *super,
 					 "cannot init journal device '%s': %i",
 					 __bdevname(jdev, b), result);
 			return result;
-		} else if (jdev != super->s_dev) {
-			result = bd_claim(journal->j_dev_bd, journal);
-			if (result) {
-				blkdev_put(journal->j_dev_bd, blkdev_mode);
-				return result;
-			}
-
+		} else if (jdev != super->s_dev)
 			set_blocksize(journal->j_dev_bd, super->s_blocksize);
-		}
 
 		return 0;
 	}
diff --git a/fs/super.c b/fs/super.c
index ca696155cd9a..22374bf0ba87 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -801,13 +801,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 
 		/*
 		 * s_umount nests inside bd_mutex during
-		 * __invalidate_device().  close_bdev_exclusive()
-		 * acquires bd_mutex and can't be called under
-		 * s_umount.  Drop s_umount temporarily.  This is safe
-		 * as we're holding an active reference.
+		 * __invalidate_device().  blkdev_put() acquires
+		 * bd_mutex and can't be called under s_umount.  Drop
+		 * s_umount temporarily.  This is safe as we're
+		 * holding an active reference.
 		 */
 		up_write(&s->s_umount);
-		close_bdev_exclusive(bdev, mode);
+		blkdev_put(bdev, mode | FMODE_EXCL);
 		down_write(&s->s_umount);
 	} else {
 		char b[BDEVNAME_SIZE];
@@ -831,7 +831,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
-	close_bdev_exclusive(bdev, mode);
+	blkdev_put(bdev, mode | FMODE_EXCL);
 error:
 	return ERR_PTR(error);
 }
@@ -862,7 +862,7 @@ void kill_block_super(struct super_block *sb)
 	bdev->bd_super = NULL;
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
-	close_bdev_exclusive(bdev, mode);
+	blkdev_put(bdev, mode | FMODE_EXCL);
 }
 
 EXPORT_SYMBOL(kill_block_super);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9f3a78fe6ae4..a1a6e5ceea67 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -623,7 +623,7 @@ xfs_blkdev_put(
 	struct block_device	*bdev)
 {
 	if (bdev)
-		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 66b7f2c5d7e9..1a033e8ebe4c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2006,7 +2006,8 @@ extern struct block_device *bdgrab(struct block_device *bdev);
 extern void bd_set_size(struct block_device *, loff_t size);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
-extern struct block_device *open_by_devnum(dev_t, fmode_t);
+extern struct block_device *open_by_devnum(dev_t dev, fmode_t mode,
+					   void *holder);
 extern void invalidate_bdev(struct block_device *);
 extern int sync_blockdev(struct block_device *bdev);
 extern struct super_block *freeze_bdev(struct block_device *);
@@ -2037,22 +2038,16 @@ extern const struct file_operations def_fifo_fops;
 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
-extern int blkdev_get(struct block_device *, fmode_t);
-extern int blkdev_put(struct block_device *, fmode_t);
-extern int bd_claim(struct block_device *, void *);
-extern void bd_release(struct block_device *);
+extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
+extern int blkdev_put(struct block_device *bdev, fmode_t mode);
 #ifdef CONFIG_SYSFS
 extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
-extern void bd_unlink_disk_holder(struct block_device *bdev);
 #else
 static inline int bd_link_disk_holder(struct block_device *bdev,
 				      struct gendisk *disk)
 {
 	return 0;
 }
-static inline void bd_unlink_disk_holder(struct block_device *bdev)
-{
-}
 #endif
 #endif
 
@@ -2089,7 +2084,6 @@ extern const char *__bdevname(dev_t, char *buffer);
 extern const char *bdevname(struct block_device *bdev, char *buffer);
 extern struct block_device *lookup_bdev(const char *);
 extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *);
-extern void close_bdev_exclusive(struct block_device *, fmode_t);
 extern void blkdev_show(struct seq_file *,off_t);
 
 #else
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0e4a86ccf94..513a77f1a0b3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -223,7 +223,7 @@ static int swsusp_swap_check(void)
 		return res;
 
 	root_swap = res;
-	res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
+	res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
 	if (res)
 		return res;
 
@@ -907,7 +907,8 @@ int swsusp_check(void)
 {
 	int error;
 
-	hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
+	hib_resume_bdev = open_by_devnum(swsusp_resume_device,
+					 FMODE_READ, NULL);
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 67ddaaf98c74..b6adcfbf6f48 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1677,7 +1677,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
 		set_blocksize(bdev, p->old_block_size);
-		bd_release(bdev);
+		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 	} else {
 		mutex_lock(&inode->i_mutex);
 		inode->i_flags &= ~S_SWAPFILE;
@@ -1939,7 +1939,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	error = -EINVAL;
 	if (S_ISBLK(inode->i_mode)) {
 		bdev = I_BDEV(inode);
-		error = bd_claim(bdev, sys_swapon);
+		error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+				   sys_swapon);
 		if (error < 0) {
 			bdev = NULL;
 			error = -EINVAL;
@@ -2136,7 +2137,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 bad_swap:
 	if (bdev) {
 		set_blocksize(bdev, p->old_block_size);
-		bd_release(bdev);
+		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 	}
 	destroy_swap_extents(p);
 	swap_cgroup_swapoff(type);
-- 
cgit v1.2.3-59-g8ed1b


From 6a027eff62f6ae32d49f2ae5dadd6f4eee1ddae2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 13 Nov 2010 11:55:17 +0100
Subject: block: reorganize claim/release implementation

With claim/release rolled into blkdev_get/put(), there's no reason to
keep bd_abort/finish_claim(), __bd_claim() and bd_release() as
separate functions.  It only makes the code difficult to follow.
Collapse them into blkdev_get/put().  This will ease future changes
around claim/release.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 fs/block_dev.c | 127 ++++++++++++++++++++++-----------------------------------
 1 file changed, 48 insertions(+), 79 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index fc48912354d1..269bfbbd10fc 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -772,79 +772,6 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
 	}
 }
 
-/* releases bdev_lock */
-static void __bd_abort_claiming(struct block_device *whole, void *holder)
-{
-	BUG_ON(whole->bd_claiming != holder);
-	whole->bd_claiming = NULL;
-	wake_up_bit(&whole->bd_claiming, 0);
-
-	spin_unlock(&bdev_lock);
-	bdput(whole);
-}
-
-/**
- * bd_abort_claiming - abort claiming a block device
- * @whole: whole block device returned by bd_start_claiming()
- * @holder: holder trying to claim @bdev
- *
- * Abort a claiming block started by bd_start_claiming().  Note that
- * @whole is not the block device to be claimed but the whole device
- * returned by bd_start_claiming().
- *
- * CONTEXT:
- * Grabs and releases bdev_lock.
- */
-static void bd_abort_claiming(struct block_device *whole, void *holder)
-{
-	spin_lock(&bdev_lock);
-	__bd_abort_claiming(whole, holder);		/* releases bdev_lock */
-}
-
-/* increment holders when we have a legitimate claim. requires bdev_lock */
-static void __bd_claim(struct block_device *bdev, struct block_device *whole,
-					void *holder)
-{
-	/* note that for a whole device bd_holders
-	 * will be incremented twice, and bd_holder will
-	 * be set to bd_may_claim before being set to holder
-	 */
-	whole->bd_holders++;
-	whole->bd_holder = bd_may_claim;
-	bdev->bd_holders++;
-	bdev->bd_holder = holder;
-}
-
-/**
- * bd_finish_claiming - finish claiming a block device
- * @bdev: block device of interest (passed to bd_start_claiming())
- * @whole: whole block device returned by bd_start_claiming()
- * @holder: holder trying to claim @bdev
- *
- * Finish a claiming block started by bd_start_claiming().
- *
- * CONTEXT:
- * Grabs and releases bdev_lock.
- */
-static void bd_finish_claiming(struct block_device *bdev,
-				struct block_device *whole, void *holder)
-{
-	spin_lock(&bdev_lock);
-	BUG_ON(!bd_may_claim(bdev, whole, holder));
-	__bd_claim(bdev, whole, holder);
-	__bd_abort_claiming(whole, holder); /* not actually an abort */
-}
-
-static void bd_release(struct block_device *bdev)
-{
-	spin_lock(&bdev_lock);
-	if (!--bdev->bd_contains->bd_holders)
-		bdev->bd_contains->bd_holder = NULL;
-	if (!--bdev->bd_holders)
-		bdev->bd_holder = NULL;
-	spin_unlock(&bdev_lock);
-}
-
 #ifdef CONFIG_SYSFS
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
@@ -1223,10 +1150,30 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 	res = __blkdev_get(bdev, mode, 0);
 
 	if (whole) {
-		if (res == 0)
-			bd_finish_claiming(bdev, whole, holder);
-		else
-			bd_abort_claiming(whole, holder);
+		/* finish claiming */
+		spin_lock(&bdev_lock);
+
+		if (res == 0) {
+			BUG_ON(!bd_may_claim(bdev, whole, holder));
+			/*
+			 * Note that for a whole device bd_holders
+			 * will be incremented twice, and bd_holder
+			 * will be set to bd_may_claim before being
+			 * set to holder
+			 */
+			whole->bd_holders++;
+			whole->bd_holder = bd_may_claim;
+			bdev->bd_holders++;
+			bdev->bd_holder = holder;
+		}
+
+		/* tell others that we're done */
+		BUG_ON(whole->bd_claiming != holder);
+		whole->bd_claiming = NULL;
+		wake_up_bit(&whole->bd_claiming, 0);
+
+		spin_unlock(&bdev_lock);
+		bdput(whole);
 	}
 
 	return res;
@@ -1272,6 +1219,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_part_count--;
 
 	if (!--bdev->bd_openers) {
+		WARN_ON_ONCE(bdev->bd_holders);
 		sync_blockdev(bdev);
 		kill_bdev(bdev);
 	}
@@ -1303,10 +1251,31 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 int blkdev_put(struct block_device *bdev, fmode_t mode)
 {
 	if (mode & FMODE_EXCL) {
+		bool bdev_free;
+
+		/*
+		 * Release a claim on the device.  The holder fields
+		 * are protected with bdev_lock.  bd_mutex is to
+		 * synchronize disk_holder unlinking.
+		 */
 		mutex_lock(&bdev->bd_mutex);
-		bd_release(bdev);
-		if (!bdev->bd_holders)
+		spin_lock(&bdev_lock);
+
+		WARN_ON_ONCE(--bdev->bd_holders < 0);
+		WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
+
+		/* bd_contains might point to self, check in a separate step */
+		if ((bdev_free = !bdev->bd_holders))
+			bdev->bd_holder = NULL;
+		if (!bdev->bd_contains->bd_holders)
+			bdev->bd_contains->bd_holder = NULL;
+
+		spin_unlock(&bdev_lock);
+
+		/* if this was the last claim, holder link should go too */
+		if (bdev_free)
 			bd_unlink_disk_holder(bdev);
+
 		mutex_unlock(&bdev->bd_mutex);
 	}
 	return __blkdev_put(bdev, mode, 0);
-- 
cgit v1.2.3-59-g8ed1b


From 75f1dc0d076d1c1168f2115f1941ea627d38bd5a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 13 Nov 2010 11:55:17 +0100
Subject: block: check bdev_read_only() from blkdev_get()

bdev read-only status can be queried using bdev_read_only() and may
change while the device is being opened.  Enforce it by checking it
from blkdev_get() after open succeeds.

This makes bdev_read_only() check in open_bdev_exclusive() and
fsg_lun_open() unnecessary.  Drop them.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Brownell <dbrownell@users.sourceforge.net>
Cc: linux-usb@vger.kernel.org
---
 drivers/usb/gadget/storage_common.c |  7 ++-----
 fs/block_dev.c                      | 11 ++++++-----
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/usb/gadget/storage_common.c b/drivers/usb/gadget/storage_common.c
index 3b513bafaf2a..b015561fd602 100644
--- a/drivers/usb/gadget/storage_common.c
+++ b/drivers/usb/gadget/storage_common.c
@@ -543,7 +543,7 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename)
 	ro = curlun->initially_ro;
 	if (!ro) {
 		filp = filp_open(filename, O_RDWR | O_LARGEFILE, 0);
-		if (-EROFS == PTR_ERR(filp))
+		if (PTR_ERR(filp) == -EROFS || PTR_ERR(filp) == -EACCES)
 			ro = 1;
 	}
 	if (ro)
@@ -558,10 +558,7 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename)
 
 	if (filp->f_path.dentry)
 		inode = filp->f_path.dentry->d_inode;
-	if (inode && S_ISBLK(inode->i_mode)) {
-		if (bdev_read_only(inode->i_bdev))
-			ro = 1;
-	} else if (!inode || !S_ISREG(inode->i_mode)) {
+	if (!inode || (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) {
 		LINFO(curlun, "invalid file type: %s\n", filename);
 		goto out;
 	}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 269bfbbd10fc..606a5259f87f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1149,6 +1149,12 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 
 	res = __blkdev_get(bdev, mode, 0);
 
+	/* __blkdev_get() may alter read only status, check it afterwards */
+	if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+		__blkdev_put(bdev, mode, 0);
+		res = -EACCES;
+	}
+
 	if (whole) {
 		/* finish claiming */
 		spin_lock(&bdev_lock);
@@ -1453,11 +1459,6 @@ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *h
 	if (error)
 		return ERR_PTR(error);
 
-	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
-		blkdev_put(bdev, mode);
-		return ERR_PTR(-EACCES);
-	}
-
 	return bdev;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From d4d77629953eabd3c14f6fa5746f6b28babfc55f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 13 Nov 2010 11:55:18 +0100
Subject: block: clean up blkdev_get() wrappers and their users

After recent blkdev_get() modifications, open_by_devnum() and
open_bdev_exclusive() are simple wrappers around blkdev_get().
Replace them with blkdev_get_by_dev() and blkdev_get_by_path().

blkdev_get_by_dev() is identical to open_by_devnum().
blkdev_get_by_path() is slightly different in that it doesn't
automatically add %FMODE_EXCL to @mode.

All users are converted.  Most conversions are mechanical and don't
introduce any behavior difference.  There are several exceptions.

* btrfs now sets FMODE_EXCL in btrfs_device->mode, so there's no
  reason to OR it explicitly on blkdev_put().

* gfs2, nilfs2 and the generic mount_bdev() now set FMODE_EXCL in
  sb->s_mode.

* With the above changes, sb->s_mode now always should contain
  FMODE_EXCL.  WARN_ON_ONCE() added to kill_block_super() to detect
  errors.

The new blkdev_get_*() functions are with proper docbook comments.
While at it, add function description to blkdev_get() too.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Joern Engel <joern@lazybastard.org>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Jan Kara <jack@suse.cz>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp>
Cc: reiserfs-devel@vger.kernel.org
Cc: xfs-masters@oss.sgi.com
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
 drivers/block/drbd/drbd_nl.c    |  12 ++--
 drivers/md/dm-table.c           |   2 +-
 drivers/md/md.c                 |   4 +-
 drivers/mtd/devices/block2mtd.c |   4 +-
 fs/block_dev.c                  | 139 +++++++++++++++++++++++++++-------------
 fs/btrfs/volumes.c              |  24 ++++---
 fs/btrfs/volumes.h              |   2 +-
 fs/ext3/super.c                 |   2 +-
 fs/ext4/super.c                 |   2 +-
 fs/gfs2/ops_fstype.c            |   8 +--
 fs/jfs/jfs_logmgr.c             |   4 +-
 fs/logfs/dev_bdev.c             |   3 +-
 fs/nilfs2/super.c               |   8 +--
 fs/reiserfs/journal.c           |   6 +-
 fs/super.c                      |   9 +--
 fs/xfs/linux-2.6/xfs_super.c    |   3 +-
 include/linux/fs.h              |   7 +-
 kernel/power/swap.c             |   4 +-
 18 files changed, 149 insertions(+), 94 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index fd0346090289..650e43ba4f7c 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -902,8 +902,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		}
 	}
 
-	bdev = open_bdev_exclusive(nbc->dc.backing_dev,
-				   FMODE_READ | FMODE_WRITE, mdev);
+	bdev = blkdev_get_by_path(nbc->dc.backing_dev,
+				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
 	if (IS_ERR(bdev)) {
 		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
 			PTR_ERR(bdev));
@@ -920,10 +920,10 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	 * should check it for you already; but if you don't, or
 	 * someone fooled it, we need to double check here)
 	 */
-	bdev = open_bdev_exclusive(nbc->dc.meta_dev,
-				   FMODE_READ | FMODE_WRITE,
-				   (nbc->dc.meta_dev_idx < 0) ?
-				   (void *)mdev : (void *)drbd_m_holder);
+	bdev = blkdev_get_by_path(nbc->dc.meta_dev,
+				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+				  (nbc->dc.meta_dev_idx < 0) ?
+				  (void *)mdev : (void *)drbd_m_holder);
 	if (IS_ERR(bdev)) {
 		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
 			PTR_ERR(bdev));
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 9e88ca0c55e9..67150c32986c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -325,7 +325,7 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev,
 
 	BUG_ON(d->dm_dev.bdev);
 
-	bdev = open_by_devnum(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
+	bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6af951ffe0bb..5aaa6bfbe638 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1934,8 +1934,8 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
-			      shared ? (mdk_rdev_t *)lock_rdev : rdev);
+	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				 shared ? (mdk_rdev_t *)lock_rdev : rdev);
 	if (IS_ERR(bdev)) {
 		printk(KERN_ERR "md: could not open %s.\n",
 			__bdevname(dev, b));
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index aa557beb8f51..f29a6f9df6e7 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -247,7 +247,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size)
 		return NULL;
 
 	/* Get a handle on the device */
-	bdev = open_bdev_exclusive(devname, mode, dev);
+	bdev = blkdev_get_by_path(devname, mode, dev);
 #ifndef MODULE
 	if (IS_ERR(bdev)) {
 
@@ -256,7 +256,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size)
 
 		dev_t devt = name_to_dev_t(devname);
 		if (devt)
-			bdev = open_by_devnum(devt, mode, dev);
+			bdev = blkdev_get_by_dev(devt, mode, dev);
 	}
 #endif
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 606a5259f87f..c1c1b8c3fb99 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -854,24 +854,6 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev)
 { }
 #endif
 
-/*
- * Tries to open block device by device number.  Use it ONLY if you
- * really do not have anything better - i.e. when you are behind a
- * truly sucky interface and all you are given is a device number.  _Never_
- * to be used for internal purposes.  If you ever need it - reconsider
- * your API.
- */
-struct block_device *open_by_devnum(dev_t dev, fmode_t mode, void *holder)
-{
-	struct block_device *bdev = bdget(dev);
-	int err = -ENOMEM;
-	if (bdev)
-		err = blkdev_get(bdev, mode, holder);
-	return err ? ERR_PTR(err) : bdev;
-}
-
-EXPORT_SYMBOL(open_by_devnum);
-
 /**
  * flush_disk - invalidates all buffer-cache entries on a disk
  *
@@ -1132,6 +1114,25 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	return ret;
 }
 
+/**
+ * blkdev_get - open a block device
+ * @bdev: block_device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open @bdev with @mode.  If @mode includes %FMODE_EXCL, @bdev is
+ * open with exclusive access.  Specifying %FMODE_EXCL with %NULL
+ * @holder is invalid.  Exclusive opens may nest for the same @holder.
+ *
+ * On success, the reference count of @bdev is unchanged.  On failure,
+ * @bdev is put.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
 int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 {
 	struct block_device *whole = NULL;
@@ -1186,6 +1187,80 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 }
 EXPORT_SYMBOL(blkdev_get);
 
+/**
+ * blkdev_get_by_path - open a block device by name
+ * @path: path to the block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by the device file at @path.  @mode
+ * and @holder are identical to blkdev_get().
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+					void *holder)
+{
+	struct block_device *bdev;
+	int err;
+
+	bdev = lookup_bdev(path);
+	if (IS_ERR(bdev))
+		return bdev;
+
+	err = blkdev_get(bdev, mode, holder);
+	if (err)
+		return ERR_PTR(err);
+
+	return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_path);
+
+/**
+ * blkdev_get_by_dev - open a block device by device number
+ * @dev: device number of block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by device number @dev.  @mode and
+ * @holder are identical to blkdev_get().
+ *
+ * Use it ONLY if you really do not have anything better - i.e. when
+ * you are behind a truly sucky interface and all you are given is a
+ * device number.  _Never_ to be used for internal purposes.  If you
+ * ever need it - reconsider your API.
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
+{
+	struct block_device *bdev;
+	int err;
+
+	bdev = bdget(dev);
+	if (!bdev)
+		return ERR_PTR(-ENOMEM);
+
+	err = blkdev_get(bdev, mode, holder);
+	if (err)
+		return ERR_PTR(err);
+
+	return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_dev);
+
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
 	struct block_device *bdev;
@@ -1436,34 +1511,6 @@ fail:
 }
 EXPORT_SYMBOL(lookup_bdev);
 
-/**
- * open_bdev_exclusive  -  open a block device by name and set it up for use
- *
- * @path:	special file representing the block device
- * @mode:	FMODE_... combination to pass be used
- * @holder:	owner for exclusion
- *
- * Open the blockdevice described by the special file at @path, claim it
- * for the @holder.
- */
-struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
-{
-	struct block_device *bdev;
-	int error;
-
-	bdev = lookup_bdev(path);
-	if (IS_ERR(bdev))
-		return bdev;
-
-	error = blkdev_get(bdev, mode | FMODE_EXCL, holder);
-	if (error)
-		return ERR_PTR(error);
-
-	return bdev;
-}
-
-EXPORT_SYMBOL(open_bdev_exclusive);
-
 int __invalidate_device(struct block_device *bdev)
 {
 	struct super_block *sb = get_super(bdev);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f1b729d3b883..95324e9f9280 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -489,7 +489,7 @@ again:
 			continue;
 
 		if (device->bdev) {
-			blkdev_put(device->bdev, device->mode | FMODE_EXCL);
+			blkdev_put(device->bdev, device->mode);
 			device->bdev = NULL;
 			fs_devices->open_devices--;
 		}
@@ -523,7 +523,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 
 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		if (device->bdev) {
-			blkdev_put(device->bdev, device->mode | FMODE_EXCL);
+			blkdev_put(device->bdev, device->mode);
 			fs_devices->open_devices--;
 		}
 		if (device->writeable) {
@@ -580,13 +580,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	int seeding = 1;
 	int ret = 0;
 
+	flags |= FMODE_EXCL;
+
 	list_for_each_entry(device, head, dev_list) {
 		if (device->bdev)
 			continue;
 		if (!device->name)
 			continue;
 
-		bdev = open_bdev_exclusive(device->name, flags, holder);
+		bdev = blkdev_get_by_path(device->name, flags, holder);
 		if (IS_ERR(bdev)) {
 			printk(KERN_INFO "open %s failed\n", device->name);
 			goto error;
@@ -638,7 +640,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 error_brelse:
 		brelse(bh);
 error_close:
-		blkdev_put(bdev, flags | FMODE_EXCL);
+		blkdev_put(bdev, flags);
 error:
 		continue;
 	}
@@ -684,7 +686,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
 	mutex_lock(&uuid_mutex);
 
-	bdev = open_bdev_exclusive(path, flags, holder);
+	flags |= FMODE_EXCL;
+	bdev = blkdev_get_by_path(path, flags, holder);
 
 	if (IS_ERR(bdev)) {
 		ret = PTR_ERR(bdev);
@@ -716,7 +719,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
 	brelse(bh);
 error_close:
-	blkdev_put(bdev, flags | FMODE_EXCL);
+	blkdev_put(bdev, flags);
 error:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -1179,8 +1182,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			goto out;
 		}
 	} else {
-		bdev = open_bdev_exclusive(device_path, FMODE_READ,
-				      root->fs_info->bdev_holder);
+		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
+					  root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
 			goto out;
@@ -1244,7 +1247,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
 
 	if (device->bdev) {
-		blkdev_put(device->bdev, device->mode | FMODE_EXCL);
+		blkdev_put(device->bdev, device->mode);
 		device->bdev = NULL;
 		device->fs_devices->open_devices--;
 	}
@@ -1439,7 +1442,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
 		return -EINVAL;
 
-	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
+	bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
+				  root->fs_info->bdev_holder);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2b638b6e4eea..856e75770304 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -49,7 +49,7 @@ struct btrfs_device {
 
 	struct block_device *bdev;
 
-	/* the mode sent to open_bdev_exclusive */
+	/* the mode sent to blkdev_get */
 	fmode_t mode;
 
 	char *name;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 23e7513dba9c..123720ba786d 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -347,7 +347,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
+	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
 	if (IS_ERR(bdev))
 		goto fail;
 	return bdev;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5dd0b3e76fa8..bd63e6927219 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -647,7 +647,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
+	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
 	if (IS_ERR(bdev))
 		goto fail;
 	return bdev;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c1f0763a022b..bc56ccf98ffd 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 {
 	struct block_device *bdev;
 	struct super_block *s;
-	fmode_t mode = FMODE_READ;
+	fmode_t mode = FMODE_READ | FMODE_EXCL;
 	int error;
 	struct gfs2_args args;
 	struct gfs2_sbd *sdp;
@@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
-	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
 
@@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 		goto error_bdev;
 
 	if (s->s_root)
-		blkdev_put(bdev, mode | FMODE_EXCL);
+		blkdev_put(bdev, mode);
 
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
@@ -1342,7 +1342,7 @@ error_super:
 	deactivate_locked_super(s);
 	return ERR_PTR(error);
 error_bdev:
-	blkdev_put(bdev, mode | FMODE_EXCL);
+	blkdev_put(bdev, mode);
 	return ERR_PTR(error);
 }
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 5a290f22dcc3..278e3fb40b71 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1120,8 +1120,8 @@ int lmLogOpen(struct super_block *sb)
 	 * file systems to log may have n-to-1 relationship;
 	 */
 
-	bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
-			      log);
+	bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				 log);
 	if (IS_ERR(bdev)) {
 		rc = -PTR_ERR(bdev);
 		goto free;
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 734b9025858e..723bc5bca09a 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -325,7 +325,8 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
 {
 	struct block_device *bdev;
 
-	bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
+	bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				  type);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 756a6798d7c8..0030640e2d72 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1147,14 +1147,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 {
 	struct nilfs_super_data sd;
 	struct super_block *s;
-	fmode_t mode = FMODE_READ;
+	fmode_t mode = FMODE_READ | FMODE_EXCL;
 	struct dentry *root_dentry;
 	int err, s_new = false;
 
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
-	sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+	sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
 	if (IS_ERR(sd.bdev))
 		return ERR_CAST(sd.bdev);
 
@@ -1233,7 +1233,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 	}
 
 	if (!s_new)
-		blkdev_put(sd.bdev, mode | FMODE_EXCL);
+		blkdev_put(sd.bdev, mode);
 
 	return root_dentry;
 
@@ -1242,7 +1242,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 
  failed:
 	if (!s_new)
-		blkdev_put(sd.bdev, mode | FMODE_EXCL);
+		blkdev_put(sd.bdev, mode);
 	return ERR_PTR(err);
 }
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index b488136f5ace..e2fce519c0f2 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2585,7 +2585,8 @@ static int journal_init_dev(struct super_block *super,
 	if ((!jdev_name || !jdev_name[0])) {
 		if (jdev == super->s_dev)
 			blkdev_mode &= ~FMODE_EXCL;
-		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode, journal);
+		journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
+						      journal);
 		journal->j_dev_mode = blkdev_mode;
 		if (IS_ERR(journal->j_dev_bd)) {
 			result = PTR_ERR(journal->j_dev_bd);
@@ -2601,8 +2602,7 @@ static int journal_init_dev(struct super_block *super,
 	}
 
 	journal->j_dev_mode = blkdev_mode;
-	journal->j_dev_bd = open_bdev_exclusive(jdev_name,
-						blkdev_mode, journal);
+	journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
 	if (IS_ERR(journal->j_dev_bd)) {
 		result = PTR_ERR(journal->j_dev_bd);
 		journal->j_dev_bd = NULL;
diff --git a/fs/super.c b/fs/super.c
index 22374bf0ba87..5d9a4497849a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -766,13 +766,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 {
 	struct block_device *bdev;
 	struct super_block *s;
-	fmode_t mode = FMODE_READ;
+	fmode_t mode = FMODE_READ | FMODE_EXCL;
 	int error = 0;
 
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
-	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
 
@@ -807,7 +807,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 		 * holding an active reference.
 		 */
 		up_write(&s->s_umount);
-		blkdev_put(bdev, mode | FMODE_EXCL);
+		blkdev_put(bdev, mode);
 		down_write(&s->s_umount);
 	} else {
 		char b[BDEVNAME_SIZE];
@@ -831,7 +831,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
-	blkdev_put(bdev, mode | FMODE_EXCL);
+	blkdev_put(bdev, mode);
 error:
 	return ERR_PTR(error);
 }
@@ -862,6 +862,7 @@ void kill_block_super(struct super_block *sb)
 	bdev->bd_super = NULL;
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
+	WARN_ON_ONCE(!(mode & FMODE_EXCL));
 	blkdev_put(bdev, mode | FMODE_EXCL);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a1a6e5ceea67..9209cd199c47 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -609,7 +609,8 @@ xfs_blkdev_get(
 {
 	int			error = 0;
 
-	*bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp);
+	*bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				    mp);
 	if (IS_ERR(*bdevp)) {
 		error = PTR_ERR(*bdevp);
 		printk("XFS: Invalid device [%s], error=%d\n", name, error);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1a033e8ebe4c..f48501563917 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2006,8 +2006,6 @@ extern struct block_device *bdgrab(struct block_device *bdev);
 extern void bd_set_size(struct block_device *, loff_t size);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
-extern struct block_device *open_by_devnum(dev_t dev, fmode_t mode,
-					   void *holder);
 extern void invalidate_bdev(struct block_device *);
 extern int sync_blockdev(struct block_device *bdev);
 extern struct super_block *freeze_bdev(struct block_device *);
@@ -2039,6 +2037,10 @@ extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
 extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
+extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+					       void *holder);
+extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
+					      void *holder);
 extern int blkdev_put(struct block_device *bdev, fmode_t mode);
 #ifdef CONFIG_SYSFS
 extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
@@ -2083,7 +2085,6 @@ static inline void unregister_chrdev(unsigned int major, const char *name)
 extern const char *__bdevname(dev_t, char *buffer);
 extern const char *bdevname(struct block_device *bdev, char *buffer);
 extern struct block_device *lookup_bdev(const char *);
-extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *);
 extern void blkdev_show(struct seq_file *,off_t);
 
 #else
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 513a77f1a0b3..b019609d1b45 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -907,8 +907,8 @@ int swsusp_check(void)
 {
 	int error;
 
-	hib_resume_bdev = open_by_devnum(swsusp_resume_device,
-					 FMODE_READ, NULL);
+	hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
+					    FMODE_READ, NULL);
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
-- 
cgit v1.2.3-59-g8ed1b


From bdc85df7a8417b9893443ff5520804699416b6f3 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Mon, 15 Nov 2010 19:37:36 +0100
Subject: blk-cgroup: Allow creation of hierarchical cgroups

o Allow hierarchical cgroup creation for blkio controller

o Currently we disallow it as both the io controller policies (throttling
  as well as proportion bandwidth) do not support hierarhical accounting
  and control. But the flip side is that blkio controller can not be used with
  libvirt as libvirt creates a cgroup hierarchy deeper than 1 level.

  <top-level-cgroup-dir>/<controller>/libvirt/qemu/<virtual-machine-groups>

o So this patch will allow creation of cgroup hierarhcy but at the backend
  everything will be treated as flat. So if somebody created a an hierarchy
  like as follows.

			root
			/  \
		     test1 test2
			|
		     test3

  CFQ and throttling will practically treat all groups at same level.

				pivot
			     /  |   \  \
			root  test1 test2  test3

o Once we have actual support for hierarchical accounting and control
  then we can introduce another cgroup tunable file "blkio.use_hierarchy"
  which will be 0 by default but if user wants to enforce hierarhical
  control then it can be set to 1. This way there should not be any
  ABI problems down the line.

o The only not so pretty part is introduction of extra file "use_hierarchy"
  down the line. Kame-san had mentioned that hierarhical accounting is
  expensive in memory controller hence they keep it off by default. I
  suspect same will be the case for IO controller also as for each IO
  completion we shall have to account IO through hierarchy up to the root.
  if yes, then it probably is not a very bad idea to introduce this extra
  file so that it will be used only when somebody needs it and some people
  might enable hierarchy only in part of the hierarchy.

o This is how basically memory controller also uses "use_hierarhcy" and
  they also allowed creation of hierarchies when actual backend support
  was not available.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Reviewed-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Reviewed-by: Ciju Rajan K <ciju@linux.vnet.ibm.com>
Tested-by: Ciju Rajan K <ciju@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 Documentation/cgroups/blkio-controller.txt | 27 +++++++++++++++++++++++++++
 block/blk-cgroup.c                         |  4 ----
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index d6da611f8f63..4ed7b5ceeed2 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -89,6 +89,33 @@ Throttling/Upper Limit policy
 
  Limits for writes can be put using blkio.write_bps_device file.
 
+Hierarchical Cgroups
+====================
+- Currently none of the IO control policy supports hierarhical groups. But
+  cgroup interface does allow creation of hierarhical cgroups and internally
+  IO policies treat them as flat hierarchy.
+
+  So this patch will allow creation of cgroup hierarhcy but at the backend
+  everything will be treated as flat. So if somebody created a hierarchy like
+  as follows.
+
+			root
+			/  \
+		     test1 test2
+			|
+		     test3
+
+  CFQ and throttling will practically treat all groups at same level.
+
+				pivot
+			     /  |   \  \
+			root  test1 test2  test3
+
+  Down the line we can implement hierarchical accounting/control support
+  and also introduce a new cgroup file "use_hierarchy" which will control
+  whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
+  This is how memory controller also has implemented the things.
+
 Various user visible config options
 ===================================
 CONFIG_BLK_CGROUP
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b1febd0f6d2a..455768a3eb9e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1452,10 +1452,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		goto done;
 	}
 
-	/* Currently we do not support hierarchy deeper than two level (0,1) */
-	if (parent != cgroup->top_cgroup)
-		return ERR_PTR(-EPERM);
-
 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 	if (!blkcg)
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3-59-g8ed1b


From d07335e51df0c6dec202d315fc4f1f7e100eec4e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 16 Nov 2010 12:52:38 +0100
Subject: block: Rename "block_remap" tracepoint to "block_bio_remap" to
 clarify the event.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c             | 10 +++++-----
 drivers/md/dm.c              |  4 ++--
 include/trace/events/block.h |  6 +++---
 kernel/trace/blktrace.c      | 12 ++++++------
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 4ce953f1b390..151070541e21 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -33,7 +33,7 @@
 
 #include "blk.h"
 
-EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
@@ -1329,9 +1329,9 @@ static inline void blk_partition_remap(struct bio *bio)
 		bio->bi_sector += p->start_sect;
 		bio->bi_bdev = bdev->bd_contains;
 
-		trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
-				    bdev->bd_dev,
-				    bio->bi_sector - p->start_sect);
+		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
+				      bdev->bd_dev,
+				      bio->bi_sector - p->start_sect);
 	}
 }
 
@@ -1500,7 +1500,7 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 
 		if (old_sector != -1)
-			trace_block_remap(q, bio, old_dev, old_sector);
+			trace_block_bio_remap(q, bio, old_dev, old_sector);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7cb1352f7e7a..0a2b5516bc21 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -990,8 +990,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
 
-		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev, sector);
+		trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
+				      tio->io->bio->bi_bdev->bd_dev, sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index d8ce278515c3..b56c65dc105d 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -486,16 +486,16 @@ TRACE_EVENT(block_split,
 );
 
 /**
- * block_remap - map request for a partition to the raw device
+ * block_bio_remap - map request for a logical device to the raw device
  * @q: queue holding the operation
  * @bio: revised operation
  * @dev: device for the operation
  * @from: original sector for the operation
  *
- * An operation for a partition on a block device has been mapped to the
+ * An operation for a logical device has been mapped to the
  * raw block device.
  */
-TRACE_EVENT(block_remap,
+TRACE_EVENT(block_bio_remap,
 
 	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
 		 sector_t from),
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7b8ec0281548..2b8e2ee7c0ef 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -887,7 +887,7 @@ static void blk_add_trace_split(void *ignore,
 }
 
 /**
- * blk_add_trace_remap - Add a trace for a remap operation
+ * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
  * @ignore:	trace callback data parameter (not used)
  * @q:		queue the io is for
  * @bio:	the source bio
@@ -899,9 +899,9 @@ static void blk_add_trace_split(void *ignore,
  *     it spans a stripe (or similar). Add a trace for that action.
  *
  **/
-static void blk_add_trace_remap(void *ignore,
-				struct request_queue *q, struct bio *bio,
-				dev_t dev, sector_t from)
+static void blk_add_trace_bio_remap(void *ignore,
+				    struct request_queue *q, struct bio *bio,
+				    dev_t dev, sector_t from)
 {
 	struct blk_trace *bt = q->blk_trace;
 	struct blk_io_trace_remap r;
@@ -1016,7 +1016,7 @@ static void blk_register_tracepoints(void)
 	WARN_ON(ret);
 	ret = register_trace_block_split(blk_add_trace_split, NULL);
 	WARN_ON(ret);
-	ret = register_trace_block_remap(blk_add_trace_remap, NULL);
+	ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
 	WARN_ON(ret);
 	ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
 	WARN_ON(ret);
@@ -1025,7 +1025,7 @@ static void blk_register_tracepoints(void)
 static void blk_unregister_tracepoints(void)
 {
 	unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
-	unregister_trace_block_remap(blk_add_trace_remap, NULL);
+	unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
 	unregister_trace_block_split(blk_add_trace_split, NULL);
 	unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
 	unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
-- 
cgit v1.2.3-59-g8ed1b


From b54ce60eb7f61f8e314b8b241b0469eda3bb1d42 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Tue, 30 Nov 2010 20:52:46 +0100
Subject: cfq-iosched: Get rid of st->active

When a cfq group is running, it won't be dequeued from service tree, so
there's no need to store the active one in st->active. Just gid rid of it.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 73a58628f54a..e18d316ae652 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -87,7 +87,6 @@ struct cfq_rb_root {
 	unsigned count;
 	unsigned total_weight;
 	u64 min_vdisktime;
-	struct rb_node *active;
 };
 #define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
 			.count = 0, .min_vdisktime = 0, }
@@ -563,11 +562,6 @@ static void update_min_vdisktime(struct cfq_rb_root *st)
 	u64 vdisktime = st->min_vdisktime;
 	struct cfq_group *cfqg;
 
-	if (st->active) {
-		cfqg = rb_entry_cfqg(st->active);
-		vdisktime = cfqg->vdisktime;
-	}
-
 	if (st->left) {
 		cfqg = rb_entry_cfqg(st->left);
 		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
@@ -894,9 +888,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 
-	if (st->active == &cfqg->rb_node)
-		st->active = NULL;
-
 	BUG_ON(cfqg->nr_cfqq < 1);
 	cfqg->nr_cfqq--;
 
@@ -1095,7 +1086,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 	if (!atomic_dec_and_test(&cfqg->ref))
 		return;
 	for_each_cfqg_st(cfqg, i, j, st)
-		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
+		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
 	kfree(cfqg);
 }
 
@@ -1687,9 +1678,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (cfqq == cfqd->active_queue)
 		cfqd->active_queue = NULL;
 
-	if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
-		cfqd->grp_service_tree.active = NULL;
-
 	if (cfqd->active_cic) {
 		put_io_context(cfqd->active_cic->ioc);
 		cfqd->active_cic = NULL;
@@ -2199,7 +2187,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
 	if (RB_EMPTY_ROOT(&st->rb))
 		return NULL;
 	cfqg = cfq_rb_first_group(st);
-	st->active = &cfqg->rb_node;
 	update_min_vdisktime(st);
 	return cfqg;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 760701bfe14faee8ea0608a9cab2046071d98a39 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Tue, 30 Nov 2010 20:52:47 +0100
Subject: cfq-iosched: Get rid of on_st flag

It's able to check whether a CFQ group on a service tree by
checking "cfqg->rb_node". There's no need to maintain an
extra flag here.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e18d316ae652..5d0349d602fe 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -179,7 +179,6 @@ struct cfq_group {
 	/* group service_tree key */
 	u64 vdisktime;
 	unsigned int weight;
-	bool on_st;
 
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
@@ -863,7 +862,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	struct rb_node *n;
 
 	cfqg->nr_cfqq++;
-	if (cfqg->on_st)
+	if (!RB_EMPTY_NODE(&cfqg->rb_node))
 		return;
 
 	/*
@@ -879,7 +878,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		cfqg->vdisktime = st->min_vdisktime;
 
 	__cfq_group_service_tree_add(st, cfqg);
-	cfqg->on_st = true;
 	st->total_weight += cfqg->weight;
 }
 
@@ -896,7 +894,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		return;
 
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	cfqg->on_st = false;
 	st->total_weight -= cfqg->weight;
 	if (!RB_EMPTY_NODE(&cfqg->rb_node))
 		cfq_rb_erase(&cfqg->rb_node, st);
-- 
cgit v1.2.3-59-g8ed1b


From e4ea0c16a85d221ebcc3a21f32e321440459e0fc Mon Sep 17 00:00:00 2001
From: Shaohua Li writes <shaohua.li@intel.com>
Date: Mon, 13 Dec 2010 14:32:22 +0100
Subject: block cfq: select new workload if priority changed

If priority is changed, continuing to check workload_expires and service tree
count of the previous workload does not make sense. We should always choose
the workload with lowest key of new priority in such case.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5d0349d602fe..9b186fd6bf47 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2101,6 +2101,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	unsigned count;
 	struct cfq_rb_root *st;
 	unsigned group_slice;
+	enum wl_prio_t original_prio = cfqd->serving_prio;
 
 	if (!cfqg) {
 		cfqd->serving_prio = IDLE_WORKLOAD;
@@ -2119,6 +2120,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		return;
 	}
 
+	if (original_prio != cfqd->serving_prio)
+		goto new_workload;
+
 	/*
 	 * For RT and BE, we have to choose also the type
 	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
@@ -2133,6 +2137,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	if (count && !time_after(jiffies, cfqd->workload_expires))
 		return;
 
+new_workload:
 	/* otherwise select new workload type */
 	cfqd->serving_type =
 		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
-- 
cgit v1.2.3-59-g8ed1b


From dddd9dc340ae1a41d90e084529ca979c77c4ecfe Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 8 Dec 2010 20:57:35 +0100
Subject: block: kill genhd_media_change_notify()

There's no user of the facility.  Kill it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c         | 25 -------------------------
 include/linux/genhd.h |  1 -
 2 files changed, 26 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 5fa2b44a72ff..0905ab22c8cf 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1110,29 +1110,6 @@ static int __init proc_genhd_init(void)
 module_init(proc_genhd_init);
 #endif /* CONFIG_PROC_FS */
 
-static void media_change_notify_thread(struct work_struct *work)
-{
-	struct gendisk *gd = container_of(work, struct gendisk, async_notify);
-	char event[] = "MEDIA_CHANGE=1";
-	char *envp[] = { event, NULL };
-
-	/*
-	 * set enviroment vars to indicate which event this is for
-	 * so that user space will know to go check the media status.
-	 */
-	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
-	put_device(gd->driverfs_dev);
-}
-
-#if 0
-void genhd_media_change_notify(struct gendisk *disk)
-{
-	get_device(disk->driverfs_dev);
-	schedule_work(&disk->async_notify);
-}
-EXPORT_SYMBOL_GPL(genhd_media_change_notify);
-#endif  /*  0  */
-
 dev_t blk_lookup_devt(const char *name, int partno)
 {
 	dev_t devt = MKDEV(0, 0);
@@ -1198,8 +1175,6 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
 		disk_to_dev(disk)->class = &block_class;
 		disk_to_dev(disk)->type = &disk_type;
 		device_initialize(disk_to_dev(disk));
-		INIT_WORK(&disk->async_notify,
-			media_change_notify_thread);
 	}
 	return disk;
 }
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 7a7b9c1644e4..5e4e6928387c 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -173,7 +173,6 @@ struct gendisk {
 	struct timer_rand_state *random;
 
 	atomic_t sync_io;		/* RAID */
-	struct work_struct async_notify;
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct blk_integrity *integrity;
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From d2bf1b6723ed0eab378363649d15b7893bf14e91 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 8 Dec 2010 20:57:36 +0100
Subject: block: move register_disk() and del_gendisk() to block/genhd.c

There's no reason for register_disk() and del_gendisk() to be in
fs/partitions/check.c.  Move both to genhd.c.  While at it, collapse
unlink_gendisk(), which was artificially in a separate function due to
genhd.c / check.c split, into del_gendisk().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c          | 90 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/partitions/check.c  | 89 -------------------------------------------------
 include/linux/blkdev.h |  1 -
 include/linux/genhd.h  |  1 -
 4 files changed, 87 insertions(+), 94 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 0905ab22c8cf..2e5e4c0a1133 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -502,6 +502,64 @@ static int exact_lock(dev_t devt, void *data)
 	return 0;
 }
 
+void register_disk(struct gendisk *disk)
+{
+	struct device *ddev = disk_to_dev(disk);
+	struct block_device *bdev;
+	struct disk_part_iter piter;
+	struct hd_struct *part;
+	int err;
+
+	ddev->parent = disk->driverfs_dev;
+
+	dev_set_name(ddev, disk->disk_name);
+
+	/* delay uevents, until we scanned partition table */
+	dev_set_uevent_suppress(ddev, 1);
+
+	if (device_add(ddev))
+		return;
+	if (!sysfs_deprecated) {
+		err = sysfs_create_link(block_depr, &ddev->kobj,
+					kobject_name(&ddev->kobj));
+		if (err) {
+			device_del(ddev);
+			return;
+		}
+	}
+	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
+	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
+
+	/* No minors to use for partitions */
+	if (!disk_partitionable(disk))
+		goto exit;
+
+	/* No such device (e.g., media were just removed) */
+	if (!get_capacity(disk))
+		goto exit;
+
+	bdev = bdget_disk(disk, 0);
+	if (!bdev)
+		goto exit;
+
+	bdev->bd_invalidated = 1;
+	err = blkdev_get(bdev, FMODE_READ, NULL);
+	if (err < 0)
+		goto exit;
+	blkdev_put(bdev, FMODE_READ);
+
+exit:
+	/* announce disk after possible partitions are created */
+	dev_set_uevent_suppress(ddev, 0);
+	kobject_uevent(&ddev->kobj, KOBJ_ADD);
+
+	/* announce possible partitions */
+	disk_part_iter_init(&piter, disk, 0);
+	while ((part = disk_part_iter_next(&piter)))
+		kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
+	disk_part_iter_exit(&piter);
+}
+
 /**
  * add_disk - add partitioning information to kernel list
  * @disk: per-device partitioning information
@@ -552,17 +610,43 @@ void add_disk(struct gendisk *disk)
 				   "bdi");
 	WARN_ON(retval);
 }
-
 EXPORT_SYMBOL(add_disk);
-EXPORT_SYMBOL(del_gendisk);	/* in partitions/check.c */
 
-void unlink_gendisk(struct gendisk *disk)
+void del_gendisk(struct gendisk *disk)
 {
+	struct disk_part_iter piter;
+	struct hd_struct *part;
+
+	/* invalidate stuff */
+	disk_part_iter_init(&piter, disk,
+			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
+	while ((part = disk_part_iter_next(&piter))) {
+		invalidate_partition(disk, part->partno);
+		delete_partition(disk, part->partno);
+	}
+	disk_part_iter_exit(&piter);
+
+	invalidate_partition(disk, 0);
+	blk_free_devt(disk_to_dev(disk)->devt);
+	set_capacity(disk, 0);
+	disk->flags &= ~GENHD_FL_UP;
+
 	sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
 	bdi_unregister(&disk->queue->backing_dev_info);
 	blk_unregister_queue(disk);
 	blk_unregister_region(disk_devt(disk), disk->minors);
+
+	part_stat_set_all(&disk->part0, 0);
+	disk->part0.stamp = 0;
+
+	kobject_put(disk->part0.holder_dir);
+	kobject_put(disk->slave_dir);
+	disk->driverfs_dev = NULL;
+	if (!sysfs_deprecated)
+		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
+	device_del(disk_to_dev(disk));
 }
+EXPORT_SYMBOL(del_gendisk);
 
 /**
  * get_gendisk - get partitioning information for a given device
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index bdf8d3cc95a4..9a48d65d9855 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -516,65 +516,6 @@ out_put:
 	return ERR_PTR(err);
 }
 
-/* Not exported, helper to add_disk(). */
-void register_disk(struct gendisk *disk)
-{
-	struct device *ddev = disk_to_dev(disk);
-	struct block_device *bdev;
-	struct disk_part_iter piter;
-	struct hd_struct *part;
-	int err;
-
-	ddev->parent = disk->driverfs_dev;
-
-	dev_set_name(ddev, disk->disk_name);
-
-	/* delay uevents, until we scanned partition table */
-	dev_set_uevent_suppress(ddev, 1);
-
-	if (device_add(ddev))
-		return;
-	if (!sysfs_deprecated) {
-		err = sysfs_create_link(block_depr, &ddev->kobj,
-					kobject_name(&ddev->kobj));
-		if (err) {
-			device_del(ddev);
-			return;
-		}
-	}
-	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
-	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
-
-	/* No minors to use for partitions */
-	if (!disk_partitionable(disk))
-		goto exit;
-
-	/* No such device (e.g., media were just removed) */
-	if (!get_capacity(disk))
-		goto exit;
-
-	bdev = bdget_disk(disk, 0);
-	if (!bdev)
-		goto exit;
-
-	bdev->bd_invalidated = 1;
-	err = blkdev_get(bdev, FMODE_READ, NULL);
-	if (err < 0)
-		goto exit;
-	blkdev_put(bdev, FMODE_READ);
-
-exit:
-	/* announce disk after possible partitions are created */
-	dev_set_uevent_suppress(ddev, 0);
-	kobject_uevent(&ddev->kobj, KOBJ_ADD);
-
-	/* announce possible partitions */
-	disk_part_iter_init(&piter, disk, 0);
-	while ((part = disk_part_iter_next(&piter)))
-		kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
-	disk_part_iter_exit(&piter);
-}
-
 static bool disk_unlock_native_capacity(struct gendisk *disk)
 {
 	const struct block_device_operations *bdops = disk->fops;
@@ -737,33 +678,3 @@ fail:
 }
 
 EXPORT_SYMBOL(read_dev_sector);
-
-void del_gendisk(struct gendisk *disk)
-{
-	struct disk_part_iter piter;
-	struct hd_struct *part;
-
-	/* invalidate stuff */
-	disk_part_iter_init(&piter, disk,
-			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
-	while ((part = disk_part_iter_next(&piter))) {
-		invalidate_partition(disk, part->partno);
-		delete_partition(disk, part->partno);
-	}
-	disk_part_iter_exit(&piter);
-
-	invalidate_partition(disk, 0);
-	blk_free_devt(disk_to_dev(disk)->devt);
-	set_capacity(disk, 0);
-	disk->flags &= ~GENHD_FL_UP;
-	unlink_gendisk(disk);
-	part_stat_set_all(&disk->part0, 0);
-	disk->part0.stamp = 0;
-
-	kobject_put(disk->part0.holder_dir);
-	kobject_put(disk->slave_dir);
-	disk->driverfs_dev = NULL;
-	if (!sysfs_deprecated)
-		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
-	device_del(disk_to_dev(disk));
-}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aae86fd10c4f..83031bcf8366 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -643,7 +643,6 @@ static inline void rq_flush_dcache_pages(struct request *rq)
 
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
-extern void register_disk(struct gendisk *dev);
 extern void generic_make_request(struct bio *bio);
 extern void blk_rq_init(struct request_queue *q, struct request *rq);
 extern void blk_put_request(struct request *);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5e4e6928387c..56e17ed24816 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -394,7 +394,6 @@ extern void part_round_stats(int cpu, struct hd_struct *part);
 /* block/genhd.c */
 extern void add_disk(struct gendisk *disk);
 extern void del_gendisk(struct gendisk *gp);
-extern void unlink_gendisk(struct gendisk *gp);
 extern struct gendisk *get_gendisk(dev_t dev, int *partno);
 extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
 
-- 
cgit v1.2.3-59-g8ed1b


From 77ea887e433ad8389d416826936c110fa7910f80 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 8 Dec 2010 20:57:37 +0100
Subject: implement in-kernel gendisk events handling

Currently, media presence polling for removeable block devices is done
from userland.  There are several issues with this.

* Polling is done by periodically opening the device.  For SCSI
  devices, the command sequence generated by such action involves a
  few different commands including TEST_UNIT_READY.  This behavior,
  while perfectly legal, is different from Windows which only issues
  single command, GET_EVENT_STATUS_NOTIFICATION.  Unfortunately, some
  ATAPI devices lock up after being periodically queried such command
  sequences.

* There is no reliable and unintrusive way for a userland program to
  tell whether the target device is safe for media presence polling.
  For example, polling for media presence during an on-going burning
  session can make it fail.  The polling program can avoid this by
  opening the device with O_EXCL but then it risks making a valid
  exclusive user of the device fail w/ -EBUSY.

* Userland polling is unnecessarily heavy and in-kernel implementation
  is lighter and better coordinated (workqueue, timer slack).

This patch implements framework for in-kernel disk event handling,
which includes media presence polling.

* bdops->check_events() is added, which supercedes ->media_changed().
  It should check whether there's any pending event and return if so.
  Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
  DISK_EVENT_EJECT_REQUEST.  ->check_events() is guaranteed not to be
  called parallelly.

* gendisk->events and ->async_events are added.  These should be
  initialized by block driver before passing the device to add_disk().
  The former contains the mask of all supported events and the latter
  the mask of all events which the device can report without polling.
  /sys/block/*/events[_async] export these to userland.

* Kernel parameter block.events_dfl_poll_msecs controls the system
  polling interval (default is 0 which means disable) and
  /sys/block/*/events_poll_msecs control polling intervals for
  individual devices (default is -1 meaning use system setting).  Note
  that if a device can report all supported events asynchronously and
  its polling interval isn't explicitly set, the device won't be
  polled regardless of the system polling interval.

* If a device is opened exclusively with write access, event checking
  is automatically disabled until all write exclusive accesses are
  released.

* There are event 'clearing' events.  For example, both of currently
  defined events are cleared after the device has been successfully
  opened.  This information is passed to ->check_events() callback
  using @clearing argument as a hint.

* Event checking is always performed from system_nrt_wq and timer
  slack is set to 25% for polling.

* Nothing changes for drivers which implement ->media_changed() but
  not ->check_events().  Going forward, all drivers will be converted
  to ->check_events() and ->media_change() will be dropped.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c          | 429 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/block_dev.c         |  41 ++++-
 include/linux/blkdev.h |   3 +
 include/linux/fs.h     |   1 +
 include/linux/genhd.h  |  18 ++-
 5 files changed, 484 insertions(+), 8 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 2e5e4c0a1133..5465a824d489 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -18,6 +18,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mutex.h>
 #include <linux/idr.h>
+#include <linux/log2.h>
 
 #include "blk.h"
 
@@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr);
 
 static struct device_type disk_type;
 
+static void disk_add_events(struct gendisk *disk);
+static void disk_del_events(struct gendisk *disk);
+static void disk_release_events(struct gendisk *disk);
+
 /**
  * disk_get_part - get partition
  * @disk: disk to look partition from
@@ -609,6 +614,8 @@ void add_disk(struct gendisk *disk)
 	retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
 				   "bdi");
 	WARN_ON(retval);
+
+	disk_add_events(disk);
 }
 EXPORT_SYMBOL(add_disk);
 
@@ -617,6 +624,8 @@ void del_gendisk(struct gendisk *disk)
 	struct disk_part_iter piter;
 	struct hd_struct *part;
 
+	disk_del_events(disk);
+
 	/* invalidate stuff */
 	disk_part_iter_init(&piter, disk,
 			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
@@ -1089,6 +1098,7 @@ static void disk_release(struct device *dev)
 {
 	struct gendisk *disk = dev_to_disk(dev);
 
+	disk_release_events(disk);
 	kfree(disk->random);
 	disk_replace_part_tbl(disk, NULL);
 	free_part_stats(&disk->part0);
@@ -1350,3 +1360,422 @@ int invalidate_partition(struct gendisk *disk, int partno)
 }
 
 EXPORT_SYMBOL(invalidate_partition);
+
+/*
+ * Disk events - monitor disk events like media change and eject request.
+ */
+struct disk_events {
+	struct list_head	node;		/* all disk_event's */
+	struct gendisk		*disk;		/* the associated disk */
+	spinlock_t		lock;
+
+	int			block;		/* event blocking depth */
+	unsigned int		pending;	/* events already sent out */
+	unsigned int		clearing;	/* events being cleared */
+
+	long			poll_msecs;	/* interval, -1 for default */
+	struct delayed_work	dwork;
+};
+
+static const char *disk_events_strs[] = {
+	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "media_change",
+	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "eject_request",
+};
+
+static char *disk_uevents[] = {
+	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "DISK_MEDIA_CHANGE=1",
+	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "DISK_EJECT_REQUEST=1",
+};
+
+/* list of all disk_events */
+static DEFINE_MUTEX(disk_events_mutex);
+static LIST_HEAD(disk_events);
+
+/* disable in-kernel polling by default */
+static unsigned long disk_events_dfl_poll_msecs	= 0;
+
+static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
+{
+	struct disk_events *ev = disk->ev;
+	long intv_msecs = 0;
+
+	/*
+	 * If device-specific poll interval is set, always use it.  If
+	 * the default is being used, poll iff there are events which
+	 * can't be monitored asynchronously.
+	 */
+	if (ev->poll_msecs >= 0)
+		intv_msecs = ev->poll_msecs;
+	else if (disk->events & ~disk->async_events)
+		intv_msecs = disk_events_dfl_poll_msecs;
+
+	return msecs_to_jiffies(intv_msecs);
+}
+
+static void __disk_block_events(struct gendisk *disk, bool sync)
+{
+	struct disk_events *ev = disk->ev;
+	unsigned long flags;
+	bool cancel;
+
+	spin_lock_irqsave(&ev->lock, flags);
+	cancel = !ev->block++;
+	spin_unlock_irqrestore(&ev->lock, flags);
+
+	if (cancel) {
+		if (sync)
+			cancel_delayed_work_sync(&disk->ev->dwork);
+		else
+			cancel_delayed_work(&disk->ev->dwork);
+	}
+}
+
+static void __disk_unblock_events(struct gendisk *disk, bool check_now)
+{
+	struct disk_events *ev = disk->ev;
+	unsigned long intv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ev->lock, flags);
+
+	if (WARN_ON_ONCE(ev->block <= 0))
+		goto out_unlock;
+
+	if (--ev->block)
+		goto out_unlock;
+
+	/*
+	 * Not exactly a latency critical operation, set poll timer
+	 * slack to 25% and kick event check.
+	 */
+	intv = disk_events_poll_jiffies(disk);
+	set_timer_slack(&ev->dwork.timer, intv / 4);
+	if (check_now)
+		queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
+	else if (intv)
+		queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
+out_unlock:
+	spin_unlock_irqrestore(&ev->lock, flags);
+}
+
+/**
+ * disk_block_events - block and flush disk event checking
+ * @disk: disk to block events for
+ *
+ * On return from this function, it is guaranteed that event checking
+ * isn't in progress and won't happen until unblocked by
+ * disk_unblock_events().  Events blocking is counted and the actual
+ * unblocking happens after the matching number of unblocks are done.
+ *
+ * Note that this intentionally does not block event checking from
+ * disk_clear_events().
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void disk_block_events(struct gendisk *disk)
+{
+	if (disk->ev)
+		__disk_block_events(disk, true);
+}
+
+/**
+ * disk_unblock_events - unblock disk event checking
+ * @disk: disk to unblock events for
+ *
+ * Undo disk_block_events().  When the block count reaches zero, it
+ * starts events polling if configured.
+ *
+ * CONTEXT:
+ * Don't care.  Safe to call from irq context.
+ */
+void disk_unblock_events(struct gendisk *disk)
+{
+	if (disk->ev)
+		__disk_unblock_events(disk, true);
+}
+
+/**
+ * disk_check_events - schedule immediate event checking
+ * @disk: disk to check events for
+ *
+ * Schedule immediate event checking on @disk if not blocked.
+ *
+ * CONTEXT:
+ * Don't care.  Safe to call from irq context.
+ */
+void disk_check_events(struct gendisk *disk)
+{
+	if (disk->ev) {
+		__disk_block_events(disk, false);
+		__disk_unblock_events(disk, true);
+	}
+}
+EXPORT_SYMBOL_GPL(disk_check_events);
+
+/**
+ * disk_clear_events - synchronously check, clear and return pending events
+ * @disk: disk to fetch and clear events from
+ * @mask: mask of events to be fetched and clearted
+ *
+ * Disk events are synchronously checked and pending events in @mask
+ * are cleared and returned.  This ignores the block count.
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
+{
+	const struct block_device_operations *bdops = disk->fops;
+	struct disk_events *ev = disk->ev;
+	unsigned int pending;
+
+	if (!ev) {
+		/* for drivers still using the old ->media_changed method */
+		if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
+		    bdops->media_changed && bdops->media_changed(disk))
+			return DISK_EVENT_MEDIA_CHANGE;
+		return 0;
+	}
+
+	/* tell the workfn about the events being cleared */
+	spin_lock_irq(&ev->lock);
+	ev->clearing |= mask;
+	spin_unlock_irq(&ev->lock);
+
+	/* uncondtionally schedule event check and wait for it to finish */
+	__disk_block_events(disk, true);
+	queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
+	flush_delayed_work(&ev->dwork);
+	__disk_unblock_events(disk, false);
+
+	/* then, fetch and clear pending events */
+	spin_lock_irq(&ev->lock);
+	WARN_ON_ONCE(ev->clearing & mask);	/* cleared by workfn */
+	pending = ev->pending & mask;
+	ev->pending &= ~mask;
+	spin_unlock_irq(&ev->lock);
+
+	return pending;
+}
+
+static void disk_events_workfn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
+	struct gendisk *disk = ev->disk;
+	char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
+	unsigned int clearing = ev->clearing;
+	unsigned int events;
+	unsigned long intv;
+	int nr_events = 0, i;
+
+	/* check events */
+	events = disk->fops->check_events(disk, clearing);
+
+	/* accumulate pending events and schedule next poll if necessary */
+	spin_lock_irq(&ev->lock);
+
+	events &= ~ev->pending;
+	ev->pending |= events;
+	ev->clearing &= ~clearing;
+
+	intv = disk_events_poll_jiffies(disk);
+	if (!ev->block && intv)
+		queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
+
+	spin_unlock_irq(&ev->lock);
+
+	/* tell userland about new events */
+	for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
+		if (events & (1 << i))
+			envp[nr_events++] = disk_uevents[i];
+
+	if (nr_events)
+		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+}
+
+/*
+ * A disk events enabled device has the following sysfs nodes under
+ * its /sys/block/X/ directory.
+ *
+ * events		: list of all supported events
+ * events_async		: list of events which can be detected w/o polling
+ * events_poll_msecs	: polling interval, 0: disable, -1: system default
+ */
+static ssize_t __disk_events_show(unsigned int events, char *buf)
+{
+	const char *delim = "";
+	ssize_t pos = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
+		if (events & (1 << i)) {
+			pos += sprintf(buf + pos, "%s%s",
+				       delim, disk_events_strs[i]);
+			delim = " ";
+		}
+	if (pos)
+		pos += sprintf(buf + pos, "\n");
+	return pos;
+}
+
+static ssize_t disk_events_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return __disk_events_show(disk->events, buf);
+}
+
+static ssize_t disk_events_async_show(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return __disk_events_show(disk->async_events, buf);
+}
+
+static ssize_t disk_events_poll_msecs_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
+}
+
+static ssize_t disk_events_poll_msecs_store(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t count)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	long intv;
+
+	if (!count || !sscanf(buf, "%ld", &intv))
+		return -EINVAL;
+
+	if (intv < 0 && intv != -1)
+		return -EINVAL;
+
+	__disk_block_events(disk, true);
+	disk->ev->poll_msecs = intv;
+	__disk_unblock_events(disk, true);
+
+	return count;
+}
+
+static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
+static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
+static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
+			 disk_events_poll_msecs_show,
+			 disk_events_poll_msecs_store);
+
+static const struct attribute *disk_events_attrs[] = {
+	&dev_attr_events.attr,
+	&dev_attr_events_async.attr,
+	&dev_attr_events_poll_msecs.attr,
+	NULL,
+};
+
+/*
+ * The default polling interval can be specified by the kernel
+ * parameter block.events_dfl_poll_msecs which defaults to 0
+ * (disable).  This can also be modified runtime by writing to
+ * /sys/module/block/events_dfl_poll_msecs.
+ */
+static int disk_events_set_dfl_poll_msecs(const char *val,
+					  const struct kernel_param *kp)
+{
+	struct disk_events *ev;
+	int ret;
+
+	ret = param_set_ulong(val, kp);
+	if (ret < 0)
+		return ret;
+
+	mutex_lock(&disk_events_mutex);
+
+	list_for_each_entry(ev, &disk_events, node)
+		disk_check_events(ev->disk);
+
+	mutex_unlock(&disk_events_mutex);
+
+	return 0;
+}
+
+static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
+	.set	= disk_events_set_dfl_poll_msecs,
+	.get	= param_get_ulong,
+};
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX	"block."
+
+module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
+		&disk_events_dfl_poll_msecs, 0644);
+
+/*
+ * disk_{add|del|release}_events - initialize and destroy disk_events.
+ */
+static void disk_add_events(struct gendisk *disk)
+{
+	struct disk_events *ev;
+
+	if (!disk->fops->check_events || !(disk->events | disk->async_events))
+		return;
+
+	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+	if (!ev) {
+		pr_warn("%s: failed to initialize events\n", disk->disk_name);
+		return;
+	}
+
+	if (sysfs_create_files(&disk_to_dev(disk)->kobj,
+			       disk_events_attrs) < 0) {
+		pr_warn("%s: failed to create sysfs files for events\n",
+			disk->disk_name);
+		kfree(ev);
+		return;
+	}
+
+	disk->ev = ev;
+
+	INIT_LIST_HEAD(&ev->node);
+	ev->disk = disk;
+	spin_lock_init(&ev->lock);
+	ev->block = 1;
+	ev->poll_msecs = -1;
+	INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
+
+	mutex_lock(&disk_events_mutex);
+	list_add_tail(&ev->node, &disk_events);
+	mutex_unlock(&disk_events_mutex);
+
+	/*
+	 * Block count is initialized to 1 and the following initial
+	 * unblock kicks it into action.
+	 */
+	__disk_unblock_events(disk, true);
+}
+
+static void disk_del_events(struct gendisk *disk)
+{
+	if (!disk->ev)
+		return;
+
+	__disk_block_events(disk, true);
+
+	mutex_lock(&disk_events_mutex);
+	list_del_init(&disk->ev->node);
+	mutex_unlock(&disk_events_mutex);
+
+	sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
+}
+
+static void disk_release_events(struct gendisk *disk)
+{
+	/* the block count should be 1 from disk_del_events() */
+	WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
+	kfree(disk->ev);
+}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c1c1b8c3fb99..6017389711ee 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -948,10 +948,11 @@ int check_disk_change(struct block_device *bdev)
 {
 	struct gendisk *disk = bdev->bd_disk;
 	const struct block_device_operations *bdops = disk->fops;
+	unsigned int events;
 
-	if (!bdops->media_changed)
-		return 0;
-	if (!bdops->media_changed(bdev->bd_disk))
+	events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
+				   DISK_EVENT_EJECT_REQUEST);
+	if (!(events & DISK_EVENT_MEDIA_CHANGE))
 		return 0;
 
 	flush_disk(bdev);
@@ -1158,9 +1159,10 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 
 	if (whole) {
 		/* finish claiming */
+		mutex_lock(&bdev->bd_mutex);
 		spin_lock(&bdev_lock);
 
-		if (res == 0) {
+		if (!res) {
 			BUG_ON(!bd_may_claim(bdev, whole, holder));
 			/*
 			 * Note that for a whole device bd_holders
@@ -1180,6 +1182,20 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 		wake_up_bit(&whole->bd_claiming, 0);
 
 		spin_unlock(&bdev_lock);
+
+		/*
+		 * Block event polling for write claims.  Any write
+		 * holder makes the write_holder state stick until all
+		 * are released.  This is good enough and tracking
+		 * individual writeable reference is too fragile given
+		 * the way @mode is used in blkdev_get/put().
+		 */
+		if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+			bdev->bd_write_holder = true;
+			disk_block_events(bdev->bd_disk);
+		}
+
+		mutex_unlock(&bdev->bd_mutex);
 		bdput(whole);
 	}
 
@@ -1353,12 +1369,23 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
 
 		spin_unlock(&bdev_lock);
 
-		/* if this was the last claim, holder link should go too */
-		if (bdev_free)
+		/*
+		 * If this was the last claim, remove holder link and
+		 * unblock evpoll if it was a write holder.
+		 */
+		if (bdev_free) {
 			bd_unlink_disk_holder(bdev);
+			if (bdev->bd_write_holder) {
+				disk_unblock_events(bdev->bd_disk);
+				bdev->bd_write_holder = false;
+			} else
+				disk_check_events(bdev->bd_disk);
+		}
 
 		mutex_unlock(&bdev->bd_mutex);
-	}
+	} else
+		disk_check_events(bdev->bd_disk);
+
 	return __blkdev_put(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_put);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 83031bcf8366..05667e6989f1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1251,6 +1251,9 @@ struct block_device_operations {
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*direct_access) (struct block_device *, sector_t,
 						void **, unsigned long *);
+	unsigned int (*check_events) (struct gendisk *disk,
+				      unsigned int clearing);
+	/* ->media_changed() is DEPRECATED, use ->check_events() instead */
 	int (*media_changed) (struct gendisk *);
 	void (*unlock_native_capacity) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f48501563917..997d22efdef0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -662,6 +662,7 @@ struct block_device {
 	void *			bd_claiming;
 	void *			bd_holder;
 	int			bd_holders;
+	bool			bd_write_holder;
 #ifdef CONFIG_SYSFS
 	struct gendisk *	bd_holder_disk;	/* for sysfs slave linkng */
 #endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 56e17ed24816..13893aa2ac9d 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -127,6 +127,11 @@ struct hd_struct {
 #define GENHD_FL_EXT_DEVT			64 /* allow extended devt */
 #define GENHD_FL_NATIVE_CAPACITY		128
 
+enum {
+	DISK_EVENT_MEDIA_CHANGE			= 1 << 0, /* media changed */
+	DISK_EVENT_EJECT_REQUEST		= 1 << 1, /* eject requested */
+};
+
 #define BLK_SCSI_MAX_CMDS	(256)
 #define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
 
@@ -143,6 +148,8 @@ struct disk_part_tbl {
 	struct hd_struct __rcu *part[];
 };
 
+struct disk_events;
+
 struct gendisk {
 	/* major, first_minor and minors are input parameters only,
 	 * don't use directly.  Use disk_devt() and disk_max_parts().
@@ -154,6 +161,10 @@ struct gendisk {
 
 	char disk_name[DISK_NAME_LEN];	/* name of major driver */
 	char *(*devnode)(struct gendisk *gd, mode_t *mode);
+
+	unsigned int events;		/* supported events */
+	unsigned int async_events;	/* async events, subset of all */
+
 	/* Array of pointers to partitions indexed by partno.
 	 * Protected with matching bdev lock but stat and other
 	 * non-critical accesses use RCU.  Always access through
@@ -171,8 +182,8 @@ struct gendisk {
 	struct kobject *slave_dir;
 
 	struct timer_rand_state *random;
-
 	atomic_t sync_io;		/* RAID */
+	struct disk_events *ev;
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct blk_integrity *integrity;
 #endif
@@ -405,6 +416,11 @@ static inline int get_disk_ro(struct gendisk *disk)
 	return disk->part0.policy;
 }
 
+extern void disk_block_events(struct gendisk *disk);
+extern void disk_unblock_events(struct gendisk *disk);
+extern void disk_check_events(struct gendisk *disk);
+extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask);
+
 /* drivers/char/random.c */
 extern void add_disk_randomness(struct gendisk *disk);
 extern void rand_initialize_disk(struct gendisk *disk);
-- 
cgit v1.2.3-59-g8ed1b


From 2d9217296bfa6fdc0d3707264076e5296faffdbd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 8 Dec 2010 20:57:38 +0100
Subject: cdrom: add ->check_events() support

In principle, cdrom just needs to pass through ->check_events() but
CDROM_MEDIA_CHANGED ioctl makes things a bit more complex.  Just as
with ->media_changed() support, cdrom code needs to buffer the events
and serve them to ioctl and vfs as requested.

As the code has to deal with both ->check_events() and
->media_changed(), and vfs and ioctl event buffering, this patch adds
check_events caching on top of the existing cdi->mc_flags buffering.

It may be a good idea to deprecate CDROM_MEDIA_CHANGED ioctl and
remove all this mess.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/cdrom/cdrom.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/cdrom.h |  6 ++++++
 2 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index af13c62dc473..1ea8f8d363c6 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -1348,7 +1348,10 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot)
 	if (!CDROM_CAN(CDC_SELECT_DISC))
 		return -EDRIVE_CANT_DO_THIS;
 
-	(void) cdi->ops->media_changed(cdi, slot);
+	if (cdi->ops->check_events)
+		cdi->ops->check_events(cdi, 0, slot);
+	else
+		cdi->ops->media_changed(cdi, slot);
 
 	if (slot == CDSL_NONE) {
 		/* set media changed bits, on both queues */
@@ -1392,6 +1395,41 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot)
 	return slot;
 }
 
+/*
+ * As cdrom implements an extra ioctl consumer for media changed
+ * event, it needs to buffer ->check_events() output, such that event
+ * is not lost for both the usual VFS and ioctl paths.
+ * cdi->{vfs|ioctl}_events are used to buffer pending events for each
+ * path.
+ *
+ * XXX: Locking is non-existent.  cdi->ops->check_events() can be
+ * called in parallel and buffering fields are accessed without any
+ * exclusion.  The original media_changed code had the same problem.
+ * It might be better to simply deprecate CDROM_MEDIA_CHANGED ioctl
+ * and remove this cruft altogether.  It doesn't have much usefulness
+ * at this point.
+ */
+static void cdrom_update_events(struct cdrom_device_info *cdi,
+				unsigned int clearing)
+{
+	unsigned int events;
+
+	events = cdi->ops->check_events(cdi, clearing, CDSL_CURRENT);
+	cdi->vfs_events |= events;
+	cdi->ioctl_events |= events;
+}
+
+unsigned int cdrom_check_events(struct cdrom_device_info *cdi,
+				unsigned int clearing)
+{
+	unsigned int events;
+
+	cdrom_update_events(cdi, clearing);
+	events = cdi->vfs_events;
+	cdi->vfs_events = 0;
+	return events;
+}
+
 /* We want to make media_changed accessible to the user through an
  * ioctl. The main problem now is that we must double-buffer the
  * low-level implementation, to assure that the VFS and the user both
@@ -1403,15 +1441,26 @@ int media_changed(struct cdrom_device_info *cdi, int queue)
 {
 	unsigned int mask = (1 << (queue & 1));
 	int ret = !!(cdi->mc_flags & mask);
+	bool changed;
 
 	if (!CDROM_CAN(CDC_MEDIA_CHANGED))
-	    return ret;
+		return ret;
+
 	/* changed since last call? */
-	if (cdi->ops->media_changed(cdi, CDSL_CURRENT)) {
+	if (cdi->ops->check_events) {
+		BUG_ON(!queue);	/* shouldn't be called from VFS path */
+		cdrom_update_events(cdi, DISK_EVENT_MEDIA_CHANGE);
+		changed = cdi->ioctl_events & DISK_EVENT_MEDIA_CHANGE;
+		cdi->ioctl_events = 0;
+	} else
+		changed = cdi->ops->media_changed(cdi, CDSL_CURRENT);
+
+	if (changed) {
 		cdi->mc_flags = 0x3;    /* set bit on both queues */
 		ret |= 1;
 		cdi->media_written = 0;
 	}
+
 	cdi->mc_flags &= ~mask;         /* clear bit */
 	return ret;
 }
diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h
index 78e904796622..35eae4b67503 100644
--- a/include/linux/cdrom.h
+++ b/include/linux/cdrom.h
@@ -946,6 +946,8 @@ struct cdrom_device_info {
 /* device-related storage */
 	unsigned int options	: 30;	/* options flags */
 	unsigned mc_flags	: 2;	/* media change buffer flags */
+	unsigned int vfs_events;	/* cached events for vfs path */
+	unsigned int ioctl_events;	/* cached events for ioctl path */
     	int use_count;                  /* number of times device opened */
     	char name[20];                  /* name of the device type */
 /* per-device flags */
@@ -965,6 +967,8 @@ struct cdrom_device_ops {
 	int (*open) (struct cdrom_device_info *, int);
 	void (*release) (struct cdrom_device_info *);
 	int (*drive_status) (struct cdrom_device_info *, int);
+	unsigned int (*check_events) (struct cdrom_device_info *cdi,
+				      unsigned int clearing, int slot);
 	int (*media_changed) (struct cdrom_device_info *, int);
 	int (*tray_move) (struct cdrom_device_info *, int);
 	int (*lock_door) (struct cdrom_device_info *, int);
@@ -993,6 +997,8 @@ extern int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev,
 extern void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode);
 extern int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
 		       fmode_t mode, unsigned int cmd, unsigned long arg);
+extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi,
+				       unsigned int clearing);
 extern int cdrom_media_changed(struct cdrom_device_info *);
 
 extern int register_cdrom(struct cdrom_device_info *cdi);
-- 
cgit v1.2.3-59-g8ed1b


From 638428ece619495edc9579b1e21493eb00f9687c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 9 Dec 2010 11:18:42 +0100
Subject: scsi: fix TUR error handling in sr_media_change()

sr_test_unit_ready() returns 0 iff TUR succeeded - IOW, when media is
present and the device is actually ready, so the return value wouldn't
be zero when TUR ends with sense data. sr_media_change() incorrectly
tests (retval || (scsi_sense_valid(sshdr)...)) when it tries to test
whether TUR failed without sense data or with sense data indicating
media-not-present.

Fix the test using scsi_status_is_good() and update comments.

- Fixed a comment typo spotted by Eike.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Rolf Eike Beer <eike-kernel@sf-tec.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/scsi/sr.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index d7b383c96d5d..c7e29bdec2cd 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -214,13 +214,17 @@ static int sr_media_change(struct cdrom_device_info *cdi, int slot)
 
 	sshdr =  kzalloc(sizeof(*sshdr), GFP_KERNEL);
 	retval = sr_test_unit_ready(cd->device, sshdr);
-	if (retval || (scsi_sense_valid(sshdr) &&
-		       /* 0x3a is medium not present */
-		       sshdr->asc == 0x3a)) {
-		/* Media not present or unable to test, unit probably not
-		 * ready. This usually means there is no disc in the drive.
-		 * Mark as changed, and we will figure it out later once
-		 * the drive is available again.
+	/*
+	 * Media is considered to be present if TUR succeeds or fails with
+	 * sense data indicating something other than media-not-present
+	 * (ASC 0x3a).
+	 */
+	if (!scsi_status_is_good(retval) &&
+	    (!scsi_sense_valid(sshdr) || sshdr->asc == 0x3a)) {
+		/*
+		 * Probably no media in the device.  Mark as changed, and
+		 * we will figure it out later once the drive is available
+		 * again.
 		 */
 		cd->device->changed = 1;
 		/* This will force a flush, if called from check_disk_change */
-- 
cgit v1.2.3-59-g8ed1b


From 9f8a2c23c6c1140f515f601265c4dff7522110b7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 8 Dec 2010 20:57:40 +0100
Subject: scsi: replace sr_test_unit_ready() with scsi_test_unit_ready()

The usage of TUR has been confusing involving several different
commits updating different parts over time.  Currently, the only
differences between scsi_test_unit_ready() and sr_test_unit_ready()
are,

* scsi_test_unit_ready() also sets sdev->changed on NOT_READY.

* scsi_test_unit_ready() returns 0 if TUR ended with UNIT_ATTENTION or
  NOT_READY.

Due to the above two differences, sr is using its own
sr_test_unit_ready(), but sd - the sole user of the above extra
handling - doesn't even need them.

Where scsi_test_unit_ready() is used in sd_media_changed(), the code
is looking for device ready w/ media present state which is true iff
TUR succeeds w/o sense data or UA, and when the device is not ready
for whatever reason sd_media_changed() explicitly marks media as
missing so there's no reason to set sdev->changed automatically from
scsi_test_unit_ready() on NOT_READY.

Drop both special handlings from scsi_test_unit_ready(), which makes
it equivalant to sr_test_unit_ready(), and replace
sr_test_unit_ready() with scsi_test_unit_ready().  Also, drop the
unnecessary explicit NOT_READY check from sd_media_changed().
Checking return value is enough for testing device readiness.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/scsi/scsi_lib.c | 13 +------------
 drivers/scsi/sd.c       | 10 +---------
 drivers/scsi/sr.c       | 31 +++----------------------------
 drivers/scsi/sr.h       |  1 -
 drivers/scsi/sr_ioctl.c |  2 +-
 5 files changed, 6 insertions(+), 51 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index eafeeda6e194..13bf89145b18 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1984,8 +1984,7 @@ EXPORT_SYMBOL(scsi_mode_sense);
  *		in.
  *
  *	Returns zero if unsuccessful or an error if TUR failed.  For
- *	removable media, a return of NOT_READY or UNIT_ATTENTION is
- *	translated to success, with the ->changed flag updated.
+ *	removable media, UNIT_ATTENTION sets ->changed flag.
  **/
 int
 scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries,
@@ -2012,16 +2011,6 @@ scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries,
 	} while (scsi_sense_valid(sshdr) &&
 		 sshdr->sense_key == UNIT_ATTENTION && --retries);
 
-	if (!sshdr)
-		/* could not allocate sense buffer, so can't process it */
-		return result;
-
-	if (sdev->removable && scsi_sense_valid(sshdr) &&
-	    (sshdr->sense_key == UNIT_ATTENTION ||
-	     sshdr->sense_key == NOT_READY)) {
-		sdev->changed = 1;
-		result = 0;
-	}
 	if (!sshdr_external)
 		kfree(sshdr);
 	return result;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index b9ab3a590e4b..8d488a9fef00 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1045,15 +1045,7 @@ static int sd_media_changed(struct gendisk *disk)
 					      sshdr);
 	}
 
-	/*
-	 * Unable to test, unit probably not ready.   This usually
-	 * means there is no disc in the drive.  Mark as changed,
-	 * and we will figure it out later once the drive is
-	 * available again.
-	 */
-	if (retval || (scsi_sense_valid(sshdr) &&
-		       /* 0x3a is medium not present */
-		       sshdr->asc == 0x3a)) {
+	if (retval) {
 		set_media_not_present(sdkp);
 		retval = 1;
 		goto out;
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index c7e29bdec2cd..310b3fac6313 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -165,32 +165,6 @@ static void scsi_cd_put(struct scsi_cd *cd)
 	mutex_unlock(&sr_ref_mutex);
 }
 
-/* identical to scsi_test_unit_ready except that it doesn't
- * eat the NOT_READY returns for removable media */
-int sr_test_unit_ready(struct scsi_device *sdev, struct scsi_sense_hdr *sshdr)
-{
-	int retries = MAX_RETRIES;
-	int the_result;
-	u8 cmd[] = {TEST_UNIT_READY, 0, 0, 0, 0, 0 };
-
-	/* issue TEST_UNIT_READY until the initial startup UNIT_ATTENTION
-	 * conditions are gone, or a timeout happens
-	 */
-	do {
-		the_result = scsi_execute_req(sdev, cmd, DMA_NONE, NULL,
-					      0, sshdr, SR_TIMEOUT,
-					      retries--, NULL);
-		if (scsi_sense_valid(sshdr) &&
-		    sshdr->sense_key == UNIT_ATTENTION)
-			sdev->changed = 1;
-
-	} while (retries > 0 &&
-		 (!scsi_status_is_good(the_result) ||
-		  (scsi_sense_valid(sshdr) &&
-		   sshdr->sense_key == UNIT_ATTENTION)));
-	return the_result;
-}
-
 /*
  * This function checks to see if the media has been changed in the
  * CDROM drive.  It is possible that we have already sensed a change,
@@ -213,7 +187,8 @@ static int sr_media_change(struct cdrom_device_info *cdi, int slot)
 	}
 
 	sshdr =  kzalloc(sizeof(*sshdr), GFP_KERNEL);
-	retval = sr_test_unit_ready(cd->device, sshdr);
+	retval = scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES,
+				      sshdr);
 	/*
 	 * Media is considered to be present if TUR succeeds or fails with
 	 * sense data indicating something other than media-not-present
@@ -784,7 +759,7 @@ static void get_capabilities(struct scsi_cd *cd)
 	}
 
 	/* eat unit attentions */
-	sr_test_unit_ready(cd->device, &sshdr);
+	scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr);
 
 	/* ask for mode page 0x2a */
 	rc = scsi_mode_sense(cd->device, 0, 0x2a, buffer, 128,
diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h
index 1e144dfdbd4b..81fbc0b78a52 100644
--- a/drivers/scsi/sr.h
+++ b/drivers/scsi/sr.h
@@ -61,7 +61,6 @@ int sr_select_speed(struct cdrom_device_info *cdi, int speed);
 int sr_audio_ioctl(struct cdrom_device_info *, unsigned int, void *);
 
 int sr_is_xa(Scsi_CD *);
-int sr_test_unit_ready(struct scsi_device *sdev, struct scsi_sense_hdr *sshdr);
 
 /* sr_vendor.c */
 void sr_vendor_init(Scsi_CD *);
diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c
index 3cd8ffbad577..8be30554119b 100644
--- a/drivers/scsi/sr_ioctl.c
+++ b/drivers/scsi/sr_ioctl.c
@@ -307,7 +307,7 @@ int sr_drive_status(struct cdrom_device_info *cdi, int slot)
 		/* we have no changer support */
 		return -EINVAL;
 	}
-	if (0 == sr_test_unit_ready(cd->device, &sshdr))
+	if (!scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr))
 		return CDS_DISC_OK;
 
 	/* SK/ASC/ASCQ of 2/4/1 means "unit is becoming ready" */
-- 
cgit v1.2.3-59-g8ed1b


From 93aae17af1172c40c6f74b7294e93a90c3cfaa5d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 16 Dec 2010 17:52:17 +0100
Subject: sr: implement sr_check_events()

Replace sr_media_change() with sr_check_events().  It normally only
uses GET_EVENT_STATUS_NOTIFICATION to check both media change and
eject request.  If @clearing includes DISK_EVENT_MEDIA_CHANGE, it
issues TUR and compares whether media presence has changed.  The SCSI
specific media change uevent is kept for compatibility.

sr_media_change() was doing both media change check and revalidation.
The revalidation part is split into sr_block_revalidate_disk().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/scsi/sr.c   | 149 ++++++++++++++++++++++++++++++++++------------------
 drivers/scsi/sr.h   |   2 +-
 include/scsi/scsi.h |   1 +
 3 files changed, 99 insertions(+), 53 deletions(-)

diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 310b3fac6313..be6baf8ad704 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -104,14 +104,15 @@ static void sr_release(struct cdrom_device_info *);
 static void get_sectorsize(struct scsi_cd *);
 static void get_capabilities(struct scsi_cd *);
 
-static int sr_media_change(struct cdrom_device_info *, int);
+static unsigned int sr_check_events(struct cdrom_device_info *cdi,
+				    unsigned int clearing, int slot);
 static int sr_packet(struct cdrom_device_info *, struct packet_command *);
 
 static struct cdrom_device_ops sr_dops = {
 	.open			= sr_open,
 	.release	 	= sr_release,
 	.drive_status	 	= sr_drive_status,
-	.media_changed		= sr_media_change,
+	.check_events		= sr_check_events,
 	.tray_move		= sr_tray_move,
 	.lock_door		= sr_lock_door,
 	.select_speed		= sr_select_speed,
@@ -165,69 +166,96 @@ static void scsi_cd_put(struct scsi_cd *cd)
 	mutex_unlock(&sr_ref_mutex);
 }
 
+static unsigned int sr_get_events(struct scsi_device *sdev)
+{
+	u8 buf[8];
+	u8 cmd[] = { GET_EVENT_STATUS_NOTIFICATION,
+		     1,			/* polled */
+		     0, 0,		/* reserved */
+		     1 << 4,		/* notification class: media */
+		     0, 0,		/* reserved */
+		     0, sizeof(buf),	/* allocation length */
+		     0,			/* control */
+	};
+	struct event_header *eh = (void *)buf;
+	struct media_event_desc *med = (void *)(buf + 4);
+	struct scsi_sense_hdr sshdr;
+	int result;
+
+	result = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buf, sizeof(buf),
+				  &sshdr, SR_TIMEOUT, MAX_RETRIES, NULL);
+	if (scsi_sense_valid(&sshdr) && sshdr.sense_key == UNIT_ATTENTION)
+		return DISK_EVENT_MEDIA_CHANGE;
+
+	if (result || be16_to_cpu(eh->data_len) < sizeof(*med))
+		return 0;
+
+	if (eh->nea || eh->notification_class != 0x4)
+		return 0;
+
+	if (med->media_event_code == 1)
+		return DISK_EVENT_EJECT_REQUEST;
+	else if (med->media_event_code == 2)
+		return DISK_EVENT_MEDIA_CHANGE;
+	return 0;
+}
+
 /*
- * This function checks to see if the media has been changed in the
- * CDROM drive.  It is possible that we have already sensed a change,
- * or the drive may have sensed one and not yet reported it.  We must
- * be ready for either case. This function always reports the current
- * value of the changed bit.  If flag is 0, then the changed bit is reset.
- * This function could be done as an ioctl, but we would need to have
- * an inode for that to work, and we do not always have one.
+ * This function checks to see if the media has been changed or eject
+ * button has been pressed.  It is possible that we have already
+ * sensed a change, or the drive may have sensed one and not yet
+ * reported it.  The past events are accumulated in sdev->changed and
+ * returned together with the current state.
  */
-
-static int sr_media_change(struct cdrom_device_info *cdi, int slot)
+static unsigned int sr_check_events(struct cdrom_device_info *cdi,
+				    unsigned int clearing, int slot)
 {
 	struct scsi_cd *cd = cdi->handle;
-	int retval;
-	struct scsi_sense_hdr *sshdr;
+	bool last_present;
+	struct scsi_sense_hdr sshdr;
+	unsigned int events;
+	int ret;
 
-	if (CDSL_CURRENT != slot) {
-		/* no changer support */
-		return -EINVAL;
-	}
+	/* no changer support */
+	if (CDSL_CURRENT != slot)
+		return 0;
+
+	events = sr_get_events(cd->device);
+	/*
+	 * GET_EVENT_STATUS_NOTIFICATION is enough unless MEDIA_CHANGE
+	 * is being cleared.  Note that there are devices which hang
+	 * if asked to execute TUR repeatedly.
+	 */
+	if (!(clearing & DISK_EVENT_MEDIA_CHANGE))
+		goto skip_tur;
+
+	/* let's see whether the media is there with TUR */
+	last_present = cd->media_present;
+	ret = scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr);
 
-	sshdr =  kzalloc(sizeof(*sshdr), GFP_KERNEL);
-	retval = scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES,
-				      sshdr);
 	/*
 	 * Media is considered to be present if TUR succeeds or fails with
 	 * sense data indicating something other than media-not-present
 	 * (ASC 0x3a).
 	 */
-	if (!scsi_status_is_good(retval) &&
-	    (!scsi_sense_valid(sshdr) || sshdr->asc == 0x3a)) {
-		/*
-		 * Probably no media in the device.  Mark as changed, and
-		 * we will figure it out later once the drive is available
-		 * again.
-		 */
-		cd->device->changed = 1;
-		/* This will force a flush, if called from check_disk_change */
-		retval = 1;
-		goto out;
-	};
+	cd->media_present = scsi_status_is_good(ret) ||
+		(scsi_sense_valid(&sshdr) && sshdr.asc != 0x3a);
 
-	retval = cd->device->changed;
-	cd->device->changed = 0;
-	/* If the disk changed, the capacity will now be different,
-	 * so we force a re-read of this information */
-	if (retval) {
-		/* check multisession offset etc */
-		sr_cd_check(cdi);
-		get_sectorsize(cd);
+	if (last_present != cd->media_present)
+		events |= DISK_EVENT_MEDIA_CHANGE;
+skip_tur:
+	if (cd->device->changed) {
+		events |= DISK_EVENT_MEDIA_CHANGE;
+		cd->device->changed = 0;
 	}
 
-out:
-	/* Notify userspace, that media has changed. */
-	if (retval != cd->previous_state)
+	/* for backward compatibility */
+	if (events & DISK_EVENT_MEDIA_CHANGE)
 		sdev_evt_send_simple(cd->device, SDEV_EVT_MEDIA_CHANGE,
 				     GFP_KERNEL);
-	cd->previous_state = retval;
-	kfree(sshdr);
-
-	return retval;
+	return events;
 }
- 
+
 /*
  * sr_done is the interrupt routine for the device driver.
  *
@@ -512,10 +540,25 @@ out:
 	return ret;
 }
 
-static int sr_block_media_changed(struct gendisk *disk)
+static unsigned int sr_block_check_events(struct gendisk *disk,
+					  unsigned int clearing)
 {
 	struct scsi_cd *cd = scsi_cd(disk);
-	return cdrom_media_changed(&cd->cdi);
+	return cdrom_check_events(&cd->cdi, clearing);
+}
+
+static int sr_block_revalidate_disk(struct gendisk *disk)
+{
+	struct scsi_cd *cd = scsi_cd(disk);
+	struct scsi_sense_hdr sshdr;
+
+	/* if the unit is not ready, nothing more to do */
+	if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr))
+		return 0;
+
+	sr_cd_check(&cd->cdi);
+	get_sectorsize(cd);
+	return 0;
 }
 
 static const struct block_device_operations sr_bdops =
@@ -524,7 +567,8 @@ static const struct block_device_operations sr_bdops =
 	.open		= sr_block_open,
 	.release	= sr_block_release,
 	.ioctl		= sr_block_ioctl,
-	.media_changed	= sr_block_media_changed,
+	.check_events	= sr_block_check_events,
+	.revalidate_disk = sr_block_revalidate_disk,
 	/* 
 	 * No compat_ioctl for now because sr_block_ioctl never
 	 * seems to pass arbitary ioctls down to host drivers.
@@ -597,6 +641,7 @@ static int sr_probe(struct device *dev)
 	sprintf(disk->disk_name, "sr%d", minor);
 	disk->fops = &sr_bdops;
 	disk->flags = GENHD_FL_CD;
+	disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST;
 
 	blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT);
 
@@ -606,7 +651,7 @@ static int sr_probe(struct device *dev)
 	cd->disk = disk;
 	cd->capacity = 0x1fffff;
 	cd->device->changed = 1;	/* force recheck CD type */
-	cd->previous_state = 1;
+	cd->media_present = 1;
 	cd->use = 1;
 	cd->readcd_known = 0;
 	cd->readcd_cdda = 0;
diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h
index 81fbc0b78a52..e036f1dc83c8 100644
--- a/drivers/scsi/sr.h
+++ b/drivers/scsi/sr.h
@@ -40,7 +40,7 @@ typedef struct scsi_cd {
 	unsigned xa_flag:1;	/* CD has XA sectors ? */
 	unsigned readcd_known:1;	/* drive supports READ_CD (0xbe) */
 	unsigned readcd_cdda:1;	/* reading audio data using READ_CD */
-	unsigned previous_state:1;	/* media has changed */
+	unsigned media_present:1;	/* media is present */
 	struct cdrom_device_info cdi;
 	/* We hold gendisk and scsi_device references on probe and use
 	 * the refs on this kref to decide when to release them */
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 216af8538cc9..86758618d4f8 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -104,6 +104,7 @@ struct scsi_cmnd;
 #define UNMAP		      0x42
 #define READ_TOC              0x43
 #define READ_HEADER           0x44
+#define GET_EVENT_STATUS_NOTIFICATION 0x4a
 #define LOG_SELECT            0x4c
 #define LOG_SENSE             0x4d
 #define XDWRITEREAD_10        0x53
-- 
cgit v1.2.3-59-g8ed1b


From c8d2e937355d02db3055c2fc203e5f017297ee1f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 8 Dec 2010 20:57:42 +0100
Subject: sd: implement sd_check_events()

Replace sd_media_change() with sd_check_events().  sd used to set the
changed state whenever the device is not ready, which can cause event
loop while the device is not ready.  Media presence handling code is
changed such that the changed state is set iff the media presence
actually changes.  UA still always sets the changed state and
NOT_READY always (at least where it used to set ->changed) clears
media presence, so no event is lost.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/scsi/sd.c | 87 ++++++++++++++++++++++++++++++-------------------------
 drivers/scsi/sd.h |  1 -
 2 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 8d488a9fef00..b179b0e39a3b 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -991,30 +991,50 @@ out:
 
 static void set_media_not_present(struct scsi_disk *sdkp)
 {
+	if (sdkp->media_present)
+		sdkp->device->changed = 1;
 	sdkp->media_present = 0;
 	sdkp->capacity = 0;
-	sdkp->device->changed = 1;
+}
+
+static int media_not_present(struct scsi_disk *sdkp,
+			     struct scsi_sense_hdr *sshdr)
+{
+	if (!scsi_sense_valid(sshdr))
+		return 0;
+
+	/* not invoked for commands that could return deferred errors */
+	switch (sshdr->sense_key) {
+	case UNIT_ATTENTION:
+		sdkp->device->changed = 1;
+		/* fall through */
+	case NOT_READY:
+		/* medium not present */
+		if (sshdr->asc == 0x3A) {
+			set_media_not_present(sdkp);
+			return 1;
+		}
+	}
+	return 0;
 }
 
 /**
- *	sd_media_changed - check if our medium changed
- *	@disk: kernel device descriptor 
+ *	sd_check_events - check media events
+ *	@disk: kernel device descriptor
+ *	@clearing: disk events currently being cleared
  *
- *	Returns 0 if not applicable or no change; 1 if change
+ *	Returns mask of DISK_EVENT_*.
  *
  *	Note: this function is invoked from the block subsystem.
  **/
-static int sd_media_changed(struct gendisk *disk)
+static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing)
 {
 	struct scsi_disk *sdkp = scsi_disk(disk);
 	struct scsi_device *sdp = sdkp->device;
 	struct scsi_sense_hdr *sshdr = NULL;
 	int retval;
 
-	SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_media_changed\n"));
-
-	if (!sdp->removable)
-		return 0;
+	SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_check_events\n"));
 
 	/*
 	 * If the device is offline, don't send any commands - just pretend as
@@ -1024,7 +1044,6 @@ static int sd_media_changed(struct gendisk *disk)
 	 */
 	if (!scsi_device_online(sdp)) {
 		set_media_not_present(sdkp);
-		retval = 1;
 		goto out;
 	}
 
@@ -1045,26 +1064,30 @@ static int sd_media_changed(struct gendisk *disk)
 					      sshdr);
 	}
 
-	if (retval) {
+	/* failed to execute TUR, assume media not present */
+	if (host_byte(retval)) {
 		set_media_not_present(sdkp);
-		retval = 1;
 		goto out;
 	}
 
+	if (media_not_present(sdkp, sshdr))
+		goto out;
+
 	/*
 	 * For removable scsi disk we have to recognise the presence
-	 * of a disk in the drive. This is kept in the struct scsi_disk
-	 * struct and tested at open !  Daniel Roche (dan@lectra.fr)
+	 * of a disk in the drive.
 	 */
+	if (!sdkp->media_present)
+		sdp->changed = 1;
 	sdkp->media_present = 1;
-
-	retval = sdp->changed;
-	sdp->changed = 0;
 out:
-	if (retval != sdkp->previous_state)
+	/* for backward compatibility */
+	if (sdp->changed)
 		sdev_evt_send_simple(sdp, SDEV_EVT_MEDIA_CHANGE, GFP_KERNEL);
-	sdkp->previous_state = retval;
 	kfree(sshdr);
+
+	retval = sdp->changed ? DISK_EVENT_MEDIA_CHANGE : 0;
+	sdp->changed = 0;
 	return retval;
 }
 
@@ -1157,7 +1180,7 @@ static const struct block_device_operations sd_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl		= sd_compat_ioctl,
 #endif
-	.media_changed		= sd_media_changed,
+	.check_events		= sd_check_events,
 	.revalidate_disk	= sd_revalidate_disk,
 	.unlock_native_capacity	= sd_unlock_native_capacity,
 };
@@ -1293,23 +1316,6 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 	return good_bytes;
 }
 
-static int media_not_present(struct scsi_disk *sdkp,
-			     struct scsi_sense_hdr *sshdr)
-{
-
-	if (!scsi_sense_valid(sshdr))
-		return 0;
-	/* not invoked for commands that could return deferred errors */
-	if (sshdr->sense_key != NOT_READY &&
-	    sshdr->sense_key != UNIT_ATTENTION)
-		return 0;
-	if (sshdr->asc != 0x3A) /* medium not present */
-		return 0;
-
-	set_media_not_present(sdkp);
-	return 1;
-}
-
 /*
  * spinup disk - called only in sd_revalidate_disk()
  */
@@ -1484,7 +1490,7 @@ static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp,
 	 */
 	if (sdp->removable &&
 	    sense_valid && sshdr->sense_key == NOT_READY)
-		sdp->changed = 1;
+		set_media_not_present(sdkp);
 
 	/*
 	 * We used to set media_present to 0 here to indicate no media
@@ -2339,8 +2345,10 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
 
 	gd->driverfs_dev = &sdp->sdev_gendev;
 	gd->flags = GENHD_FL_EXT_DEVT;
-	if (sdp->removable)
+	if (sdp->removable) {
 		gd->flags |= GENHD_FL_REMOVABLE;
+		gd->events |= DISK_EVENT_MEDIA_CHANGE;
+	}
 
 	add_disk(gd);
 	sd_dif_config_host(sdkp);
@@ -2422,7 +2430,6 @@ static int sd_probe(struct device *dev)
 	sdkp->disk = gd;
 	sdkp->index = index;
 	atomic_set(&sdkp->openers, 0);
-	sdkp->previous_state = 1;
 
 	if (!sdp->request_queue->rq_timeout) {
 		if (sdp->type != TYPE_MOD)
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 55488faf0815..c9d8f6ca49e2 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -55,7 +55,6 @@ struct scsi_disk {
 	u8		media_present;
 	u8		write_prot;
 	u8		protection_type;/* Data Integrity Field */
-	unsigned	previous_state : 1;
 	unsigned	ATO : 1;	/* state of disk ATO bit */
 	unsigned	WCE : 1;	/* state of disk WCE bit */
 	unsigned	RCD : 1;	/* state of disk RCD bit, unused */
-- 
cgit v1.2.3-59-g8ed1b


From 30a7caabbb8e8702b9cf453fee83f773d8b3430d Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 16 Dec 2010 17:58:50 +0100
Subject: cdrom: export cdrom_check_events()

It's used by sr, so we need to export it.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/cdrom/cdrom.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 1ea8f8d363c6..14033a36bcd0 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -1429,6 +1429,7 @@ unsigned int cdrom_check_events(struct cdrom_device_info *cdi,
 	cdi->vfs_events = 0;
 	return events;
 }
+EXPORT_SYMBOL(cdrom_check_events);
 
 /* We want to make media_changed accessible to the user through an
  * ioctl. The main problem now is that we must double-buffer the
-- 
cgit v1.2.3-59-g8ed1b


From a8adbe378b56acd5945df70753c7e8f6fe223304 Mon Sep 17 00:00:00 2001
From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Date: Fri, 17 Dec 2010 08:56:44 +0100
Subject: fs/splice: Pull buf->ops->confirm() from splice_from_pipe actors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch pulls calls to buf->ops->confirm() from all actors passed
(also indirectly) to splice_from_pipe_feed().

Is avoiding the call to buf->ops->confirm() while splice()ing to
/dev/null is an intentional optimization? No other user does that
and this will remove this special case.

Against current linux.git 6313e3c21743cc88bb5bd8aa72948ee1e83937b6.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/loop.c |  4 ----
 fs/nfsd/vfs.c        |  4 ----
 fs/splice.c          | 43 ++++++++++++++-----------------------------
 3 files changed, 14 insertions(+), 37 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 7ea0bea2f7e3..c87b0840d2cb 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -397,10 +397,6 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	sector_t IV;
 	int size, ret;
 
-	ret = buf->ops->confirm(pipe, buf);
-	if (unlikely(ret))
-		return ret;
-
 	IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) +
 							(buf->offset >> 9);
 	size = sd->len;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 184938fcff04..c6e08661adee 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -847,10 +847,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	size_t size;
 	int ret;
 
-	ret = buf->ops->confirm(pipe, buf);
-	if (unlikely(ret))
-		return ret;
-
 	size = sd->len;
 
 	if (rqstp->rq_res.page_len == 0) {
diff --git a/fs/splice.c b/fs/splice.c
index 8f1dfaecc8f0..d2026382ac3e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -682,19 +682,14 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 {
 	struct file *file = sd->u.file;
 	loff_t pos = sd->pos;
-	int ret, more;
-
-	ret = buf->ops->confirm(pipe, buf);
-	if (!ret) {
-		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
-		if (file->f_op && file->f_op->sendpage)
-			ret = file->f_op->sendpage(file, buf->page, buf->offset,
-						   sd->len, &pos, more);
-		else
-			ret = -EINVAL;
-	}
+	int more;
 
-	return ret;
+	if (!likely(file->f_op && file->f_op->sendpage))
+		return -EINVAL;
+
+	more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
+	return file->f_op->sendpage(file, buf->page, buf->offset,
+				    sd->len, &pos, more);
 }
 
 /*
@@ -727,13 +722,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	void *fsdata;
 	int ret;
 
-	/*
-	 * make sure the data in this buffer is uptodate
-	 */
-	ret = buf->ops->confirm(pipe, buf);
-	if (unlikely(ret))
-		return ret;
-
 	offset = sd->pos & ~PAGE_CACHE_MASK;
 
 	this_len = sd->len;
@@ -805,12 +793,17 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 		if (sd->len > sd->total_len)
 			sd->len = sd->total_len;
 
-		ret = actor(pipe, buf, sd);
-		if (ret <= 0) {
+		ret = buf->ops->confirm(pipe, buf);
+		if (unlikely(ret)) {
 			if (ret == -ENODATA)
 				ret = 0;
 			return ret;
 		}
+
+		ret = actor(pipe, buf, sd);
+		if (ret <= 0)
+			return ret;
+
 		buf->offset += ret;
 		buf->len -= ret;
 
@@ -1044,10 +1037,6 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	int ret;
 	void *data;
 
-	ret = buf->ops->confirm(pipe, buf);
-	if (ret)
-		return ret;
-
 	data = buf->ops->map(pipe, buf, 0);
 	ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
 	buf->ops->unmap(pipe, buf, data);
@@ -1507,10 +1496,6 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	char *src;
 	int ret;
 
-	ret = buf->ops->confirm(pipe, buf);
-	if (unlikely(ret))
-		return ret;
-
 	/*
 	 * See if we can use the atomic maps, by prefaulting in the
 	 * pages and doing an atomic copy
-- 
cgit v1.2.3-59-g8ed1b


From 7278c9c19bd85cf33213a2e0b538a18d3ac8ad00 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Fri, 17 Dec 2010 08:57:14 +0100
Subject: cfq-iosched: don't check cfqg in choose_service_tree()

When cfq_choose_cfqg() is called in select_queue(), there must be at least one
backlogged CFQ queue waiting for dispatching, hence there must be at least one
backlogged CFQ group on service tree. So we never call choose_service_tree()
with cfqg == NULL.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9b186fd6bf47..c19d015ac5a5 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2103,12 +2103,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	unsigned group_slice;
 	enum wl_prio_t original_prio = cfqd->serving_prio;
 
-	if (!cfqg) {
-		cfqd->serving_prio = IDLE_WORKLOAD;
-		cfqd->workload_expires = jiffies + 1;
-		return;
-	}
-
 	/* Choose next priority. RT > BE > IDLE */
 	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
 		cfqd->serving_prio = RT_WORKLOAD;
-- 
cgit v1.2.3-59-g8ed1b


From b9f985b6e05ebd7af2aaef0eb3ae369390ef191f Mon Sep 17 00:00:00 2001
From: Yang Zhang <kthreadd@gmail.com>
Date: Fri, 17 Dec 2010 08:58:36 +0100
Subject: block: convert !IS_ERR(p) && p to !IS_ERR_NOR_NULL(p)

Signed-off-by: Yang Zhang <kthreadd@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index 5fa2b44a72ff..79b9e327f3cb 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -735,7 +735,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
 	static void *p;
 
 	p = disk_seqf_start(seqf, pos);
-	if (!IS_ERR(p) && p && !*pos)
+	if (!IS_ERR_OR_NULL(p) && !*pos)
 		seq_puts(seqf, "major minor  #blocks  name\n\n");
 	return p;
 }
-- 
cgit v1.2.3-59-g8ed1b


From e61eb2e93fe86931d46831752a82dab25a5335ca Mon Sep 17 00:00:00 2001
From: Yang Zhang <kthreadd@gmail.com>
Date: Fri, 17 Dec 2010 09:00:18 +0100
Subject: fs/block: type signature of major_to_index(int) to
 major_to_index(unsigned)

The major/minor device numbers are always defined and used as `unsigned'.

Signed-off-by: Yang Zhang <kthreadd@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c | 2 +-
 fs/char_dev.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 79b9e327f3cb..16ccc0d2d5d3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -239,7 +239,7 @@ static struct blk_major_name {
 } *major_names[BLKDEV_MAJOR_HASH_SIZE];
 
 /* index in the above - for now: assume no multimajor ranges */
-static inline int major_to_index(int major)
+static inline int major_to_index(unsigned major)
 {
 	return major % BLKDEV_MAJOR_HASH_SIZE;
 }
diff --git a/fs/char_dev.c b/fs/char_dev.c
index e5b9df993b93..143f0207c7eb 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -59,7 +59,7 @@ static struct char_device_struct {
 } *chrdevs[CHRDEV_MAJOR_HASH_SIZE];
 
 /* index in the above */
-static inline int major_to_index(int major)
+static inline int major_to_index(unsigned major)
 {
 	return major % CHRDEV_MAJOR_HASH_SIZE;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 3603b8eaccc8e41d3f355b3cadd662a3dd6699fd Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Mon, 20 Dec 2010 09:15:19 +0100
Subject: Fix compile warnings due to missing removal of a 'ret' variable

Commit a8adbe3 forgot to remove the return variable, kill it.

drivers/block/loop.c: In function 'lo_splice_actor':
drivers/block/loop.c:398: warning: unused variable 'ret'
[...]
fs/nfsd/vfs.c: In function 'nfsd_splice_actor':
fs/nfsd/vfs.c:848: warning: unused variable 'ret'

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/loop.c | 2 +-
 fs/nfsd/vfs.c        | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c87b0840d2cb..44e18c073c44 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -395,7 +395,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	struct loop_device *lo = p->lo;
 	struct page *page = buf->page;
 	sector_t IV;
-	int size, ret;
+	int size;
 
 	IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) +
 							(buf->offset >> 9);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c6e08661adee..106ed482f119 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -845,7 +845,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
 	struct page *page = buf->page;
 	size_t size;
-	int ret;
 
 	size = sd->len;
 
-- 
cgit v1.2.3-59-g8ed1b


From 27667c996f6a0bed4ad1e10ac0a0dbb6037968db Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 21 Dec 2010 15:07:45 +0100
Subject: block: Clean up exit_io_context() source code.

This patch fixes a spelling error in a source code comment and removes
superfluous braces in the function exit_io_context().

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Cc: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-ioc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 3c7a339fe381..b791022beef3 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -64,7 +64,7 @@ static void cfq_exit(struct io_context *ioc)
 	rcu_read_unlock();
 }
 
-/* Called by the exitting task */
+/* Called by the exiting task */
 void exit_io_context(struct task_struct *task)
 {
 	struct io_context *ioc;
@@ -74,10 +74,9 @@ void exit_io_context(struct task_struct *task)
 	task->io_context = NULL;
 	task_unlock(task);
 
-	if (atomic_dec_and_test(&ioc->nr_tasks)) {
+	if (atomic_dec_and_test(&ioc->nr_tasks))
 		cfq_exit(ioc);
 
-	}
 	put_io_context(ioc);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From fcc57045d53edc35bcce456e60ac4aa802712934 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Wed, 22 Dec 2010 09:48:49 +0100
Subject: Revert "sd: implement sd_check_events()"

This reverts commit c8d2e937355d02db3055c2fc203e5f017297ee1f.

We run into merging problems with the SCSI tree, revert this one
so it can be handled by a postmerge tree there.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/scsi/sd.c | 87 +++++++++++++++++++++++++------------------------------
 drivers/scsi/sd.h |  1 +
 2 files changed, 41 insertions(+), 47 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index b179b0e39a3b..8d488a9fef00 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -991,50 +991,30 @@ out:
 
 static void set_media_not_present(struct scsi_disk *sdkp)
 {
-	if (sdkp->media_present)
-		sdkp->device->changed = 1;
 	sdkp->media_present = 0;
 	sdkp->capacity = 0;
-}
-
-static int media_not_present(struct scsi_disk *sdkp,
-			     struct scsi_sense_hdr *sshdr)
-{
-	if (!scsi_sense_valid(sshdr))
-		return 0;
-
-	/* not invoked for commands that could return deferred errors */
-	switch (sshdr->sense_key) {
-	case UNIT_ATTENTION:
-		sdkp->device->changed = 1;
-		/* fall through */
-	case NOT_READY:
-		/* medium not present */
-		if (sshdr->asc == 0x3A) {
-			set_media_not_present(sdkp);
-			return 1;
-		}
-	}
-	return 0;
+	sdkp->device->changed = 1;
 }
 
 /**
- *	sd_check_events - check media events
- *	@disk: kernel device descriptor
- *	@clearing: disk events currently being cleared
+ *	sd_media_changed - check if our medium changed
+ *	@disk: kernel device descriptor 
  *
- *	Returns mask of DISK_EVENT_*.
+ *	Returns 0 if not applicable or no change; 1 if change
  *
  *	Note: this function is invoked from the block subsystem.
  **/
-static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing)
+static int sd_media_changed(struct gendisk *disk)
 {
 	struct scsi_disk *sdkp = scsi_disk(disk);
 	struct scsi_device *sdp = sdkp->device;
 	struct scsi_sense_hdr *sshdr = NULL;
 	int retval;
 
-	SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_check_events\n"));
+	SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_media_changed\n"));
+
+	if (!sdp->removable)
+		return 0;
 
 	/*
 	 * If the device is offline, don't send any commands - just pretend as
@@ -1044,6 +1024,7 @@ static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing)
 	 */
 	if (!scsi_device_online(sdp)) {
 		set_media_not_present(sdkp);
+		retval = 1;
 		goto out;
 	}
 
@@ -1064,30 +1045,26 @@ static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing)
 					      sshdr);
 	}
 
-	/* failed to execute TUR, assume media not present */
-	if (host_byte(retval)) {
+	if (retval) {
 		set_media_not_present(sdkp);
+		retval = 1;
 		goto out;
 	}
 
-	if (media_not_present(sdkp, sshdr))
-		goto out;
-
 	/*
 	 * For removable scsi disk we have to recognise the presence
-	 * of a disk in the drive.
+	 * of a disk in the drive. This is kept in the struct scsi_disk
+	 * struct and tested at open !  Daniel Roche (dan@lectra.fr)
 	 */
-	if (!sdkp->media_present)
-		sdp->changed = 1;
 	sdkp->media_present = 1;
+
+	retval = sdp->changed;
+	sdp->changed = 0;
 out:
-	/* for backward compatibility */
-	if (sdp->changed)
+	if (retval != sdkp->previous_state)
 		sdev_evt_send_simple(sdp, SDEV_EVT_MEDIA_CHANGE, GFP_KERNEL);
+	sdkp->previous_state = retval;
 	kfree(sshdr);
-
-	retval = sdp->changed ? DISK_EVENT_MEDIA_CHANGE : 0;
-	sdp->changed = 0;
 	return retval;
 }
 
@@ -1180,7 +1157,7 @@ static const struct block_device_operations sd_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl		= sd_compat_ioctl,
 #endif
-	.check_events		= sd_check_events,
+	.media_changed		= sd_media_changed,
 	.revalidate_disk	= sd_revalidate_disk,
 	.unlock_native_capacity	= sd_unlock_native_capacity,
 };
@@ -1316,6 +1293,23 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 	return good_bytes;
 }
 
+static int media_not_present(struct scsi_disk *sdkp,
+			     struct scsi_sense_hdr *sshdr)
+{
+
+	if (!scsi_sense_valid(sshdr))
+		return 0;
+	/* not invoked for commands that could return deferred errors */
+	if (sshdr->sense_key != NOT_READY &&
+	    sshdr->sense_key != UNIT_ATTENTION)
+		return 0;
+	if (sshdr->asc != 0x3A) /* medium not present */
+		return 0;
+
+	set_media_not_present(sdkp);
+	return 1;
+}
+
 /*
  * spinup disk - called only in sd_revalidate_disk()
  */
@@ -1490,7 +1484,7 @@ static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp,
 	 */
 	if (sdp->removable &&
 	    sense_valid && sshdr->sense_key == NOT_READY)
-		set_media_not_present(sdkp);
+		sdp->changed = 1;
 
 	/*
 	 * We used to set media_present to 0 here to indicate no media
@@ -2345,10 +2339,8 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
 
 	gd->driverfs_dev = &sdp->sdev_gendev;
 	gd->flags = GENHD_FL_EXT_DEVT;
-	if (sdp->removable) {
+	if (sdp->removable)
 		gd->flags |= GENHD_FL_REMOVABLE;
-		gd->events |= DISK_EVENT_MEDIA_CHANGE;
-	}
 
 	add_disk(gd);
 	sd_dif_config_host(sdkp);
@@ -2430,6 +2422,7 @@ static int sd_probe(struct device *dev)
 	sdkp->disk = gd;
 	sdkp->index = index;
 	atomic_set(&sdkp->openers, 0);
+	sdkp->previous_state = 1;
 
 	if (!sdp->request_queue->rq_timeout) {
 		if (sdp->type != TYPE_MOD)
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index c9d8f6ca49e2..55488faf0815 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -55,6 +55,7 @@ struct scsi_disk {
 	u8		media_present;
 	u8		write_prot;
 	u8		protection_type;/* Data Integrity Field */
+	unsigned	previous_state : 1;
 	unsigned	ATO : 1;	/* state of disk ATO bit */
 	unsigned	WCE : 1;	/* state of disk WCE bit */
 	unsigned	RCD : 1;	/* state of disk RCD bit, unused */
-- 
cgit v1.2.3-59-g8ed1b


From 89b90be2d877a904b1704e4029db65655bfc6282 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 3 Jan 2011 15:01:47 +0100
Subject: block: make kblockd_workqueue smarter

kblockd is used for unplugging and may affect IO latency and
throughput and the max number of concurrent work items are bound by
the number of block devices.  Make it HIGHPRI workqueue w/ default max
concurrency.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 151070541e21..3689319a5974 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2606,7 +2606,9 @@ int __init blk_dev_init(void)
 	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
 			sizeof(((struct request *)0)->cmd_flags));
 
-	kblockd_workqueue = create_workqueue("kblockd");
+	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
+	kblockd_workqueue = alloc_workqueue("kblockd",
+					    WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
 	if (!kblockd_workqueue)
 		panic("Failed to create kblockd\n");
 
-- 
cgit v1.2.3-59-g8ed1b


From a6e8dc46ff0b7defbfa4f29a71aee263377ec573 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 3 Jan 2011 15:01:48 +0100
Subject: bio-integrity: mark kintegrityd_wq highpri and CPU intensive

Work items processed by kintegrityd_wq won't block much, may burn a
lot of CPU cycles and affect IO latency.  Use alloc_workqueue() to
mark it highpri and CPU intensive with max concurrency of 1.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/bio-integrity.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4d0ff5ee27b8..e49cce234c65 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -782,7 +782,12 @@ void __init bio_integrity_init(void)
 {
 	unsigned int i;
 
-	kintegrityd_wq = create_workqueue("kintegrityd");
+	/*
+	 * kintegrityd won't block much but may burn a lot of CPU cycles.
+	 * Make it highpri CPU intensive wq with max concurrency of 1.
+	 */
+	kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
+					 WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
 	if (!kintegrityd_wq)
 		panic("Failed to create kintegrityd\n");
 
-- 
cgit v1.2.3-59-g8ed1b


From e4a683c899cd5a49f8d684a054c95bd115a0c005 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Wed, 5 Jan 2011 16:57:37 +0100
Subject: kref: add kref_test_and_get

Add kref_test_and_get() function, which atomically add a reference only if
refcount is not zero. This prevent to add a reference to an object that is
already being removed.

Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/kref.h |  1 +
 lib/kref.c           | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/include/linux/kref.h b/include/linux/kref.h
index 6cc38fc07ab7..90b9e44abf54 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -23,6 +23,7 @@ struct kref {
 
 void kref_init(struct kref *kref);
 void kref_get(struct kref *kref);
+int kref_test_and_get(struct kref *kref);
 int kref_put(struct kref *kref, void (*release) (struct kref *kref));
 
 #endif /* _KREF_H_ */
diff --git a/lib/kref.c b/lib/kref.c
index d3d227a08a4b..e7a6e1067122 100644
--- a/lib/kref.c
+++ b/lib/kref.c
@@ -36,6 +36,18 @@ void kref_get(struct kref *kref)
 	smp_mb__after_atomic_inc();
 }
 
+/**
+ * kref_test_and_get - increment refcount for object only if refcount is not
+ * zero.
+ * @kref: object.
+ *
+ * Return non-zero if the refcount was incremented, 0 otherwise
+ */
+int kref_test_and_get(struct kref *kref)
+{
+	return atomic_inc_not_zero(&kref->refcount);
+}
+
 /**
  * kref_put - decrement refcount for object.
  * @kref: object.
-- 
cgit v1.2.3-59-g8ed1b


From 09e099d4bafea3b15be003d548bdf94b4b6e0e17 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Wed, 5 Jan 2011 16:57:38 +0100
Subject: block: fix accounting bug on cross partition merges

/proc/diskstats would display a strange output as follows.

$ cat /proc/diskstats |grep sda
   8       0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
   8       1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
                                                ~~~~~~~~~~
   8       2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
   8       3 sda3 54 487 2188 92 0 0 0 0 0 88 92
   8       4 sda4 4 0 8 0 0 0 0 0 0 0 0
   8       5 sda5 81 2027 2130 138 0 0 0 0 0 87 137

Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is
merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE.

The detailed root cause is as follows.

Assuming that there are two partition, sda1 and sda2.

1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
   is 0 and sda2's one is 1.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

2. A bio belongs to sda1 is issued and is merged into the request mentioned on
   step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
   from sda2 region to sda1 region. However the two partition's
   hd_struct->in_flight are not changed.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

3. The request is finished and blk_account_io_done() is called. In this case,
   sda2's hd_struct->in_flight, not a sda1's one, is decremented.

        | hd_struct->in_flight
   ---------------------------
   sda1 |         -1
   sda2 |          1
   ---------------------------

The patch fixes the problem by caching the partition lookup
inside the request structure, hence making sure that the increment
and decrement will always happen on the same partition struct. This
also speeds up IO with accounting enabled, since it cuts down on
the number of lookups we have to do.

Also add a refcount to struct hd_struct to keep the partition in
memory as long as users exist. We use kref_test_and_get() to ensure
we don't add a reference to a partition which is going away.

Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c       | 26 +++++++++++++++++++++-----
 block/blk-merge.c      |  3 ++-
 block/genhd.c          |  1 +
 fs/partitions/check.c  | 10 +++++++++-
 include/linux/blkdev.h |  1 +
 include/linux/genhd.h  |  2 ++
 6 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3689319a5974..500c080a6a6b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		return;
 
 	cpu = part_stat_lock();
-	part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 
-	if (!new_io)
+	if (!new_io) {
+		part = rq->part;
 		part_stat_inc(cpu, part, merges[rw]);
-	else {
+	} else {
+		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+		if (!kref_test_and_get(&part->ref)) {
+			/*
+			 * The partition is already being removed,
+			 * the request will be accounted on the disk only
+			 *
+			 * We take a reference on disk->part0 although that
+			 * partition will never be deleted, so we can treat
+			 * it as any other partition.
+			 */
+			part = &rq->rq_disk->part0;
+			kref_get(&part->ref);
+		}
 		part_round_stats(cpu, part);
 		part_inc_in_flight(part, rw);
+		rq->part = part;
 	}
 
 	part_stat_unlock();
@@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->ref_count = 1;
 	rq->start_time = jiffies;
 	set_start_time_ns(rq);
+	rq->part = NULL;
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
 		part_stat_unlock();
 	}
@@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part, rw);
 
+		kref_put(&part->ref, __delete_partition);
 		part_stat_unlock();
 	}
 }
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 77b7c26df6b5..b06b83b89d89 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part, rq_data_dir(req));
 
+		kref_put(&part->ref, __delete_partition);
 		part_stat_unlock();
 	}
 }
diff --git a/block/genhd.c b/block/genhd.c
index 16ccc0d2d5d3..85c150598830 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
 			return NULL;
 		}
 		disk->part_tbl->part[0] = &disk->part0;
+		kref_init(&disk->part0.ref);
 
 		disk->minors = minors;
 		rand_initialize_disk(disk);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index bdf8d3cc95a4..48209f58522b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -381,6 +381,13 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
 	put_device(part_to_dev(part));
 }
 
+void __delete_partition(struct kref *ref)
+{
+	struct hd_struct *part = container_of(ref, struct hd_struct, ref);
+
+	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+}
+
 void delete_partition(struct gendisk *disk, int partno)
 {
 	struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -399,7 +406,7 @@ void delete_partition(struct gendisk *disk, int partno)
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
-	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+	kref_put(&part->ref, __delete_partition);
 }
 
 static ssize_t whole_disk_show(struct device *dev,
@@ -498,6 +505,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	if (!dev_get_uevent_suppress(ddev))
 		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
+	kref_init(&p->ref);
 	return p;
 
 out_free_info:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aae86fd10c4f..482a7fd48831 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,6 +115,7 @@ struct request {
 	void *elevator_private3;
 
 	struct gendisk *rq_disk;
+	struct hd_struct *part;
 	unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
 	unsigned long long start_time_ns;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 7a7b9c1644e4..2ba2792a3dd4 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -116,6 +116,7 @@ struct hd_struct {
 	struct disk_stats dkstats;
 #endif
 	struct rcu_head rcu_head;
+	struct kref ref;
 };
 
 #define GENHD_FL_REMOVABLE			1
@@ -583,6 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
 						     sector_t len, int flags,
 						     struct partition_meta_info
 						       *info);
+extern void __delete_partition(struct kref *ref);
 extern void delete_partition(struct gendisk *, int);
 extern void printk_all_partitions(void);
 
-- 
cgit v1.2.3-59-g8ed1b


From 6c23a9681c0fe7fb7dd331b39dda11926f43746e Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Fri, 7 Jan 2011 08:43:37 +0100
Subject: block: add internal hd part table references

We can't use krefs since it's apparently restricted to very basic
reference counting.

This reverts commit e4a683c8.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c      |  6 +++---
 block/blk-merge.c     |  2 +-
 block/genhd.c         |  3 ++-
 fs/partitions/check.c |  8 +++-----
 include/linux/genhd.h | 27 +++++++++++++++++++++++++--
 include/linux/kref.h  |  1 -
 lib/kref.c            | 12 ------------
 7 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 500c080a6a6b..2f4002f79a24 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -70,7 +70,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		part_stat_inc(cpu, part, merges[rw]);
 	} else {
 		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
-		if (!kref_test_and_get(&part->ref)) {
+		if (!hd_struct_try_get(part)) {
 			/*
 			 * The partition is already being removed,
 			 * the request will be accounted on the disk only
@@ -80,7 +80,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 			 * it as any other partition.
 			 */
 			part = &rq->rq_disk->part0;
-			kref_get(&part->ref);
+			hd_struct_get(part);
 		}
 		part_round_stats(cpu, part);
 		part_inc_in_flight(part, rw);
@@ -1818,7 +1818,7 @@ static void blk_account_io_done(struct request *req)
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part, rw);
 
-		kref_put(&part->ref, __delete_partition);
+		hd_struct_put(part);
 		part_stat_unlock();
 	}
 }
diff --git a/block/blk-merge.c b/block/blk-merge.c
index b06b83b89d89..00b7d31b38a2 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -356,7 +356,7 @@ static void blk_account_io_merge(struct request *req)
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part, rq_data_dir(req));
 
-		kref_put(&part->ref, __delete_partition);
+		hd_struct_put(part);
 		part_stat_unlock();
 	}
 }
diff --git a/block/genhd.c b/block/genhd.c
index 85c150598830..399d37ec7412 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1192,7 +1192,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
 			return NULL;
 		}
 		disk->part_tbl->part[0] = &disk->part0;
-		kref_init(&disk->part0.ref);
+
+		hd_ref_init(&disk->part0);
 
 		disk->minors = minors;
 		rand_initialize_disk(disk);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 48209f58522b..011520df71ae 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -381,10 +381,8 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
 	put_device(part_to_dev(part));
 }
 
-void __delete_partition(struct kref *ref)
+void __delete_partition(struct hd_struct *part)
 {
-	struct hd_struct *part = container_of(ref, struct hd_struct, ref);
-
 	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
 
@@ -406,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno)
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
-	kref_put(&part->ref, __delete_partition);
+	hd_struct_put(part);
 }
 
 static ssize_t whole_disk_show(struct device *dev,
@@ -505,7 +503,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	if (!dev_get_uevent_suppress(ddev))
 		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
-	kref_init(&p->ref);
+	hd_ref_init(p);
 	return p;
 
 out_free_info:
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 2ba2792a3dd4..2d0468145967 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -115,8 +115,8 @@ struct hd_struct {
 #else
 	struct disk_stats dkstats;
 #endif
+	atomic_t ref;
 	struct rcu_head rcu_head;
-	struct kref ref;
 };
 
 #define GENHD_FL_REMOVABLE			1
@@ -584,7 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
 						     sector_t len, int flags,
 						     struct partition_meta_info
 						       *info);
-extern void __delete_partition(struct kref *ref);
+extern void __delete_partition(struct hd_struct *);
 extern void delete_partition(struct gendisk *, int);
 extern void printk_all_partitions(void);
 
@@ -613,6 +613,29 @@ extern ssize_t part_fail_store(struct device *dev,
 			       const char *buf, size_t count);
 #endif /* CONFIG_FAIL_MAKE_REQUEST */
 
+static inline void hd_ref_init(struct hd_struct *part)
+{
+	atomic_set(&part->ref, 1);
+	smp_mb();
+}
+
+static inline void hd_struct_get(struct hd_struct *part)
+{
+	atomic_inc(&part->ref);
+	smp_mb__after_atomic_inc();
+}
+
+static inline int hd_struct_try_get(struct hd_struct *part)
+{
+	return atomic_inc_not_zero(&part->ref);
+}
+
+static inline void hd_struct_put(struct hd_struct *part)
+{
+	if (atomic_dec_and_test(&part->ref))
+		__delete_partition(part);
+}
+
 #else /* CONFIG_BLOCK */
 
 static inline void printk_all_partitions(void) { }
diff --git a/include/linux/kref.h b/include/linux/kref.h
index 90b9e44abf54..6cc38fc07ab7 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -23,7 +23,6 @@ struct kref {
 
 void kref_init(struct kref *kref);
 void kref_get(struct kref *kref);
-int kref_test_and_get(struct kref *kref);
 int kref_put(struct kref *kref, void (*release) (struct kref *kref));
 
 #endif /* _KREF_H_ */
diff --git a/lib/kref.c b/lib/kref.c
index e7a6e1067122..d3d227a08a4b 100644
--- a/lib/kref.c
+++ b/lib/kref.c
@@ -36,18 +36,6 @@ void kref_get(struct kref *kref)
 	smp_mb__after_atomic_inc();
 }
 
-/**
- * kref_test_and_get - increment refcount for object only if refcount is not
- * zero.
- * @kref: object.
- *
- * Return non-zero if the refcount was incremented, 0 otherwise
- */
-int kref_test_and_get(struct kref *kref)
-{
-	return atomic_inc_not_zero(&kref->refcount);
-}
-
 /**
  * kref_put - decrement refcount for object.
  * @kref: object.
-- 
cgit v1.2.3-59-g8ed1b


From b7908c1035af7652cd613991b54dbff9c8b6bd3a Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Thu, 6 Jan 2011 20:41:42 +0100
Subject: block: trace event block fix unassigned field

The "error" field in block_bio_complete is not assigned, leaving the memory area
uninitialized (keeping garbage data). Pass an additional tracepoint argument to
this event to initialize this field.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Li Zefan <lizf@cn.fujitsu.com>
CC: Alan.Brunelle@hp.com
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/dm.c              | 2 +-
 include/trace/events/block.h | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0a2b5516bc21..f48a2f359ac4 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -630,7 +630,7 @@ static void dec_pending(struct dm_io *io, int error)
 			queue_io(md, bio);
 		} else {
 			/* done with normal IO or empty flush */
-			trace_block_bio_complete(md->queue, bio);
+			trace_block_bio_complete(md->queue, bio, io_error);
 			bio_endio(bio, io_error);
 		}
 	}
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index b56c65dc105d..aba421d68f6f 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -206,15 +206,16 @@ TRACE_EVENT(block_bio_bounce,
  * block_bio_complete - completed all work on the block operation
  * @q: queue holding the block operation
  * @bio: block operation completed
+ * @error: io error value
  *
  * This tracepoint indicates there is no further work to do on this
  * block IO operation @bio.
  */
 TRACE_EVENT(block_bio_complete,
 
-	TP_PROTO(struct request_queue *q, struct bio *bio),
+	TP_PROTO(struct request_queue *q, struct bio *bio, int error),
 
-	TP_ARGS(q, bio),
+	TP_ARGS(q, bio, error),
 
 	TP_STRUCT__entry(
 		__field( dev_t,		dev		)
@@ -228,6 +229,7 @@ TRACE_EVENT(block_bio_complete,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_sector;
 		__entry->nr_sector	= bio->bi_size >> 9;
+		__entry->error		= error;
 		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
 	),
 
-- 
cgit v1.2.3-59-g8ed1b


From 30d7b9448f03f2c82d0fd44738674cc156a8ce0a Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 7 Jan 2011 08:46:59 +0100
Subject: block cfq: don't use atomic_t for cfq_queue

cfq_queue->ref is used with queue_lock hold, so ref doesn't need to be an atomic
and atomic operation is slower.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c19d015ac5a5..4cb4cf73ac00 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -96,7 +96,7 @@ struct cfq_rb_root {
  */
 struct cfq_queue {
 	/* reference count */
-	atomic_t ref;
+	int ref;
 	/* various state flags, see below */
 	unsigned int flags;
 	/* parent cfq_data */
@@ -2025,7 +2025,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq)
 	int process_refs, io_refs;
 
 	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
-	process_refs = atomic_read(&cfqq->ref) - io_refs;
+	process_refs = cfqq->ref - io_refs;
 	BUG_ON(process_refs < 0);
 	return process_refs;
 }
@@ -2065,10 +2065,10 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
 	 */
 	if (new_process_refs >= process_refs) {
 		cfqq->new_cfqq = new_cfqq;
-		atomic_add(process_refs, &new_cfqq->ref);
+		new_cfqq->ref += process_refs;
 	} else {
 		new_cfqq->new_cfqq = cfqq;
-		atomic_add(new_process_refs, &cfqq->ref);
+		cfqq->ref += new_process_refs;
 	}
 }
 
@@ -2532,9 +2532,10 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	struct cfq_data *cfqd = cfqq->cfqd;
 	struct cfq_group *cfqg, *orig_cfqg;
 
-	BUG_ON(atomic_read(&cfqq->ref) <= 0);
+	BUG_ON(cfqq->ref <= 0);
 
-	if (!atomic_dec_and_test(&cfqq->ref))
+	cfqq->ref--;
+	if (cfqq->ref)
 		return;
 
 	cfq_log_cfqq(cfqd, cfqq, "put_queue");
@@ -2837,7 +2838,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	RB_CLEAR_NODE(&cfqq->p_node);
 	INIT_LIST_HEAD(&cfqq->fifo);
 
-	atomic_set(&cfqq->ref, 0);
+	cfqq->ref = 0;
 	cfqq->cfqd = cfqd;
 
 	cfq_mark_cfqq_prio_changed(cfqq);
@@ -2973,11 +2974,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 	 * pin the queue now that it's allocated, scheduler exit will prune it
 	 */
 	if (!is_sync && !(*async_cfqq)) {
-		atomic_inc(&cfqq->ref);
+		cfqq->ref++;
 		*async_cfqq = cfqq;
 	}
 
-	atomic_inc(&cfqq->ref);
+	cfqq->ref++;
 	return cfqq;
 }
 
@@ -3679,7 +3680,7 @@ new_queue:
 	}
 
 	cfqq->allocated[rw]++;
-	atomic_inc(&cfqq->ref);
+	cfqq->ref++;
 
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
@@ -3860,6 +3861,10 @@ static void *cfq_init_queue(struct request_queue *q)
 	if (!cfqd)
 		return NULL;
 
+	/*
+	 * Don't need take queue_lock in the routine, since we are
+	 * initializing the ioscheduler, and nobody is using cfqd
+	 */
 	cfqd->cic_index = i;
 
 	/* Init root service tree */
@@ -3899,7 +3904,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	 * will not attempt to free it.
 	 */
 	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
-	atomic_inc(&cfqd->oom_cfqq.ref);
+	cfqd->oom_cfqq.ref++;
 	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
 
 	INIT_LIST_HEAD(&cfqd->cic_list);
-- 
cgit v1.2.3-59-g8ed1b


From 329a67815b596d23daf0caa588ae0800e925320f Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 7 Jan 2011 08:48:28 +0100
Subject: block cfq: don't use atomic_t for cfq_group

cfq_group->ref is used with queue_lock hold, the only exception is
cfq_set_request, which looks like a bug to me, so ref doesn't need
to be an atomic and atomic operation is slower.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4cb4cf73ac00..f083bda30546 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -207,7 +207,7 @@ struct cfq_group {
 	struct blkio_group blkg;
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	struct hlist_node cfqd_node;
-	atomic_t ref;
+	int ref;
 #endif
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
@@ -1014,7 +1014,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	 * elevator which will be dropped by either elevator exit
 	 * or cgroup deletion path depending on who is exiting first.
 	 */
-	atomic_set(&cfqg->ref, 1);
+	cfqg->ref = 1;
 
 	/*
 	 * Add group onto cgroup list. It might happen that bdi->dev is
@@ -1059,7 +1059,7 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
 
 static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
 {
-	atomic_inc(&cfqg->ref);
+	cfqg->ref++;
 	return cfqg;
 }
 
@@ -1071,7 +1071,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 
 	cfqq->cfqg = cfqg;
 	/* cfqq reference on cfqg */
-	atomic_inc(&cfqq->cfqg->ref);
+	cfqq->cfqg->ref++;
 }
 
 static void cfq_put_cfqg(struct cfq_group *cfqg)
@@ -1079,8 +1079,9 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 	struct cfq_rb_root *st;
 	int i, j;
 
-	BUG_ON(atomic_read(&cfqg->ref) <= 0);
-	if (!atomic_dec_and_test(&cfqg->ref))
+	BUG_ON(cfqg->ref <= 0);
+	cfqg->ref--;
+	if (cfqg->ref)
 		return;
 	for_each_cfqg_st(cfqg, i, j, st)
 		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
@@ -1188,7 +1189,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
 		cfqq->orig_cfqg = cfqq->cfqg;
 		cfqq->cfqg = &cfqd->root_group;
-		atomic_inc(&cfqd->root_group.ref);
+		cfqd->root_group.ref++;
 		group_changed = 1;
 	} else if (!cfqd->cfq_group_isolation
 		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
@@ -3681,12 +3682,12 @@ new_queue:
 
 	cfqq->allocated[rw]++;
 	cfqq->ref++;
-
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
 	rq->elevator_private = cic;
 	rq->elevator_private2 = cfqq;
 	rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
 	return 0;
 
 queue_fail:
@@ -3884,7 +3885,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	 * Take a reference to root group which we never drop. This is just
 	 * to make sure that cfq_put_cfqg() does not try to kfree root group
 	 */
-	atomic_set(&cfqg->ref, 1);
+	cfqg->ref = 1;
 	rcu_read_lock();
 	cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
 					(void *)cfqd, 0);
-- 
cgit v1.2.3-59-g8ed1b


From 23036f1a340beec19cc451ba9719526c4ffb3a57 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 7 Jan 2011 08:53:58 +0100
Subject: blktrace: add missing probe argument to block_bio_complete

blktrace.c block bio complete callback needs to gain a new argument to reflect
the newly added "error" tracepoint argument. This is needed to match the new
block_bio_complete TRACE_EVENT as of
commit de983a7bfcb7c020901ca6e2314cf55a4207ab5a.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Jeff Moyer <jmoyer@redhat.com>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 kernel/trace/blktrace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2b8e2ee7c0ef..fab31aff9d97 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -782,7 +782,8 @@ static void blk_add_trace_bio_bounce(void *ignore,
 }
 
 static void blk_add_trace_bio_complete(void *ignore,
-				       struct request_queue *q, struct bio *bio)
+				       struct request_queue *q, struct bio *bio,
+				       int error)
 {
 	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 797a455d2c682476c3797dbfecf5bf84c1e3b9d3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Mon, 10 Jan 2011 09:06:44 +0100
Subject: block: ensure that completion error gets properly traced

We normally just use the BIO_UPTODATE flag to signal 0/-EIO. If
we have more information available, we should pass that along to
the trace output.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 kernel/trace/blktrace.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fab31aff9d97..153562d0b93c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -758,54 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore,
  * @q:		queue the io is for
  * @bio:	the source bio
  * @what:	the action
+ * @error:	error, if any
  *
  * Description:
  *     Records an action against a bio. Will log the bio offset + size.
  *
  **/
 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
+			      u32 what, int error)
 {
 	struct blk_trace *bt = q->blk_trace;
 
 	if (likely(!bt))
 		return;
 
+	if (!error && !bio_flagged(bio, BIO_UPTODATE))
+		error = EIO;
+
 	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
-			!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+			error, 0, NULL);
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
 				     struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
 }
 
 static void blk_add_trace_bio_complete(void *ignore,
 				       struct request_queue *q, struct bio *bio,
 				       int error)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
 }
 
 static void blk_add_trace_bio_backmerge(void *ignore,
 					struct request_queue *q,
 					struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
 }
 
 static void blk_add_trace_bio_frontmerge(void *ignore,
 					 struct request_queue *q,
 					 struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
 }
 
 static void blk_add_trace_bio_queue(void *ignore,
 				    struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
 }
 
 static void blk_add_trace_getrq(void *ignore,
@@ -813,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore,
 				struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
@@ -828,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore,
 				  struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
-- 
cgit v1.2.3-59-g8ed1b