From c4cf5261f8bffd9de132b50660a69148e7575bd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Apr 2015 16:15:18 -0600 Subject: bio: skip atomic inc/dec of ->bi_remaining for non-chains Struct bio has an atomic ref count for chained bio's, and we use this to know when to end IO on the bio. However, most bio's are not chained, so we don't need to always introduce this atomic operation as part of ending IO. Add a helper to elevate the bi_remaining count, and flag the bio as now actually needing the decrement at end_io time. Rename the field to __bi_remaining to catch any current users of this doing the incrementing manually. For high IOPS workloads, this reduces the overhead of bio_endio() substantially. Tested-by: Robert Elliott Acked-by: Kent Overstreet Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- block/bio.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index f66a4eae16ee..117da319afb6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -270,7 +270,7 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; - atomic_set(&bio->bi_remaining, 1); + atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -292,8 +292,8 @@ void bio_reset(struct bio *bio) __bio_free(bio); memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags|(1 << BIO_UPTODATE); - atomic_set(&bio->bi_remaining, 1); + bio->bi_flags = flags | (1 << BIO_UPTODATE); + atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -320,7 +320,7 @@ void bio_chain(struct bio *bio, struct bio *parent) bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; - atomic_inc(&parent->bi_remaining); + bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); @@ -1741,6 +1741,23 @@ void bio_flush_dcache_pages(struct bio *bi) EXPORT_SYMBOL(bio_flush_dcache_pages); #endif +static inline bool bio_remaining_done(struct bio *bio) +{ + /* + * If we're not chaining, then ->__bi_remaining is always 1 and + * we always end io on the first invocation. + */ + if (!bio_flagged(bio, BIO_CHAIN)) + return true; + + BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); + + if (atomic_dec_and_test(&bio->__bi_remaining)) + return true; + + return false; +} + /** * bio_endio - end I/O on a bio * @bio: bio @@ -1758,15 +1775,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages); void bio_endio(struct bio *bio, int error) { while (bio) { - BUG_ON(atomic_read(&bio->bi_remaining) <= 0); - if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; - if (!atomic_dec_and_test(&bio->bi_remaining)) - return; + if (unlikely(!bio_remaining_done(bio))) + break; /* * Need to have a real endio function for chained bios, @@ -1799,7 +1814,12 @@ EXPORT_SYMBOL(bio_endio); **/ void bio_endio_nodec(struct bio *bio, int error) { - atomic_inc(&bio->bi_remaining); + /* + * If it's not flagged as a chain, we are not going to dec the count + */ + if (bio_flagged(bio, BIO_CHAIN)) + bio_inc_remaining(bio); + bio_endio(bio, error); } EXPORT_SYMBOL(bio_endio_nodec); -- cgit v1.2.3-59-g8ed1b From dac56212e8127dbc0bff7be35c508bc280213309 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Apr 2015 16:23:59 -0600 Subject: bio: skip atomic inc/dec of ->bi_cnt for most use cases Struct bio has a reference count that controls when it can be freed. 
Most uses cases is allocating the bio, which then returns with a single reference to it, doing IO, and then dropping that single reference. We can remove this atomic_dec_and_test() in the completion path, if nobody else is holding a reference to the bio. If someone does call bio_get() on the bio, then we flag the bio as now having valid count and that we must properly honor the reference count when it's being put. Tested-by: Robert Elliott Signed-off-by: Jens Axboe --- block/bio.c | 18 +++++++++++------- drivers/md/bcache/request.c | 2 +- fs/btrfs/volumes.c | 2 +- fs/xfs/xfs_aops.c | 1 - include/linux/bio.h | 16 +++++++++++++++- include/linux/blk_types.h | 3 ++- 6 files changed, 30 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 117da319afb6..c2ff8a88aef1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -271,7 +271,7 @@ void bio_init(struct bio *bio) memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; atomic_set(&bio->__bi_remaining, 1); - atomic_set(&bio->bi_cnt, 1); + atomic_set(&bio->__bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -524,13 +524,17 @@ EXPORT_SYMBOL(zero_fill_bio); **/ void bio_put(struct bio *bio) { - BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); - - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->bi_cnt)) + if (!bio_flagged(bio, BIO_REFFED)) bio_free(bio); + else { + BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->__bi_cnt)) + bio_free(bio); + } } EXPORT_SYMBOL(bio_put); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index ab43faddb447..1616f668a4cb 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -619,7 +619,7 @@ static void do_bio_hook(struct search *s, struct bio *orig_bio) bio->bi_end_io = request_endio; bio->bi_private = &s->cl; - atomic_set(&bio->bi_cnt, 3); + bio_cnt_set(bio, 3); } static void search_free(struct closure *cl) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 96aebf3bcd5b..8e8d1d1e28a5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -345,7 +345,7 @@ loop_lock: waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->bi_cnt) == 0); + BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* * if we're doing the sync list, record that our diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a56960dd1684..095f94c2d8b5 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -356,7 +356,6 @@ xfs_end_bio( { xfs_ioend_t *ioend = bio->bi_private; - ASSERT(atomic_read(&bio->bi_cnt) >= 1); ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; /* Toss bio and pass work off to an xfsdatad thread */ diff --git a/include/linux/bio.h b/include/linux/bio.h index 8bfe9eee6d1a..7486ea103f6e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -290,7 +290,21 @@ static inline unsigned bio_segments(struct bio *bio) * returns. and then bio would be freed memory when if (bio->bi_flags ...) 
* runs */ -#define bio_get(bio) atomic_inc(&(bio)->bi_cnt) +static inline void bio_get(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_REFFED); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_cnt); +} + +static inline void bio_cnt_set(struct bio *bio, unsigned int count) +{ + if (count != 1) { + bio->bi_flags |= (1 << BIO_REFFED); + smp_mb__before_atomic(); + } + atomic_set(&bio->__bi_cnt, count); +} enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8b07e0603887..93d2e7153816 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -92,7 +92,7 @@ struct bio { unsigned short bi_max_vecs; /* max bvl_vecs we can hold */ - atomic_t bi_cnt; /* pin count */ + atomic_t __bi_cnt; /* pin count */ struct bio_vec *bi_io_vec; /* the actual vec list */ @@ -123,6 +123,7 @@ struct bio { #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ #define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ +#define BIO_REFFED 12 /* bio has elevated ->bi_cnt */ /* * Flags starting here get preserved by bio_reset() - this includes -- cgit v1.2.3-59-g8ed1b From a7928c1578c550bd6f4dec62d65132e6db226c57 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:20 +0200 Subject: block: move PM request support to IDE This removes the request types and hacks from the block code and into the old IDE driver. There is a small amunt of code duplication due to this, but it's not too bad. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-exec.c | 10 --------- block/blk.h | 2 -- drivers/ide/ide-eh.c | 2 +- drivers/ide/ide-io.c | 8 +++---- drivers/ide/ide-pm.c | 56 +++++++++++++++++++++++++++++++++++----------- drivers/ide/ide-taskfile.c | 2 +- include/linux/blkdev.h | 21 +---------------- include/linux/ide.h | 19 ++++++++++++++++ 9 files changed, 70 insertions(+), 51 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index fd154b94447a..2e5020f37d55 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -285,6 +285,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q) q->request_fn(q); q->request_fn_active--; } +EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); /** * __blk_run_queue - run a single device queue diff --git a/block/blk-exec.c b/block/blk-exec.c index 9924725fa50d..3fec8a29d0fa 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq_end_io_fn *done) { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - bool is_pm_resume; WARN_ON(irqs_disabled()); WARN_ON(rq->cmd_type == REQ_TYPE_FS); @@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, return; } - /* - * need to check this before __blk_run_queue(), because rq can - * be freed before that returns. 
- */ - is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME; - spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { @@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, __elv_add_request(q, rq, where); __blk_run_queue(q); - /* the queue is stopped so it won't be run */ - if (is_pm_resume) - __blk_run_queue_uncond(q); spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); diff --git a/block/blk.h b/block/blk.h index 43b036185712..4b48d55e588e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -193,8 +193,6 @@ int blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); -void __blk_run_queue_uncond(struct request_queue *q); - int blk_dev_init(void); diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 19d809c48a8d..d6da011299f5 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -129,7 +129,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) if (cmd) ide_complete_cmd(drive, cmd, stat, err); - } else if (blk_pm_request(rq)) { + } else if (ata_pm_request(rq)) { rq->errors = 1; ide_complete_pm_rq(drive, rq); return ide_stopped; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 8e55abd03a24..669ea1e45795 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -320,7 +320,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) goto kill_rq; } - if (blk_pm_request(rq)) + if (ata_pm_request(rq)) ide_check_pm_state(drive, rq); drive->hwif->tp_ops->dev_select(drive); @@ -342,8 +342,8 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) return execute_drive_cmd(drive, rq); - else if (blk_pm_request(rq)) { - struct request_pm_state *pm = rq->special; + else if (ata_pm_request(rq)) { + struct ide_pm_state *pm = rq->special; #ifdef DEBUG_PM printk("%s: start_power_step(step: %d)\n", drive->name, pm->pm_step); @@ -538,7 +538,7 @@ repeat: * state machine. 
*/ if ((drive->dev_flags & IDE_DFLAG_BLOCKED) && - blk_pm_request(rq) == 0 && + ata_pm_request(rq) == 0 && (rq->cmd_flags & REQ_PREEMPT) == 0) { /* there should be no pending command at this point */ ide_unlock_port(hwif); diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 8d1e32d7cd97..081e43458d50 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -8,7 +8,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) ide_drive_t *pair = ide_get_pair_dev(drive); ide_hwif_t *hwif = drive->hwif; struct request *rq; - struct request_pm_state rqpm; + struct ide_pm_state rqpm; int ret; if (ide_port_acpi(hwif)) { @@ -19,7 +19,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_PM_SUSPEND; + rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; if (mesg.event == PM_EVENT_PRETHAW) @@ -38,13 +38,43 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) return ret; } +static void ide_end_sync_rq(struct request *rq, int error) +{ + complete(rq->end_io_data); +} + +static int ide_pm_execute_rq(struct request *rq) +{ + struct request_queue *q = rq->q; + DECLARE_COMPLETION_ONSTACK(wait); + + rq->end_io_data = &wait; + rq->end_io = ide_end_sync_rq; + + spin_lock_irq(q->queue_lock); + if (unlikely(blk_queue_dying(q))) { + rq->cmd_flags |= REQ_QUIET; + rq->errors = -ENXIO; + __blk_end_request_all(rq, rq->errors); + spin_unlock_irq(q->queue_lock); + return -ENXIO; + } + __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT); + __blk_run_queue_uncond(q); + spin_unlock_irq(q->queue_lock); + + wait_for_completion_io(&wait); + + return rq->errors ? -EIO : 0; +} + int generic_ide_resume(struct device *dev) { ide_drive_t *drive = to_ide_device(dev); ide_drive_t *pair = ide_get_pair_dev(drive); ide_hwif_t *hwif = drive->hwif; struct request *rq; - struct request_pm_state rqpm; + struct ide_pm_state rqpm; int err; if (ide_port_acpi(hwif)) { @@ -59,13 +89,13 @@ int generic_ide_resume(struct device *dev) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_PM_RESUME; + rq->cmd_type = REQ_TYPE_ATA_PM_RESUME; rq->cmd_flags |= REQ_PREEMPT; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; - err = blk_execute_rq(drive->queue, NULL, rq, 1); + err = ide_pm_execute_rq(rq); blk_put_request(rq); if (err == 0 && dev->driver) { @@ -80,7 +110,7 @@ int generic_ide_resume(struct device *dev) void ide_complete_power_step(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; #ifdef DEBUG_PM printk(KERN_INFO "%s: complete_power_step(step: %d)\n", @@ -110,7 +140,7 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq) ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; struct ide_cmd cmd = { }; switch (pm->pm_step) { @@ -182,7 +212,7 @@ out_do_tf: void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; unsigned long flags; ide_complete_power_step(drive, rq); @@ -191,10 +221,10 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) #ifdef DEBUG_PM printk("%s: completing PM request, %s\n", 
drive->name, - (rq->cmd_type == REQ_TYPE_PM_SUSPEND) ? "suspend" : "resume"); + (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND) ? "suspend" : "resume"); #endif spin_lock_irqsave(q->queue_lock, flags); - if (rq->cmd_type == REQ_TYPE_PM_SUSPEND) + if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND) blk_stop_queue(q); else drive->dev_flags &= ~IDE_DFLAG_BLOCKED; @@ -208,13 +238,13 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) void ide_check_pm_state(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; - if (rq->cmd_type == REQ_TYPE_PM_SUSPEND && + if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND && pm->pm_step == IDE_PM_START_SUSPEND) /* Mark drive blocked when starting the suspend sequence. */ drive->dev_flags |= IDE_DFLAG_BLOCKED; - else if (rq->cmd_type == REQ_TYPE_PM_RESUME && + else if (rq->cmd_type == REQ_TYPE_ATA_PM_RESUME && pm->pm_step == IDE_PM_START_RESUME) { /* * The first thing we do on wakeup is to wait for BSY bit to diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index dabb88b1cbec..0979e126fff1 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -186,7 +186,7 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive) tf->command == ATA_CMD_CHK_POWER) { struct request *rq = hwif->rq; - if (blk_pm_request(rq)) + if (ata_pm_request(rq)) ide_complete_pm_rq(drive, rq); else ide_finish_cmd(drive, cmd, stat); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c2829ba5e738..2da818a48097 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -30,7 +30,6 @@ struct scsi_ioctl_command; struct request_queue; struct elevator_queue; -struct request_pm_state; struct blk_trace; struct request; struct sg_io_hdr; @@ -75,8 +74,6 @@ struct request_list { enum rq_cmd_type_bits { REQ_TYPE_FS = 1, /* fs request */ REQ_TYPE_BLOCK_PC, /* scsi command */ - REQ_TYPE_PM_SUSPEND, /* suspend request */ - REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; @@ -207,19 +204,6 @@ static inline unsigned short req_get_ioprio(struct request *req) return req->ioprio; } -/* - * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME - * requests. Some step values could eventually be made generic. - */ -struct request_pm_state -{ - /* PM state machine step value, currently driver specific */ - int pm_step; - /* requested PM state value (S1, S2, S3, S4, ...) 
*/ - u32 pm_state; - void* data; /* for driver use */ -}; - #include struct blk_queue_ctx; @@ -601,10 +585,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) (((rq)->cmd_flags & REQ_STARTED) && \ ((rq)->cmd_type == REQ_TYPE_FS)) -#define blk_pm_request(rq) \ - ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \ - (rq)->cmd_type == REQ_TYPE_PM_RESUME) - #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) /* rq->queuelist of dequeued request must be list_empty() */ @@ -838,6 +818,7 @@ extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); extern void __blk_stop_queue(struct request_queue *q); extern void __blk_run_queue(struct request_queue *q); +extern void __blk_run_queue_uncond(struct request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, diff --git a/include/linux/ide.h b/include/linux/ide.h index 9856b7d455d9..a633898f36ac 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -44,8 +44,14 @@ enum ata_cmd_type_bits { REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, REQ_TYPE_ATA_PC, REQ_TYPE_ATA_SENSE, /* sense request */ + REQ_TYPE_ATA_PM_SUSPEND,/* suspend request */ + REQ_TYPE_ATA_PM_RESUME, /* resume request */ }; +#define ata_pm_request(rq) \ + ((rq)->cmd_type == REQ_TYPE_ATA_PM_SUSPEND || \ + (rq)->cmd_type == REQ_TYPE_ATA_PM_RESUME) + /* Error codes returned in rq->errors to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, @@ -1321,6 +1327,19 @@ struct ide_port_info { u8 udma_mask; }; +/* + * State information carried for REQ_TYPE_ATA_PM_SUSPEND and REQ_TYPE_ATA_PM_RESUME + * requests. + */ +struct ide_pm_state { + /* PM state machine step value, currently driver specific */ + int pm_step; + /* requested PM state value (S1, S2, S3, S4, ...) */ + u32 pm_state; + void* data; /* for driver use */ +}; + + int ide_pci_init_one(struct pci_dev *, const struct ide_port_info *, void *); int ide_pci_init_two(struct pci_dev *, struct pci_dev *, const struct ide_port_info *, void *); -- cgit v1.2.3-59-g8ed1b From dd6cf3e18decb4895503db1752bb5500c4dd588d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:28 -0700 Subject: blk: clean up plug Current code looks like inner plug gets flushed with a blk_finish_plug(). Actually it's a nop. All requests/callbacks are added to current->plug, while only outmost plug is assigned to current->plug. So inner plug always has empty request/callback list, which makes blk_flush_plug_list() a nop. This tries to make the code more clear. Signed-off-by: Shaohua Li Reviewed-by: Jeff Moyer Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 2e5020f37d55..9dcfb8ec554b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3032,21 +3032,20 @@ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; + /* + * If this is a nested plug, don't actually assign it. + */ + if (tsk->plug) + return; + INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); - /* - * If this is a nested plug, don't actually assign it. It will be - * flushed on its own. 
+ * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier */ - if (!tsk->plug) { - /* - * Store ordering should not be needed here, since a potential - * preempt will imply a full memory barrier - */ - tsk->plug = plug; - } + tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); @@ -3193,10 +3192,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) void blk_finish_plug(struct blk_plug *plug) { + if (plug != current->plug) + return; blk_flush_plug_list(plug, false); - if (plug == current->plug) - current->plug = NULL; + current->plug = NULL; } EXPORT_SYMBOL(blk_finish_plug); -- cgit v1.2.3-59-g8ed1b From e6c4438ba7cb615448492849970aaf0aaa1cc973 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 8 May 2015 10:51:30 -0700 Subject: blk-mq: fix plugging in blk_sq_make_request The following appears in blk_sq_make_request: /* * If we have multiple hardware queues, just go directly to * one of those for sync IO. */ We clearly don't have multiple hardware queues, here! This comment was introduced with this commit 07068d5b8e (blk-mq: split make request handler for multi and single queue): We want slightly different behavior from them: - On single queue devices, we currently use the per-process plug for deferred IO and for merging. - On multi queue devices, we don't use the per-process plug, but we want to go straight to hardware for SYNC IO. The old code had this: use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); and that was converted to: use_plug = !is_flush_fua && !is_sync; which is not equivalent. For the single queue case, that second half of the && expression is always true. So, what I think was actually inteded follows (and this more closely matches what is done in blk_queue_bio). V2: delete the 'likely', which should not be a big deal Signed-off-by: Jeff Moyer Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ade8a2d1b0aa..a65acffde19a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1309,16 +1309,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - unsigned int use_plug, request_count = 0; + struct blk_plug *plug; + unsigned int request_count = 0; struct blk_map_ctx data; struct request *rq; - /* - * If we have multiple hardware queues, just go directly to - * one of those for sync IO. - */ - use_plug = !is_flush_fua && !is_sync; - blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { @@ -1326,7 +1321,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) return; } - if (use_plug && !blk_queue_nomerges(q) && + if (!is_flush_fua && !blk_queue_nomerges(q) && blk_attempt_plug_merge(q, bio, &request_count)) return; @@ -1345,21 +1340,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) * utilize that to temporarily store requests until the task is * either done or scheduled away. 
*/ - if (use_plug) { - struct blk_plug *plug = current->plug; - - if (plug) { - blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) - trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } - list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); - return; + plug = current->plug; + if (plug) { + blk_mq_bio_to_request(rq, bio); + if (list_empty(&plug->mq_list)) + trace_block_plug(q); + else if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); } + list_add_tail(&rq->queuelist, &plug->mq_list); + blk_mq_put_ctx(data.ctx); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { -- cgit v1.2.3-59-g8ed1b From 239ad215f0d8388cbe6c09a0fab8ad8ff5dba420 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:31 -0700 Subject: blk-mq: avoid re-initialize request which is failed in direct dispatch If we directly issue a request and it fails, we use blk_mq_merge_queue_io(). But we already assigned bio to a request in blk_mq_bio_to_request. blk_mq_merge_queue_io shouldn't run blk_mq_bio_to_request again. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a65acffde19a..f13d0de42f53 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1284,6 +1284,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_end_request(rq, rq->errors); goto done; } + blk_mq_insert_request(rq, false, true, true); + return; } } -- cgit v1.2.3-59-g8ed1b From f984df1f0f71ef96254411fc3576a10ae561be71 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:32 -0700 Subject: blk-mq: do limited block plug for multiple queue case plug is still helpful for workload with IO merge, but it can be harmful otherwise especially with multiple hardware queues, as there is (supposed) no lock contention in this case and plug can introduce latency. For multiple queues, we do limited plug, eg plug only if there is request merge. If a request doesn't have merge with following request, the requet will be dispatched immediately. V2: check blk_queue_nomerges() as suggested by Jeff. Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq.c | 82 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index f13d0de42f53..902c2eb9a0e7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1224,6 +1224,38 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } +static int blk_mq_direct_issue_request(struct request *rq) +{ + int ret; + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, + rq->mq_ctx->cpu); + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; + + /* + * For OK queue, we are done. For error, kill it. Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + if (ret == BLK_MQ_RQ_QUEUE_OK) + return 0; + else { + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + return 0; + } + return -1; + } +} + /* * Multiple hardware queue variant. 
This will not use per-process plugs, * but will attempt to bypass the hctx queueing if we can go straight to @@ -1235,6 +1267,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); struct blk_map_ctx data; struct request *rq; + unsigned int request_count = 0; + struct blk_plug *plug; blk_queue_bounce(q, &bio); @@ -1243,6 +1277,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) return; } + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) + return; + rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) return; @@ -1253,40 +1291,39 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } + plug = current->plug; /* * If the driver supports defer issued based on 'last', then * queue it up like normal since we can potentially save some * CPU this way. */ - if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { - struct blk_mq_queue_data bd = { - .rq = rq, - .list = NULL, - .last = 1 - }; - int ret; + if (((plug && !blk_queue_nomerges(q)) || is_sync) && + !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + struct request *old_rq = NULL; blk_mq_bio_to_request(rq, bio); /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done + * we do limited pluging. If bio can be merged, do merge. + * Otherwise the existing request in the plug list will be + * issued. So the plug list will have one request at most */ - ret = q->mq_ops->queue_rq(data.hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) - goto done; - else { - __blk_mq_requeue_request(rq); - - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); - goto done; + if (plug) { + if (!list_empty(&plug->mq_list)) { + old_rq = list_first_entry(&plug->mq_list, + struct request, queuelist); + list_del_init(&old_rq->queuelist); } - blk_mq_insert_request(rq, false, true, true); + list_add_tail(&rq->queuelist, &plug->mq_list); + } else /* is_sync */ + old_rq = rq; + blk_mq_put_ctx(data.ctx); + if (!old_rq) return; - } + if (!blk_mq_direct_issue_request(old_rq)) + return; + blk_mq_insert_request(old_rq, false, true, true); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1299,7 +1336,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } -done: blk_mq_put_ctx(data.ctx); } -- cgit v1.2.3-59-g8ed1b From 5b3f341f098d60da2970758db6a05bd851eb6b39 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:33 -0700 Subject: blk-mq: make plug work for mutiple disks and queues Last patch makes plug work for multiple queue case. However it only works for single disk case, because it assumes only one request in the plug list. If a task is accessing multiple disks, eg MD/DM, the assumption is wrong. Let blk_attempt_plug_merge() record request from the same queue. V2: use NULL parameter in !mq case. Fix a bug. Add comments in blk_attempt_plug_merge to make it less (hopefully) confusion. 
Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 15 ++++++++++++--- block/blk-mq.c | 14 +++++++++----- block/blk.h | 3 ++- 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 9dcfb8ec554b..f0be754c7781 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1522,7 +1522,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) + unsigned int *request_count, + struct request **same_queue_rq) { struct blk_plug *plug; struct request *rq; @@ -1542,8 +1543,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; - if (rq->q == q) + if (rq->q == q) { (*request_count)++; + /* + * Only blk-mq multiple hardware queues case checks the + * rq in the same queue, there should be only one such + * rq in a queue + **/ + if (same_queue_rq) + *same_queue_rq = rq; + } if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; @@ -1608,7 +1617,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * any locks. */ if (!blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; spin_lock_irq(q->queue_lock); diff --git a/block/blk-mq.c b/block/blk-mq.c index 902c2eb9a0e7..31df47443699 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1269,6 +1269,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) struct request *rq; unsigned int request_count = 0; struct blk_plug *plug; + struct request *same_queue_rq = NULL; blk_queue_bounce(q, &bio); @@ -1278,7 +1279,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) } if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return; rq = blk_mq_map_request(q, bio, &data); @@ -1309,9 +1310,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) * issued. So the plug list will have one request at most */ if (plug) { - if (!list_empty(&plug->mq_list)) { - old_rq = list_first_entry(&plug->mq_list, - struct request, queuelist); + /* + * The plug list might get flushed before this. 
If that + * happens, same_queue_rq is invalid and plug list is empty + **/ + if (same_queue_rq && !list_empty(&plug->mq_list)) { + old_rq = same_queue_rq; list_del_init(&old_rq->queuelist); } list_add_tail(&rq->queuelist, &plug->mq_list); @@ -1360,7 +1364,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) } if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; rq = blk_mq_map_request(q, bio, &data); diff --git a/block/blk.h b/block/blk.h index 4b48d55e588e..026d9594142b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count); + unsigned int *request_count, + struct request **same_queue_rq); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); -- cgit v1.2.3-59-g8ed1b From 4ecd4fef3a074c8bb43c391a57742c422469ebbd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2015 09:38:13 +0200 Subject: block: use an atomic_t for mq_freeze_depth lockdep gets unhappy about the not disabling irqs when using the queue_lock around it. Instead of trying to fix that up just switch to an atomic_t and get rid of the lock. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 24 ++++++++++-------------- include/linux/blkdev.h | 2 +- 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 31df47443699..c382a34fe5ac 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, - !q->mq_freeze_depth || blk_queue_dying(q)); + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; if (ret) @@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) void blk_mq_freeze_queue_start(struct request_queue *q) { - bool freeze; + int freeze_depth; - spin_lock_irq(q->queue_lock); - freeze = !q->mq_freeze_depth++; - spin_unlock_irq(q->queue_lock); - - if (freeze) { + freeze_depth = atomic_inc_return(&q->mq_freeze_depth); + if (freeze_depth == 1) { percpu_ref_kill(&q->mq_usage_counter); blk_mq_run_hw_queues(q, false); } @@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake; + int freeze_depth; - spin_lock_irq(q->queue_lock); - wake = !--q->mq_freeze_depth; - WARN_ON_ONCE(q->mq_freeze_depth < 0); - spin_unlock_irq(q->queue_lock); - if (wake) { + freeze_depth = atomic_dec_return(&q->mq_freeze_depth); + WARN_ON_ONCE(freeze_depth < 0); + if (!freeze_depth) { percpu_ref_reinit(&q->mq_usage_counter); wake_up_all(&q->mq_freeze_wq); } @@ -2081,7 +2077,7 @@ void blk_mq_free_queue(struct request_queue *q) /* Basically redo blk_mq_init_queue with queue frozen */ static void blk_mq_queue_reinit(struct request_queue *q) { - WARN_ON_ONCE(!q->mq_freeze_depth); + WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); blk_mq_sysfs_unregister(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2da818a48097..bc917956a6d0 100644 --- a/include/linux/blkdev.h +++ 
b/include/linux/blkdev.h @@ -444,7 +444,7 @@ struct request_queue { struct mutex sysfs_lock; int bypass_depth; - int mq_freeze_depth; + atomic_t mq_freeze_depth; #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; -- cgit v1.2.3-59-g8ed1b From b25de9d6da49b1a8760a89672283128aa8c78345 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2015 21:41:01 +0200 Subject: block: remove BIO_EOPNOTSUPP Since the big barrier rewrite/removal in 2007 we never fail FLUSH or FUA requests, which means we can remove the magic BIO_EOPNOTSUPP flag to help propagating those to the buffer_head layer. Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Moyer Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/bounce.c | 3 --- fs/btrfs/disk-io.c | 11 ++--------- fs/btrfs/extent_io.c | 2 -- fs/buffer.c | 10 ---------- fs/ext4/page-io.c | 1 - fs/nilfs2/segbuf.c | 12 ------------ include/linux/blk_types.h | 1 - 7 files changed, 2 insertions(+), 38 deletions(-) (limited to 'block') diff --git a/block/bounce.c b/block/bounce.c index ab21ba203d5c..4bac72579c1f 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -128,9 +128,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) struct bio_vec *bvec, *org_vec; int i; - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); - /* * free up bounce indirect pages used */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2ef9a4b72d06..e08a926fe12c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3269,11 +3269,8 @@ static int write_dev_supers(struct btrfs_device *device, */ static void btrfs_end_empty_barrier(struct bio *bio, int err) { - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); - } if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -3301,11 +3298,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk_in_rcu("BTRFS: disabling barriers on dev %s\n", - rcu_str_deref(device->name)); - device->nobarriers = 1; - } else if (!bio_flagged(bio, BIO_UPTODATE)) { + if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 43af5a61ad25..1e155299abc0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2767,8 +2767,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, else btrfsic_submit_bio(rw, bio); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; bio_put(bio); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index c7a5602d01ee..efd85e0e8660 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2938,10 +2938,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) { struct buffer_head *bh = bio->bi_private; - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - } - if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) set_bit(BH_Quiet, &bh->b_state); @@ -3041,13 +3037,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) if (buffer_prio(bh)) rw |= REQ_PRIO; - bio_get(bio); submit_bio(rw, bio); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - - bio_put(bio); return ret; } EXPORT_SYMBOL_GPL(_submit_bh); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5765f88b3904..c5d81e8d84c3 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -359,7 +359,6 @@ void 
ext4_io_submit(struct ext4_io_submit *io) if (bio) { bio_get(io->io_bio); submit_bio(io->io_op, io->io_bio); - BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); bio_put(io->io_bio); } io->io_bio = NULL; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc3a9efdaab8..42468e5ab3e7 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err) const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - /* to be detected by nilfs_segbuf_submit_bio() */ - } - if (!uptodate) atomic_inc(&segbuf->sb_err); @@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; - bio_get(bio); submit_bio(mode, bio); segbuf->sb_nbio++; - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - bio_put(bio); - err = -EOPNOTSUPP; - goto failed; - } - bio_put(bio); wi->bio = NULL; wi->rest_blocks -= wi->end - wi->start; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 93d2e7153816..daf95915d104 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -118,7 +118,6 @@ struct bio { #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ -#define BIO_EOPNOTSUPP 7 /* not supported */ #define BIO_NULL_MAPPED 8 /* contains invalid user pages */ #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ -- cgit v1.2.3-59-g8ed1b From 97ca223c3b37ed12a5b67a5dc6247e5a4799d337 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2015 21:41:02 +0200 Subject: block: remove unused BIO_RW_BLOCK and BIO_EOF flags Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 2 -- include/linux/blk_types.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f0be754c7781..de474b5dee2b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1725,8 +1725,6 @@ static void handle_bad_sector(struct bio *bio) bio->bi_rw, (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); - - set_bit(BIO_EOF, &bio->bi_flags); } #ifdef CONFIG_FAIL_MAKE_REQUEST diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index daf95915d104..09c7a2cd48ef 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -112,8 +112,6 @@ struct bio { * bio flags */ #define BIO_UPTODATE 0 /* ok after I/O completion */ -#define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ -#define BIO_EOF 2 /* out-out-bounds error */ #define BIO_SEG_VALID 3 /* bi_phys_segments valid */ #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ -- cgit v1.2.3-59-g8ed1b From be32417796c2b8a83fe4cbece83bea96ab9e378f Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Wed, 6 May 2015 12:26:22 +0800 Subject: block: export blkdev_reread_part() and __blkdev_reread_part() This patch exports blkdev_reread_part() for block drivers, also introduce __blkdev_reread_part(). For some drivers, such as loop, reread of partitions can be run from the release path, and bd_mutex may already be held prior to calling ioctl_by_bdev(bdev, BLKRRPART, 0), so introduce __blkdev_reread_part for use in such cases. 
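As a rough sketch of how the two variants are intended to be used (the foo_* helpers below are hypothetical and only illustrate the calling convention; the in-tree motivation is loop's release path, where bd_mutex is already held):

#include <linux/fs.h>
#include <linux/blkdev.h>

/* Caller already holds bdev->bd_mutex, e.g. deep in a release path. */
static int foo_rescan_locked(struct block_device *bdev)
{
	/* __blkdev_reread_part() takes no lock; it only asserts via lockdep that bd_mutex is held */
	return __blkdev_reread_part(bdev);
}

/* No locks held: let the helper acquire and drop bd_mutex itself. */
static int foo_rescan(struct block_device *bdev)
{
	return blkdev_reread_part(bdev);
}
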
CC: Christoph Hellwig CC: Jens Axboe CC: Tejun Heo CC: Alexander Viro CC: Markus Pargmann CC: Stefan Weinhuber CC: Stefan Haberland CC: Sebastian Ott CC: Fabian Frederick CC: Ming Lei CC: David Herrmann CC: Andrew Morton CC: Peter Zijlstra CC: nbd-general@lists.sourceforge.net CC: linux-s390@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jarod Wilson Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/ioctl.c | 28 +++++++++++++++++++++++++--- include/linux/fs.h | 3 +++ 2 files changed, 28 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 7d8befde2aca..203cb4aeea8b 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -150,21 +150,43 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user } } -static int blkdev_reread_part(struct block_device *bdev) +/* + * This is an exported API for the block driver, and will not + * acquire bd_mutex. This API should be used in case that + * caller has held bd_mutex already. + */ +int __blkdev_reread_part(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; - int res; if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + lockdep_assert_held(&bdev->bd_mutex); + + return rescan_partitions(disk, bdev); +} +EXPORT_SYMBOL(__blkdev_reread_part); + +/* + * This is an exported API for the block driver, and will + * try to acquire bd_mutex. If bd_mutex has been held already + * in current context, please call __blkdev_reread_part(). + */ +int blkdev_reread_part(struct block_device *bdev) +{ + int res; + if (!mutex_trylock(&bdev->bd_mutex)) return -EBUSY; - res = rescan_partitions(disk, bdev); + res = __blkdev_reread_part(bdev); mutex_unlock(&bdev->bd_mutex); + return res; } +EXPORT_SYMBOL(blkdev_reread_part); static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, uint64_t len, int secure) diff --git a/include/linux/fs.h b/include/linux/fs.h index 35ec87e490b1..1ef63900243c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2279,6 +2279,9 @@ extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); extern void blkdev_put(struct block_device *bdev, fmode_t mode); +extern int __blkdev_reread_part(struct block_device *bdev); +extern int blkdev_reread_part(struct block_device *bdev); + #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); extern void bd_unlink_disk_holder(struct block_device *bdev, -- cgit v1.2.3-59-g8ed1b From b04a5636a665f5529fdf69ee7e5512156196f31c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 6 May 2015 12:26:27 +0800 Subject: block: replace trylock with mutex_lock in blkdev_reread_part() The only possible problem with using mutex_lock() instead of trylock is deadlock. If there aren't any locks held before calling blkdev_reread_part(), deadlock can't be caused by this conversion. If there are locks held before calling blkdev_reread_part(), and if these locks aren't required in the open/close handlers and the I/O path, deadlock shouldn't be caused either. Both user space's ioctl(BLKRRPART) and md_setup_drive() from init/do_mounts_md.c belong to the 1st case, so the conversion is safe for these two callers. For loop, the previous patches in this patchset have fixed the ABBA lock dependency, so the conversion is OK.
For nbd, tx_lock is held when calling the function: - both open and release won't hold the lock - when blkdev_reread_part() is run, the I/O thread has already been stopped, so tx_lock won't be acquired in the I/O path at that time. - so the conversion won't cause deadlock for nbd. For dasd, dasd_open(), dasd_release() and the request function don't acquire any mutex/semaphore, so the conversion should be safe. Reviewed-by: Christoph Hellwig Tested-by: Jarod Wilson Acked-by: Jarod Wilson Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/ioctl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 203cb4aeea8b..8061eba42887 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -174,13 +174,18 @@ EXPORT_SYMBOL(__blkdev_reread_part); * This is an exported API for the block driver, and will * try to acquire bd_mutex. If bd_mutex has been held already * in current context, please call __blkdev_reread_part(). + * + * Make sure the held locks in current context aren't required + * in open()/close() handler and I/O path for avoiding ABBA deadlock: + * - bd_mutex is held before calling block driver's open/close + * handler + * - reading partition table may submit I/O to the block device */ int blkdev_reread_part(struct block_device *bdev) { int res; - if (!mutex_trylock(&bdev->bd_mutex)) - return -EBUSY; + mutex_lock(&bdev->bd_mutex); res = __blkdev_reread_part(bdev); mutex_unlock(&bdev->bd_mutex); -- cgit v1.2.3-59-g8ed1b From 326e1dbb57368087a36607aaebe9795b8d5453e5 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 22 May 2015 09:14:03 -0400 Subject: block: remove management of bi_remaining when restoring original bi_end_io Commit c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") regressed all existing callers that followed this pattern: 1) saving a bio's original bi_end_io 2) wiring up an intermediate bi_end_io 3) restoring the original bi_end_io from intermediate bi_end_io 4) calling bio_endio() to execute the restored original bi_end_io The regression was due to BIO_CHAIN only ever getting set if bio_inc_remaining() is called. For the above pattern it isn't set until step 3 above (step 2 would've needed to establish BIO_CHAIN). As such the first bio_endio(), in step 2 above, never decremented __bi_remaining before calling the intermediate bi_end_io -- leaving __bi_remaining with the value 1 instead of 0. When bio_inc_remaining() occurred during step 3 it brought it to a value of 2. When the second bio_endio() was called, in step 4 above, it should've called the original bi_end_io but it didn't because there was an extra reference that wasn't dropped (due to atomic operations being optimized away since BIO_CHAIN wasn't set upfront). Fix this issue by removing the __bi_remaining management complexity for all callers that use the above pattern -- bio_chain() is the only interface that _needs_ to be concerned with __bi_remaining. For the above pattern callers just expect the bi_end_io they set to get called! Remove bio_endio_nodec() and also remove all bio_inc_remaining() calls that aren't associated with the bio_chain() interface. Also, the bio_inc_remaining() interface has been moved local to bio.c.
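To make the four-step pattern above concrete, here is a minimal sketch of the save/restore ->bi_end_io convention this commit repairs (the foo_* names are hypothetical; dm-cache, dm-verity, dm-thin and bcache in the diff below all follow this shape). After this change, a plain bio_endio() in step 4 invokes the restored handler with no extra __bi_remaining bookkeeping:

#include <linux/bio.h>

struct foo_hook {
	bio_end_io_t	*orig_end_io;	/* saved original completion handler */
	void		*orig_private;
};

static void foo_intermediate_endio(struct bio *bio, int error)
{
	struct foo_hook *h = bio->bi_private;

	/* step 3: restore the original completion handler */
	bio->bi_end_io = h->orig_end_io;
	bio->bi_private = h->orig_private;

	/* step 4: no bio_inc_remaining()/bio_endio_nodec() needed any more */
	bio_endio(bio, error);
}

static void foo_hook_bio(struct bio *bio, struct foo_hook *h)
{
	/* steps 1 and 2: save the original handler and install ours */
	h->orig_end_io = bio->bi_end_io;
	h->orig_private = bio->bi_private;
	bio->bi_end_io = foo_intermediate_endio;
	bio->bi_private = h;
}
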
Fixes: c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 ++-- block/bio.c | 35 ++++++++++++++--------------------- drivers/md/bcache/io.c | 2 +- drivers/md/dm-cache-target.c | 6 ------ drivers/md/dm-raid1.c | 2 -- drivers/md/dm-snap.c | 1 - drivers/md/dm-thin.c | 9 +++------ drivers/md/dm-verity.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/volumes.c | 16 +++++----------- fs/btrfs/volumes.h | 2 -- include/linux/bio.h | 12 ------------ 12 files changed, 27 insertions(+), 66 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5cbd5d9ea61d..0436c21db7f2 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); + bio_endio(bio, error); } /** @@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error) */ if (error) { bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); + bio_endio(bio, error); return; } diff --git a/block/bio.c b/block/bio.c index c2ff8a88aef1..259197d97de1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error) bio_put(bio); } +/* + * Increment chain count for the bio. Make sure the CHAIN flag update + * is visible before the raised count. + */ +static inline void bio_inc_remaining(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} + /** * bio_chain - chain bio completions * @bio: the target bio @@ -1756,8 +1767,10 @@ static inline bool bio_remaining_done(struct bio *bio) BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); - if (atomic_dec_and_test(&bio->__bi_remaining)) + if (atomic_dec_and_test(&bio->__bi_remaining)) { + clear_bit(BIO_CHAIN, &bio->bi_flags); return true; + } return false; } @@ -1808,26 +1821,6 @@ void bio_endio(struct bio *bio, int error) } EXPORT_SYMBOL(bio_endio); -/** - * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining - * @bio: bio - * @error: error, if any - * - * For code that has saved and restored bi_end_io; thing hard before using this - * function, probably you should've cloned the entire bio. 
- **/ -void bio_endio_nodec(struct bio *bio, int error) -{ - /* - * If it's not flagged as a chain, we are not going to dec the count - */ - if (bio_flagged(bio, BIO_CHAIN)) - bio_inc_remaining(bio); - - bio_endio(bio, error); -} -EXPORT_SYMBOL(bio_endio_nodec); - /** * bio_split - split a bio * @bio: bio to split diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index fa028fa82df4..cb64e64a4789 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -55,7 +55,7 @@ static void bch_bio_submit_split_done(struct closure *cl) s->bio->bi_end_io = s->bi_end_io; s->bio->bi_private = s->bi_private; - bio_endio_nodec(s->bio, 0); + bio_endio(s->bio, 0); closure_debug_destroy(&s->cl); mempool_free(s, s->p->bio_split_hook); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 705eb7b99d69..41b2594a80c6 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -86,12 +86,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) { bio->bi_end_io = h->bi_end_io; bio->bi_private = h->bi_private; - - /* - * Must bump bi_remaining to allow bio to complete with - * restored bi_end_io. - */ - bio_inc_remaining(bio); } /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index d6a1c096b777..743fa9bbae9e 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1254,8 +1254,6 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) dm_bio_restore(bd, bio); bio_record->details.bi_bdev = NULL; - bio_inc_remaining(bio); - queue_bio(ms, bio, rw); return DM_ENDIO_INCOMPLETE; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 8bfeae218531..7c82d3ccce87 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1478,7 +1478,6 @@ out: if (full_bio) { full_bio->bi_end_io = pe->full_bio_end_io; full_bio->bi_private = pe->full_bio_private; - bio_inc_remaining(full_bio); } increment_pending_exceptions_done_count(); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 342dbdad6131..e852602c0091 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -793,10 +793,9 @@ static void inc_remap_and_issue_cell(struct thin_c *tc, static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { - if (m->bio) { + if (m->bio) m->bio->bi_end_io = m->saved_bi_end_io; - bio_inc_remaining(m->bio); - } + cell_error(m->tc->pool, m->cell); list_del(&m->list); mempool_free(m, m->tc->pool->mapping_pool); @@ -810,10 +809,8 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) int r; bio = m->bio; - if (bio) { + if (bio) bio->bi_end_io = m->saved_bi_end_io; - bio_inc_remaining(bio); - } if (m->err) { cell_error(pool, m->cell); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 66616db33e6f..bb9c6a00e4b0 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -459,7 +459,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error) bio->bi_end_io = io->orig_bi_end_io; bio->bi_private = io->orig_bi_private; - bio_endio_nodec(bio, error); + bio_endio(bio, error); } static void verity_work(struct work_struct *w) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e08a926fe12c..0bccf18dc1dc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1745,7 +1745,7 @@ static void end_workqueue_fn(struct btrfs_work *work) bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); - 
bio_endio_nodec(bio, error); + bio_endio(bio, error); } static int cleaner_kthread(void *arg) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8e8d1d1e28a5..dac77d42a9ab 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5585,10 +5585,10 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) { - if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) - bio_endio_nodec(bio, err); - else - bio_endio(bio, err); + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio_endio(bio, err); + btrfs_put_bbio(bbio); } @@ -5632,8 +5632,6 @@ static void btrfs_end_bio(struct bio *bio, int err) bio = bbio->orig_bio; } - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio @@ -5815,8 +5813,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) /* Shoud be the original bio. */ WARN_ON(bio != bbio->orig_bio); - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; @@ -5897,10 +5893,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (dev_nr < total_devs - 1) { bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ - } else { + } else bio = first_bio; - bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; - } submit_stripe_bio(root, bbio, bio, bbio->stripes[dev_nr].physical, dev_nr, rw, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ebc31331a837..cedae0356558 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -292,8 +292,6 @@ struct btrfs_bio_stripe { struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); -#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) - struct btrfs_bio { atomic_t refs; atomic_t stripes_pending; diff --git a/include/linux/bio.h b/include/linux/bio.h index 7486ea103f6e..f0291cf64cc5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -427,7 +427,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) } extern void bio_endio(struct bio *, int); -extern void bio_endio_nodec(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); @@ -658,17 +657,6 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } -/* - * Increment chain count for the bio. Make sure the CHAIN flag update - * is visible before the raised count. - */ -static inline void bio_inc_remaining(struct bio *bio) -{ - bio->bi_flags |= (1 << BIO_CHAIN); - smp_mb__before_atomic(); - atomic_inc(&bio->__bi_remaining); -} - /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. -- cgit v1.2.3-59-g8ed1b From 5f1b670d0bef508a5554d92525f5f6d00d640b38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 May 2015 09:14:04 -0400 Subject: block, dm: don't copy bios for request clones Currently dm-multipath has to clone the bios for every request sent to the lower devices, which wastes cpu cycles and ties down memory. This patch instead adds a new REQ_CLONE flag that instructs req_bio_endio to not complete bios attached to a request, which we set on clone requests similar to bios in a flush sequence. 
From 5f1b670d0bef508a5554d92525f5f6d00d640b38 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 22 May 2015 09:14:04 -0400
Subject: block, dm: don't copy bios for request clones

Currently dm-multipath has to clone the bios for every request sent
to the lower devices, which wastes cpu cycles and ties down memory.

This patch instead adds a new REQ_CLONE flag that instructs
req_bio_endio to not complete bios attached to a request, which we
set on clone requests similar to bios in a flush sequence.

With this change I/O errors on a path failure only get propagated to
dm-multipath, which can then either resubmit the I/O or complete the
bios on the original request.

I've done some basic testing of this on a Linux target with ALUA
support, and it survives path failures during I/O nicely.

Signed-off-by: Christoph Hellwig
Signed-off-by: Mike Snitzer
Signed-off-by: Jens Axboe
---
 block/blk-core.c          |  94 +++----------------------
 drivers/md/dm-table.c     |  25 ++++---
 drivers/md/dm.c           | 171 +++++++++++-----------------------------------
 drivers/md/dm.h           |   5 +-
 include/linux/blk_types.h |   2 +
 include/linux/blkdev.h    |   6 +-
 6 files changed, 73 insertions(+), 230 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index de474b5dee2b..aa819a58ea24 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(blk_rq_init);
 static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, int error)
 {
-	if (error)
+	if (error && !(rq->cmd_flags & REQ_CLONE))
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
@@ -128,7 +128,8 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	bio_advance(bio, nbytes);
 	/* don't actually finish bio if it's part of flush sequence */
-	if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+	if (bio->bi_iter.bi_size == 0 &&
+	    !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE)))
 		bio_endio(bio, error);
 }
@@ -2909,95 +2910,22 @@ int blk_lld_busy(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_lld_busy);
-/**
- * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
- * @rq: the clone request to be cleaned up
- *
- * Description:
- *     Free all bios in @rq for a cloned request.
- */
-void blk_rq_unprep_clone(struct request *rq)
-{
-	struct bio *bio;
-
-	while ((bio = rq->bio) != NULL) {
-		rq->bio = bio->bi_next;
-
-		bio_put(bio);
-	}
-}
-EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
-
-/*
- * Copy attributes of the original request to the clone request.
- * The actual data parts (e.g. ->cmd, ->sense) are not copied.
- */
-static void __blk_rq_prep_clone(struct request *dst, struct request *src)
+void blk_rq_prep_clone(struct request *dst, struct request *src)
 {
 	dst->cpu = src->cpu;
-	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
+	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK);
+	dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE;
 	dst->cmd_type = src->cmd_type;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
 	dst->nr_phys_segments = src->nr_phys_segments;
 	dst->ioprio = src->ioprio;
 	dst->extra_len = src->extra_len;
-}
-
-/**
- * blk_rq_prep_clone - Helper function to setup clone request
- * @rq: the request to be setup
- * @rq_src: original request to be cloned
- * @bs: bio_set that bios for clone are allocated from
- * @gfp_mask: memory allocation mask for bio
- * @bio_ctr: setup function to be called for each clone bio.
- *           Returns %0 for success, non %0 for failure.
- * @data: private data to be passed to @bio_ctr
- *
- * Description:
- *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
- *     are not copied, and copying such parts is the caller's responsibility.
- *     Also, pages which the original bios are pointing to are not copied
- *     and the cloned bios just point same pages.
- *     So cloned bios must be completed before original bios, which means
- *     the caller must complete @rq before @rq_src.
- */
-int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-		      struct bio_set *bs, gfp_t gfp_mask,
-		      int (*bio_ctr)(struct bio *, struct bio *, void *),
-		      void *data)
-{
-	struct bio *bio, *bio_src;
-
-	if (!bs)
-		bs = fs_bio_set;
-
-	__rq_for_each_bio(bio_src, rq_src) {
-		bio = bio_clone_fast(bio_src, gfp_mask, bs);
-		if (!bio)
-			goto free_and_out;
-
-		if (bio_ctr && bio_ctr(bio, bio_src, data))
-			goto free_and_out;
-
-		if (rq->bio) {
-			rq->biotail->bi_next = bio;
-			rq->biotail = bio;
-		} else
-			rq->bio = rq->biotail = bio;
-	}
-
-	__blk_rq_prep_clone(rq, rq_src);
-
-	return 0;
-
-free_and_out:
-	if (bio)
-		bio_put(bio);
-	blk_rq_unprep_clone(rq);
-
-	return -ENOMEM;
+	dst->bio = src->bio;
+	dst->biotail = src->biotail;
+	dst->cmd = src->cmd;
+	dst->cmd_len = src->cmd_len;
+	dst->sense = src->sense;
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index d9b00b8565c6..3662b2e49b8d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -940,21 +940,28 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 {
 	unsigned type = dm_table_get_type(t);
 	unsigned per_bio_data_size = 0;
-	struct dm_target *tgt;
 	unsigned i;
-	if (unlikely(type == DM_TYPE_NONE)) {
+	switch (type) {
+	case DM_TYPE_BIO_BASED:
+		for (i = 0; i < t->num_targets; i++) {
+			struct dm_target *tgt = t->targets + i;
+
+			per_bio_data_size = max(per_bio_data_size,
+						tgt->per_bio_data_size);
+		}
+		t->mempools = dm_alloc_bio_mempools(t->integrity_supported,
+						    per_bio_data_size);
+		break;
+	case DM_TYPE_REQUEST_BASED:
+	case DM_TYPE_MQ_REQUEST_BASED:
+		t->mempools = dm_alloc_rq_mempools(md, type);
+		break;
+	default:
 		DMWARN("no table type is set, can't allocate mempools");
 		return -EINVAL;
 	}
-	if (type == DM_TYPE_BIO_BASED)
-		for (i = 0; i < t->num_targets; i++) {
-			tgt = t->targets + i;
-			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
-		}
-
-	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
 	if (!t->mempools)
 		return -ENOMEM;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a930b72314ac..38837f8ea327 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -990,57 +990,6 @@ static void clone_endio(struct bio *bio, int error)
 	dec_pending(io, error);
 }
-/*
- * Partial completion handling for request-based dm
- */
-static void end_clone_bio(struct bio *clone, int error)
-{
-	struct dm_rq_clone_bio_info *info =
-		container_of(clone, struct dm_rq_clone_bio_info, clone);
-	struct dm_rq_target_io *tio = info->tio;
-	struct bio *bio = info->orig;
-	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
-
-	bio_put(clone);
-
-	if (tio->error)
-		/*
-		 * An error has already been detected on the request.
-		 * Once error occurred, just let clone->end_io() handle
-		 * the remainder.
-		 */
-		return;
-	else if (error) {
-		/*
-		 * Don't notice the error to the upper layer yet.
-		 * The error handling decision is made by the target driver,
-		 * when the request is completed.
-		 */
-		tio->error = error;
-		return;
-	}
-
-	/*
-	 * I/O for the bio successfully completed.
-	 * Notice the data completion to the upper layer.
-	 */
-
-	/*
-	 * bios are processed from the head of the list.
-	 * So the completing bio should always be rq->bio.
-	 * If it's not, something wrong is happening.
-	 */
-	if (tio->orig->bio != bio)
-		DMERR("bio completion is going in the middle of the request");
-
-	/*
-	 * Update the original request.
-	 * Do not use blk_end_request() here, because it may complete
-	 * the original request before the clone, and break the ordering.
-	 */
-	blk_update_request(tio->orig, 0, nr_bytes);
-}
-
 static struct dm_rq_target_io *tio_from_request(struct request *rq)
 {
 	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
@@ -1089,8 +1038,6 @@ static void free_rq_clone(struct request *clone, bool must_be_mapped)
 	WARN_ON_ONCE(must_be_mapped && !clone->q);
-	blk_rq_unprep_clone(clone);
-
 	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
 		/* stacked on blk-mq queue(s) */
 		tio->ti->type->release_clone_rq(clone);
@@ -1821,39 +1768,13 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 	dm_complete_request(rq, r);
 }
-static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
-				 void *data)
+static void setup_clone(struct request *clone, struct request *rq,
+			struct dm_rq_target_io *tio)
 {
-	struct dm_rq_target_io *tio = data;
-	struct dm_rq_clone_bio_info *info =
-		container_of(bio, struct dm_rq_clone_bio_info, clone);
-
-	info->orig = bio_orig;
-	info->tio = tio;
-	bio->bi_end_io = end_clone_bio;
-
-	return 0;
-}
-
-static int setup_clone(struct request *clone, struct request *rq,
-		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
-	int r;
-
-	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
-			      dm_rq_bio_constructor, tio);
-	if (r)
-		return r;
-
-	clone->cmd = rq->cmd;
-	clone->cmd_len = rq->cmd_len;
-	clone->sense = rq->sense;
+	blk_rq_prep_clone(clone, rq);
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
-
 	tio->clone = clone;
-
-	return 0;
 }
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
@@ -1874,12 +1795,7 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 		clone = tio->clone;
 	blk_rq_init(NULL, clone);
-	if (setup_clone(clone, rq, tio, gfp_mask)) {
-		/* -ENOMEM */
-		if (alloc_clone)
-			free_clone_request(md, clone);
-		return NULL;
-	}
+	setup_clone(clone, rq, tio);
 	return clone;
 }
@@ -1973,11 +1889,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		}
 		if (IS_ERR(clone))
 			return DM_MAPIO_REQUEUE;
-		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
-			/* -ENOMEM */
-			ti->type->release_clone_rq(clone);
-			return DM_MAPIO_REQUEUE;
-		}
+		setup_clone(clone, rq, tio);
 	}
 	switch (r) {
@@ -2431,8 +2343,6 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 		goto out;
 	}
-	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
-
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
 	md->rq_pool = p->rq_pool;
@@ -3536,48 +3446,23 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
-					    unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
+					     unsigned per_bio_data_size)
 {
-	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-	struct kmem_cache *cachep = NULL;
-	unsigned int pool_size = 0;
+	struct dm_md_mempools *pools;
+	unsigned int pool_size = dm_get_reserved_bio_based_ios();
 	unsigned int front_pad;
+	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
 		return NULL;
-	type = filter_md_type(type, md);
+	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
+		offsetof(struct dm_target_io, clone);
-	switch (type) {
-	case DM_TYPE_BIO_BASED:
-		cachep = _io_cache;
-		pool_size = dm_get_reserved_bio_based_ios();
-		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
-		break;
-	case DM_TYPE_REQUEST_BASED:
-		cachep = _rq_tio_cache;
-		pool_size = dm_get_reserved_rq_based_ios();
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-		/* fall through to setup remaining rq-based pools */
-	case DM_TYPE_MQ_REQUEST_BASED:
-		if (!pool_size)
-			pool_size = dm_get_reserved_rq_based_ios();
-		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
-		/* per_bio_data_size is not used. See __bind_mempools(). */
-		WARN_ON(per_bio_data_size != 0);
-		break;
-	default:
-		BUG();
-	}
-
-	if (cachep) {
-		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-		if (!pools->io_pool)
-			goto out;
-	}
+	pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
+	if (!pools->io_pool)
+		goto out;
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
@@ -3587,10 +3472,34 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 		goto out;
 	return pools;
-
 out:
 	dm_free_md_mempools(pools);
+	return NULL;
+}
+
+struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
+					    unsigned type)
+{
+	unsigned int pool_size = dm_get_reserved_rq_based_ios();
+	struct dm_md_mempools *pools;
+	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+	if (!pools)
+		return NULL;
+
+	if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
+		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+		if (!pools->rq_pool)
+			goto out;
+	}
+
+	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
+	if (!pools->io_pool)
+		goto out;
+
+	return pools;
+out:
+	dm_free_md_mempools(pools);
 	return NULL;
 }
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 6123c2bf9150..e6e66d087b26 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -222,8 +222,9 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
-					    unsigned integrity, unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
+					     unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, unsigned type);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 /*
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3f4ded0b1a34..45a6be89957c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -192,6 +192,7 @@ enum rq_flag_bits {
 	__REQ_HASHED,		/* on IO scheduler merge hash */
 	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NO_TIMEOUT,	/* requests may never expire */
+	__REQ_CLONE,		/* cloned bios */
 	__REQ_NR_BITS,		/* stops here */
 };
@@ -246,5 +247,6 @@ enum rq_flag_bits {
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
 #define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
 #define REQ_NO_TIMEOUT		(1ULL << __REQ_NO_TIMEOUT)
+#define REQ_CLONE		(1ULL << __REQ_CLONE)
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bc917956a6d0..9ded80da2c16 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -775,11 +775,7 @@ extern void blk_add_request_payload(struct request *rq, struct page *page,
 		unsigned int len);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
 extern int blk_lld_busy(struct request_queue *q);
-extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-			     struct bio_set *bs, gfp_t gfp_mask,
-			     int (*bio_ctr)(struct bio *, struct bio *, void *),
-			     void *data);
-extern void blk_rq_unprep_clone(struct request *rq);
+extern void blk_rq_prep_clone(struct request *rq, struct request *rq_src);
 extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
-- 
cgit v1.2.3-59-g8ed1b
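With this series applied, preparing a request clone in a request-based stacking driver reduces to sharing the original request's bio list: blk_rq_prep_clone() copies the attributes, points ->bio/->biotail at the original bios and sets REQ_CLONE, so the block core leaves those bios untouched at completion time and the owner of the original request completes or resubmits them itself. A minimal sketch of a caller, modelled on the setup_clone() hunk above; the completion hook and its data are invented for the example:

#include <linux/blkdev.h>

/* hypothetical completion hook; a real driver would finish or requeue
 * the original request from here */
static void example_clone_end_io(struct request *clone, int error)
{
}

/*
 * Illustrative sketch only, not part of the patch: clone setup after this
 * change.  No per-bio cloning and no bio mempool are needed any more.
 */
static void example_setup_clone(struct request *clone, struct request *orig,
				void *end_io_data)
{
	blk_rq_prep_clone(clone, orig);		/* share bios, set REQ_CLONE */
	clone->end_io = example_clone_end_io;
	clone->end_io_data = end_io_data;
}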