From b5dd2f6047ca108001328aac0e8588edd15f1778 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2014 13:22:57 +0000 Subject: block: loop: improve performance via blk-mq The conversion is fairly straightforward: a workqueue is used to dispatch the loop block driver's requests. One big change is that requests are submitted to the backing file/device concurrently via the workqueue, so throughput may improve considerably. Since write requests over the same file often run exclusively, don't handle them concurrently, to avoid extra context-switch cost, possible lock contention and work-scheduling cost. Also, with blk-mq there is an opportunity to get loop I/O merged before it is submitted to the backing file/device. In the following test: - base: v3.19-rc2-2041231 - loop over file in ext4 file system on SSD disk - bs: 4k, libaio, io depth: 64, O_DIRECT, num of jobs: 1 - throughput: IOPS ------------------------------------------------------ | | base | base with loop-mq | delta | ------------------------------------------------------ | randread | 1740 | 25318 | +1355%| ------------------------------------------------------ | read | 42196 | 51771 | +22.6%| ------------------------------------------------------ | randwrite | 35709 | 34624 | -3% | ------------------------------------------------------ | write | 39137 | 40326 | +3% | ------------------------------------------------------ So loop-mq improves throughput for both read and randread; meanwhile, write and randwrite performance is basically not hurt. Another benefit is that the loop driver code gets much simpler after the blk-mq conversion, so the patch can be thought of as a cleanup too. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 327 +++++++++++++++++++++++++-------------------------- drivers/block/loop.h | 17 ++- 2 files changed, 174 insertions(+), 170 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6cb1beb47c25..c678eb24a7b5 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -75,6 +75,7 @@ #include #include #include +#include #include "loop.h" #include @@ -85,6 +86,8 @@ static DEFINE_MUTEX(loop_index_mutex); static int max_part; static int part_shift; +static struct workqueue_struct *loop_wq; + /* * Transfer functions */ @@ -466,109 +469,36 @@ out: return ret; } -/* - * Add bio to back of pending list - */ -static void loop_add_bio(struct loop_device *lo, struct bio *bio) -{ - lo->lo_bio_count++; - bio_list_add(&lo->lo_bio_list, bio); - } - -/* - * Grab first pending buffer - */ -static struct bio *loop_get_bio(struct loop_device *lo) -{ - lo->lo_bio_count--; - return bio_list_pop(&lo->lo_bio_list); -} - -static void loop_make_request(struct request_queue *q, struct bio *old_bio) -{ - struct loop_device *lo = q->queuedata; - int rw = bio_rw(old_bio); - - if (rw == READA) - rw = READ; - - BUG_ON(!lo || (rw != READ && rw != WRITE)); - - spin_lock_irq(&lo->lo_lock); - if (lo->lo_state != Lo_bound) - goto out; - if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) - goto out; - if (lo->lo_bio_count >= q->nr_congestion_on) - wait_event_lock_irq(lo->lo_req_wait, - lo->lo_bio_count < q->nr_congestion_off, - lo->lo_lock); - loop_add_bio(lo, old_bio); - wake_up(&lo->lo_event); - spin_unlock_irq(&lo->lo_lock); - return; - -out: - spin_unlock_irq(&lo->lo_lock); - bio_io_error(old_bio); -} - struct switch_request { struct file *file; struct completion wait; }; -static void do_loop_switch(struct loop_device *, struct switch_request *); - -static inline void loop_handle_bio(struct 
loop_device *lo, struct bio *bio) +static inline int loop_handle_bio(struct loop_device *lo, struct bio *bio) { - if (unlikely(!bio->bi_bdev)) { - do_loop_switch(lo, bio->bi_private); - bio_put(bio); - } else { - int ret = do_bio_filebacked(lo, bio); - bio_endio(bio, ret); - } + return do_bio_filebacked(lo, bio); } /* - * worker thread that handles reads/writes to file backed loop devices, - * to avoid blocking in our make_request_fn. it also does loop decrypting - * on reads for block backed loop, as that is too heavy to do from - * b_end_io context where irqs may be disabled. - * - * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before - * calling kthread_stop(). Therefore once kthread_should_stop() is - * true, make_request will not place any more requests. Therefore - * once kthread_should_stop() is true and lo_bio is NULL, we are - * done with the loop. + * Do the actual switch; called from the BIO completion routine */ -static int loop_thread(void *data) +static void do_loop_switch(struct loop_device *lo, struct switch_request *p) { - struct loop_device *lo = data; - struct bio *bio; - - set_user_nice(current, MIN_NICE); - - while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { - - wait_event_interruptible(lo->lo_event, - !bio_list_empty(&lo->lo_bio_list) || - kthread_should_stop()); - - if (bio_list_empty(&lo->lo_bio_list)) - continue; - spin_lock_irq(&lo->lo_lock); - bio = loop_get_bio(lo); - if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off) - wake_up(&lo->lo_req_wait); - spin_unlock_irq(&lo->lo_lock); + struct file *file = p->file; + struct file *old_file = lo->lo_backing_file; + struct address_space *mapping; - BUG_ON(!bio); - loop_handle_bio(lo, bio); - } + /* if no new file, only flush of queued bios requested */ + if (!file) + return; - return 0; + mapping = file->f_mapping; + mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); + lo->lo_backing_file = file; + lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? 
+ mapping->host->i_bdev->bd_block_size : PAGE_SIZE; + lo->old_gfp_mask = mapping_gfp_mask(mapping); + mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); } /* @@ -579,15 +509,18 @@ static int loop_thread(void *data) static int loop_switch(struct loop_device *lo, struct file *file) { struct switch_request w; - struct bio *bio = bio_alloc(GFP_KERNEL, 0); - if (!bio) - return -ENOMEM; - init_completion(&w.wait); + w.file = file; - bio->bi_private = &w; - bio->bi_bdev = NULL; - loop_make_request(lo->lo_queue, bio); - wait_for_completion(&w.wait); + + /* freeze queue and wait for completion of scheduled requests */ + blk_mq_freeze_queue(lo->lo_queue); + + /* do the switch action */ + do_loop_switch(lo, &w); + + /* unfreeze */ + blk_mq_unfreeze_queue(lo->lo_queue); + return 0; } @@ -596,38 +529,9 @@ static int loop_switch(struct loop_device *lo, struct file *file) */ static int loop_flush(struct loop_device *lo) { - /* loop not yet configured, no running thread, nothing to flush */ - if (!lo->lo_thread) - return 0; - return loop_switch(lo, NULL); } -/* - * Do the actual switch; called from the BIO completion routine - */ -static void do_loop_switch(struct loop_device *lo, struct switch_request *p) -{ - struct file *file = p->file; - struct file *old_file = lo->lo_backing_file; - struct address_space *mapping; - - /* if no new file, only flush of queued bios requested */ - if (!file) - goto out; - - mapping = file->f_mapping; - mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? - mapping->host->i_bdev->bd_block_size : PAGE_SIZE; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); -out: - complete(&p->wait); -} - - /* * loop_change_fd switched the backing store of a loopback device to * a new file. 
This is useful for operating system installers to free up @@ -889,12 +793,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->transfer = transfer_none; lo->ioctl = NULL; lo->lo_sizelimit = 0; - lo->lo_bio_count = 0; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - bio_list_init(&lo->lo_bio_list); - if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_flush(lo->lo_queue, REQ_FLUSH); @@ -906,14 +807,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, set_blocksize(bdev, lo_blocksize); - lo->lo_thread = kthread_create(loop_thread, lo, "loop%d", - lo->lo_number); - if (IS_ERR(lo->lo_thread)) { - error = PTR_ERR(lo->lo_thread); - goto out_clr; - } lo->lo_state = Lo_bound; - wake_up_process(lo->lo_thread); if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) @@ -925,18 +819,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, bdgrab(bdev); return 0; -out_clr: - loop_sysfs_exit(lo); - lo->lo_thread = NULL; - lo->lo_device = NULL; - lo->lo_backing_file = NULL; - lo->lo_flags = 0; - set_capacity(lo->lo_disk, 0); - invalidate_bdev(bdev); - bd_set_size(bdev, 0); - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask); - lo->lo_state = Lo_unbound; out_putf: fput(file); out: @@ -1012,11 +894,6 @@ static int loop_clr_fd(struct loop_device *lo) spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_rundown; - spin_unlock_irq(&lo->lo_lock); - - kthread_stop(lo->lo_thread); - - spin_lock_irq(&lo->lo_lock); lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); @@ -1028,7 +905,6 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_offset = 0; lo->lo_sizelimit = 0; lo->lo_encrypt_key_size = 0; - lo->lo_thread = NULL; memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); @@ -1601,6 +1477,108 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); +static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + + blk_mq_start_request(bd->rq); + + if (cmd->rq->cmd_flags & REQ_WRITE) { + struct loop_device *lo = cmd->rq->q->queuedata; + bool need_sched = true; + + spin_lock_irq(&lo->lo_lock); + if (lo->write_started) + need_sched = false; + else + lo->write_started = true; + list_add_tail(&cmd->list, &lo->write_cmd_head); + spin_unlock_irq(&lo->lo_lock); + + if (need_sched) + queue_work(loop_wq, &lo->write_work); + } else { + queue_work(loop_wq, &cmd->read_work); + } + + return BLK_MQ_RQ_QUEUE_OK; +} + +static void loop_handle_cmd(struct loop_cmd *cmd) +{ + const bool write = cmd->rq->cmd_flags & REQ_WRITE; + struct loop_device *lo = cmd->rq->q->queuedata; + int ret = -EIO; + struct bio *bio; + + if (lo->lo_state != Lo_bound) + goto failed; + + if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) + goto failed; + + ret = 0; + __rq_for_each_bio(bio, cmd->rq) + ret |= loop_handle_bio(lo, bio); + + failed: + if (ret) + cmd->rq->errors = -EIO; + blk_mq_complete_request(cmd->rq); +} + +static void loop_queue_write_work(struct work_struct *work) +{ + struct loop_device *lo = + container_of(work, struct loop_device, write_work); + LIST_HEAD(cmd_list); + + spin_lock_irq(&lo->lo_lock); + repeat: + list_splice_init(&lo->write_cmd_head, &cmd_list); + 
spin_unlock_irq(&lo->lo_lock); + + while (!list_empty(&cmd_list)) { + struct loop_cmd *cmd = list_first_entry(&cmd_list, + struct loop_cmd, list); + list_del_init(&cmd->list); + loop_handle_cmd(cmd); + } + + spin_lock_irq(&lo->lo_lock); + if (!list_empty(&lo->write_cmd_head)) + goto repeat; + lo->write_started = false; + spin_unlock_irq(&lo->lo_lock); +} + +static void loop_queue_read_work(struct work_struct *work) +{ + struct loop_cmd *cmd = + container_of(work, struct loop_cmd, read_work); + + loop_handle_cmd(cmd); +} + +static int loop_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); + + cmd->rq = rq; + INIT_WORK(&cmd->read_work, loop_queue_read_work); + + return 0; +} + +static struct blk_mq_ops loop_mq_ops = { + .queue_rq = loop_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = loop_init_request, +}; + static int loop_add(struct loop_device **l, int i) { struct loop_device *lo; @@ -1627,16 +1605,28 @@ static int loop_add(struct loop_device **l, int i) i = err; err = -ENOMEM; - lo->lo_queue = blk_alloc_queue(GFP_KERNEL); - if (!lo->lo_queue) + lo->tag_set.ops = &loop_mq_ops; + lo->tag_set.nr_hw_queues = 1; + lo->tag_set.queue_depth = 128; + lo->tag_set.numa_node = NUMA_NO_NODE; + lo->tag_set.cmd_size = sizeof(struct loop_cmd); + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + lo->tag_set.driver_data = lo; + + err = blk_mq_alloc_tag_set(&lo->tag_set); + if (err) goto out_free_idr; - /* - * set queue make_request_fn - */ - blk_queue_make_request(lo->lo_queue, loop_make_request); + lo->lo_queue = blk_mq_init_queue(&lo->tag_set); + if (IS_ERR_OR_NULL(lo->lo_queue)) { + err = PTR_ERR(lo->lo_queue); + goto out_cleanup_tags; + } lo->lo_queue->queuedata = lo; + INIT_LIST_HEAD(&lo->write_cmd_head); + INIT_WORK(&lo->write_work, loop_queue_write_work); + disk = lo->lo_disk = alloc_disk(1 << part_shift); if (!disk) goto out_free_queue; @@ -1664,9 +1654,6 @@ static int loop_add(struct loop_device **l, int i) disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); lo->lo_number = i; - lo->lo_thread = NULL; - init_waitqueue_head(&lo->lo_event); - init_waitqueue_head(&lo->lo_req_wait); spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; @@ -1680,6 +1667,8 @@ static int loop_add(struct loop_device **l, int i) out_free_queue: blk_cleanup_queue(lo->lo_queue); +out_cleanup_tags: + blk_mq_free_tag_set(&lo->tag_set); out_free_idr: idr_remove(&loop_index_idr, i); out_free_dev: @@ -1692,6 +1681,7 @@ static void loop_remove(struct loop_device *lo) { del_gendisk(lo->lo_disk); blk_cleanup_queue(lo->lo_queue); + blk_mq_free_tag_set(&lo->tag_set); put_disk(lo->lo_disk); kfree(lo); } @@ -1875,6 +1865,13 @@ static int __init loop_init(void) goto misc_out; } + loop_wq = alloc_workqueue("kloopd", + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0); + if (!loop_wq) { + err = -ENOMEM; + goto misc_out; + } + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, THIS_MODULE, loop_probe, NULL, NULL); @@ -1912,6 +1909,8 @@ static void __exit loop_exit(void) blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); unregister_blkdev(LOOP_MAJOR, "loop"); + destroy_workqueue(loop_wq); + misc_deregister(&loop_misc); } diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 90df5d6485b6..e20cdbbff7d5 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -13,6 +13,7 @@ #include #include #include +#include #include /* Possible 
states of device */ @@ -52,19 +53,23 @@ struct loop_device { gfp_t old_gfp_mask; spinlock_t lo_lock; - struct bio_list lo_bio_list; - unsigned int lo_bio_count; + struct list_head write_cmd_head; + struct work_struct write_work; + bool write_started; int lo_state; struct mutex lo_ctl_mutex; - struct task_struct *lo_thread; - wait_queue_head_t lo_event; - /* wait queue for incoming requests */ - wait_queue_head_t lo_req_wait; struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; struct gendisk *lo_disk; }; +struct loop_cmd { + struct work_struct read_work; + struct request *rq; + struct list_head list; +}; + /* Support for loadable transfer modules */ struct loop_func_table { int number; /* filter type */ -- cgit v1.2.3-59-g8ed1b From 301120134628c49346e5f23f7c00e7377013d5e0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2014 13:22:58 +0000 Subject: block: loop: say goodbye to bio Switch completely to block requests. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c678eb24a7b5..d9589017cc1e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -287,12 +287,12 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, return ret; } -static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) +static int lo_send(struct loop_device *lo, struct request *rq, loff_t pos) { int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t, struct page *page); struct bio_vec bvec; - struct bvec_iter iter; + struct req_iterator iter; struct page *page = NULL; int ret = 0; @@ -306,7 +306,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) do_lo_send = do_lo_send_direct_write; } - bio_for_each_segment(bvec, bio, iter) { + rq_for_each_segment(bvec, rq, iter) { ret = do_lo_send(lo, &bvec, pos, page); if (ret < 0) break; @@ -394,19 +394,22 @@ do_lo_receive(struct loop_device *lo, } static int -lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) +lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos) { struct bio_vec bvec; - struct bvec_iter iter; + struct req_iterator iter; ssize_t s; - bio_for_each_segment(bvec, bio, iter) { + rq_for_each_segment(bvec, rq, iter) { s = do_lo_receive(lo, &bvec, bsize, pos); if (s < 0) return s; if (s != bvec.bv_len) { - zero_fill_bio(bio); + struct bio *bio; + + __rq_for_each_bio(bio, rq) + zero_fill_bio(bio); break; } pos += bvec.bv_len; @@ -414,17 +417,17 @@ lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) return 0; } -static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) +static int do_req_filebacked(struct loop_device *lo, struct request *rq) { loff_t pos; int ret; - pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset; + pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - if (bio_rw(bio) == WRITE) { + if (rq->cmd_flags & REQ_WRITE) { struct file *file = lo->lo_backing_file; - if (bio->bi_rw & REQ_FLUSH) { + if (rq->cmd_flags & REQ_FLUSH) { ret = vfs_fsync(file, 0); if (unlikely(ret && ret != -EINVAL)) { ret = -EIO; @@ -438,7 +441,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) * encryption is enabled, because it may give an attacker * useful information. 
*/ - if (bio->bi_rw & REQ_DISCARD) { + if (rq->cmd_flags & REQ_DISCARD) { struct file *file = lo->lo_backing_file; int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; @@ -448,22 +451,22 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) goto out; } ret = file->f_op->fallocate(file, mode, pos, - bio->bi_iter.bi_size); + blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) ret = -EIO; goto out; } - ret = lo_send(lo, bio, pos); + ret = lo_send(lo, rq, pos); - if ((bio->bi_rw & REQ_FUA) && !ret) { + if ((rq->cmd_flags & REQ_FUA) && !ret) { ret = vfs_fsync(file, 0); if (unlikely(ret && ret != -EINVAL)) ret = -EIO; } } else - ret = lo_receive(lo, bio, lo->lo_blocksize, pos); + ret = lo_receive(lo, rq, lo->lo_blocksize, pos); out: return ret; @@ -474,11 +477,6 @@ struct switch_request { struct completion wait; }; -static inline int loop_handle_bio(struct loop_device *lo, struct bio *bio) -{ - return do_bio_filebacked(lo, bio); -} - /* * Do the actual switch; called from the BIO completion routine */ @@ -1510,7 +1508,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd) const bool write = cmd->rq->cmd_flags & REQ_WRITE; struct loop_device *lo = cmd->rq->q->queuedata; int ret = -EIO; - struct bio *bio; if (lo->lo_state != Lo_bound) goto failed; @@ -1518,9 +1515,7 @@ static void loop_handle_cmd(struct loop_cmd *cmd) if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) goto failed; - ret = 0; - __rq_for_each_bio(bio, cmd->rq) - ret |= loop_handle_bio(lo, bio); + ret = do_req_filebacked(lo, cmd->rq); failed: if (ret) -- cgit v1.2.3-59-g8ed1b From cf655d953422c846558a320ac9b3c8e659b68275 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2014 13:22:59 +0000 Subject: block: loop: introduce lo_discard() and lo_req_flush() No behaviour change, just move the handling for REQ_DISCARD and REQ_FLUSH in these two functions. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 73 ++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d9589017cc1e..018af27256c5 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -417,6 +417,40 @@ lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos) return 0; } +static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos) +{ + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do not support discard if + * encryption is enabled, because it may give an attacker + * useful information. 
+ */ + struct file *file = lo->lo_backing_file; + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + int ret; + + if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { + ret = -EOPNOTSUPP; + goto out; + } + + ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); + if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) + ret = -EIO; + out: + return ret; +} + +static int lo_req_flush(struct loop_device *lo, struct request *rq) +{ + struct file *file = lo->lo_backing_file; + int ret = vfs_fsync(file, 0); + if (unlikely(ret && ret != -EINVAL)) + ret = -EIO; + + return ret; +} + static int do_req_filebacked(struct loop_device *lo, struct request *rq) { loff_t pos; @@ -425,46 +459,19 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; if (rq->cmd_flags & REQ_WRITE) { - struct file *file = lo->lo_backing_file; - - if (rq->cmd_flags & REQ_FLUSH) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) { - ret = -EIO; - goto out; - } - } - /* - * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. - */ + if (rq->cmd_flags & REQ_FLUSH) + ret = lo_req_flush(lo, rq); + if (rq->cmd_flags & REQ_DISCARD) { - struct file *file = lo->lo_backing_file; - int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; - - if ((!file->f_op->fallocate) || - lo->lo_encrypt_key_size) { - ret = -EOPNOTSUPP; - goto out; - } - ret = file->f_op->fallocate(file, mode, pos, - blk_rq_bytes(rq)); - if (unlikely(ret && ret != -EINVAL && - ret != -EOPNOTSUPP)) - ret = -EIO; + ret = lo_discard(lo, rq, pos); goto out; } ret = lo_send(lo, rq, pos); - if ((rq->cmd_flags & REQ_FUA) && !ret) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) - ret = -EIO; - } + if ((rq->cmd_flags & REQ_FUA) && !ret) + ret = lo_req_flush(lo, rq); } else ret = lo_receive(lo, rq, lo->lo_blocksize, pos); -- cgit v1.2.3-59-g8ed1b From af65aa8ea78b296857f257bdc52338d03101813b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2014 13:23:00 +0000 Subject: block: loop: don't handle REQ_FUA explicitly The block core handles REQ_FUA via its flush state machine, so loop doesn't need to handle it explicitly.
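For context, a minimal sketch of why this is safe (an assumption drawn from the loop setup path shown earlier in this series, not a new change):

    /* loop_set_fd() already declares FLUSH-only capability; because
     * REQ_FUA is not advertised, the blk-flush state machine
     * decomposes a FUA write into WRITE followed by FLUSH before the
     * request ever reaches loop. */
    blk_queue_flush(lo->lo_queue, REQ_FLUSH);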
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 018af27256c5..185a86d73c3f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -459,23 +459,15 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; if (rq->cmd_flags & REQ_WRITE) { - if (rq->cmd_flags & REQ_FLUSH) ret = lo_req_flush(lo, rq); - - if (rq->cmd_flags & REQ_DISCARD) { + else if (rq->cmd_flags & REQ_DISCARD) ret = lo_discard(lo, rq, pos); - goto out; - } - - ret = lo_send(lo, rq, pos); - - if ((rq->cmd_flags & REQ_FUA) && !ret) - ret = lo_req_flush(lo, rq); + else + ret = lo_send(lo, rq, pos); } else ret = lo_receive(lo, rq, lo->lo_blocksize, pos); -out: return ret; } -- cgit v1.2.3-59-g8ed1b From 78e367a3601f35ea811e7f5660b7362afa2401fa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Jan 2015 15:20:25 -0700 Subject: loop: add blk-mq.h include Looks like we pull it in through other ways on x86, but we fail on sparc: In file included from drivers/block/cryptoloop.c:30:0: drivers/block/loop.h:63:24: error: field 'tag_set' has incomplete type struct blk_mq_tag_set tag_set; Add the include to loop.h, kill it from loop.c. Signed-off-by: Jens Axboe --- drivers/block/loop.c | 1 - drivers/block/loop.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 185a86d73c3f..d1f168b73634 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -75,7 +75,6 @@ #include #include #include -#include #include "loop.h" #include diff --git a/drivers/block/loop.h b/drivers/block/loop.h index e20cdbbff7d5..301c27f8323f 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 91117a20245b59f70b563523edbf998a62fc6383 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 7 Jan 2015 18:04:18 +0200 Subject: axonram: Fix bug in direct_access The 'pfn' returned by axonram was completely bogus, and has been since 2008. Signed-off-by: Matthew Wilcox Reviewed-by: Jan Kara Reviewed-by: Mathieu Desnoyers Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- arch/powerpc/sysdev/axonram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index f532c92bf99d..367533bb3d48 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -156,7 +156,7 @@ axon_ram_direct_access(struct block_device *device, sector_t sector, } *kaddr = (void *)(bank->ph_addr + offset); - *pfn = virt_to_phys(kaddr) >> PAGE_SHIFT; + *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; return 0; } -- cgit v1.2.3-59-g8ed1b From 937af5ecd0591e84ee54180fa97dcbe9bbe5fed6 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 7 Jan 2015 18:07:56 +0200 Subject: brd: Fix all partitions BUGs This patch fixes up brd's partition scheme, now enjoying the best of all worlds. The MAIN fix here is that currently, if one fdisks some partitions, a BAD bug will make all partitions point to the same start/end sectors, i.e. 0 - brd_size, and an mkfs of any partition would trash the partition table and the other partitions. Another fix is that "mount -U uuid" will not work if show_part was not specified, because of the GENHD_FL_SUPPRESS_PARTITION_INFO flag. 
We now always load without it and remove the show_part parameter. [We remove Dmitry's new module-param part_show; it is now always shown] So NOW the logic goes like this: * max_part - Just says how many minors to reserve between ramX devices. Either way, there can be as many partitions as requested. If the minors between devices run out, dynamic 259-major ids will be allocated on the fly. The default is now max_part=1, which means all partition devt(s) will be from the dynamic (259) major range. (If persistent partition minors are needed, use max_part=X.) For example, with /dev/sdX max_part is hard coded to 16. * Creation of new devices on the fly still/always works: mknod /path/devnod b 1 X fdisk -l /path/devnod This will create a new device if [X / max_part] was not already created before (just as before). Partitions on the dynamically created device will work as well; the same minor logic applies as with the pre-created ones. TODO: dynamic growth of device size, so each device can have its own size. CC: Dmitry Monakhov Tested-by: Ross Zwisler Signed-off-by: Boaz Harrosh Signed-off-by: Jens Axboe --- drivers/block/brd.c | 100 ++++++++++++++++++++-------------------------------- 1 file changed, 38 insertions(+), 62 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 89e90ec52f28..a7463c9595e7 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -438,19 +438,18 @@ static const struct block_device_operations brd_fops = { /* * And now the modules code and kernel interface. */ -static int rd_nr; -int rd_size = CONFIG_BLK_DEV_RAM_SIZE; -static int max_part; -static int part_shift; -static int part_show = 0; +static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT; module_param(rd_nr, int, S_IRUGO); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); + +int rd_size = CONFIG_BLK_DEV_RAM_SIZE; module_param(rd_size, int, S_IRUGO); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); + +static int max_part = 1; module_param(max_part, int, S_IRUGO); -MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk"); -module_param(part_show, int, S_IRUGO); -MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions"); +MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); + MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); @@ -496,16 +495,15 @@ static struct brd_device *brd_alloc(int i) brd->brd_queue->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); - disk = brd->brd_disk = alloc_disk(1 << part_shift); + disk = brd->brd_disk = alloc_disk(max_part); if (!disk) goto out_free_queue; disk->major = RAMDISK_MAJOR; - disk->first_minor = i << part_shift; + disk->first_minor = i * max_part; disk->fops = &brd_fops; disk->private_data = brd; disk->queue = brd->brd_queue; - if (!part_show) - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); @@ -527,10 +525,11 @@ static void brd_free(struct brd_device *brd) kfree(brd); } -static struct brd_device *brd_init_one(int i) +static struct brd_device *brd_init_one(int i, bool *new) { struct brd_device *brd; + *new = false; list_for_each_entry(brd, &brd_devices, brd_list) { if (brd->brd_number == i) goto out; @@ -541,6 +540,7 @@ static struct brd_device *brd_init_one(int i) add_disk(brd->brd_disk); list_add_tail(&brd->brd_list, &brd_devices); } + *new = true; out: return brd; } @@ -556,70 +556,46 @@ static struct kobject 
*brd_probe(dev_t dev, int *part, void *data) { struct brd_device *brd; struct kobject *kobj; + bool new; mutex_lock(&brd_devices_mutex); - brd = brd_init_one(MINOR(dev) >> part_shift); + brd = brd_init_one(MINOR(dev) / max_part, &new); kobj = brd ? get_disk(brd->brd_disk) : NULL; mutex_unlock(&brd_devices_mutex); - *part = 0; + if (new) + *part = 0; + return kobj; } static int __init brd_init(void) { - int i, nr; - unsigned long range; struct brd_device *brd, *next; + int i; /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. - * However, this will not work well with user space tool that doesn't - * know about such "feature". In order to not break any existing - * tool, we do the following: * - * (1) if rd_nr is specified, create that many upfront, and this - * also becomes a hard limit. - * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT - * (default 16) rd device on module load, user can further - * extend brd device by create dev node themselves and have - * kernel automatically instantiate actual device on-demand. + * (1) if rd_nr is specified, create that many upfront. else + * it defaults to CONFIG_BLK_DEV_RAM_COUNT + * (2) User can further extend brd devices by create dev node themselves + * and have kernel automatically instantiate actual device + * on-demand. Example: + * mknod /path/devnod_name b 1 X # 1 is the rd major + * fdisk -l /path/devnod_name + * If (X / max_part) was not already created it will be created + * dynamically. */ - part_shift = 0; - if (max_part > 0) { - part_shift = fls(max_part); - - /* - * Adjust max_part according to part_shift as it is exported - * to user space so that user can decide correct minor number - * if [s]he want to create more devices. - * - * Note that -1 is required because partition 0 is reserved - * for the whole disk. - */ - max_part = (1UL << part_shift) - 1; - } - - if ((1UL << part_shift) > DISK_MAX_PARTS) - return -EINVAL; - - if (rd_nr > 1UL << (MINORBITS - part_shift)) - return -EINVAL; - - if (rd_nr) { - nr = rd_nr; - range = rd_nr << part_shift; - } else { - nr = CONFIG_BLK_DEV_RAM_COUNT; - range = 1UL << MINORBITS; - } - if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) return -EIO; - for (i = 0; i < nr; i++) { + if (unlikely(!max_part)) + max_part = 1; + + for (i = 0; i < rd_nr; i++) { brd = brd_alloc(i); if (!brd) goto out_free; @@ -631,10 +607,10 @@ static int __init brd_init(void) list_for_each_entry(brd, &brd_devices, brd_list) add_disk(brd->brd_disk); - blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range, + blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS, THIS_MODULE, brd_probe, NULL, NULL); - printk(KERN_INFO "brd: module loaded\n"); + pr_info("brd: module loaded\n"); return 0; out_free: @@ -644,21 +620,21 @@ out_free: } unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + pr_info("brd: module NOT loaded !!!\n"); return -ENOMEM; } static void __exit brd_exit(void) { - unsigned long range; struct brd_device *brd, *next; - range = rd_nr ? 
rd_nr << part_shift : 1UL << MINORBITS; - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); - blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range); + blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS); unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + + pr_info("brd: module unloaded\n"); } module_init(brd_init); -- cgit v1.2.3-59-g8ed1b From c8fa31730fc72a83bad762112a872675b3e76886 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 7 Jan 2015 18:09:38 +0200 Subject: brd: Request from fdisk 4k alignment Because the direct_access() API returns a PFN, partitions had better start on a 4K boundary, else offset ZERO of a partition will not be aligned and blk_direct_access() will fail the call. By setting blk_queue_physical_block_size(PAGE_SIZE) we can communicate this to fdisk and friends. The call to blk_queue_physical_block_size() is harmless and will not affect the Kernel behavior in any way. It is only for communication to user-mode. Before this patch, running fdisk on a default-size brd of 4M, the first sector offered is 34 (BAD); after this patch it will be 40, i.e. 8-sector aligned. Also, when entering some random partition sizes, the next partition-start sector offered is 8-sector aligned after this patch. (Please note that with fdisk the user can still enter bad values; only the offered default values will be correct.) Note that with bdev-size > 4M, fdisk will try to align on a 1M boundary (the above first sector will be 2048) in any case. CC: Martin K. Petersen Signed-off-by: Boaz Harrosh Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/block/brd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index a7463c9595e7..c01b921b1b4a 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -486,10 +486,19 @@ static struct brd_device *brd_alloc(int i) brd->brd_queue = blk_alloc_queue(GFP_KERNEL); if (!brd->brd_queue) goto out_free_dev; + blk_queue_make_request(brd->brd_queue, brd_make_request); blk_queue_max_hw_sectors(brd->brd_queue, 1024); blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); + /* This is so fdisk will align partitions on 4k, because of + * direct_access API needing 4k alignment, returning a PFN + * (This is only a problem on very small devices <= 4M, + * otherwise fdisk will align on 1M. Regardless this call + * is harmless) + */ + blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE); + brd->brd_queue->limits.discard_granularity = PAGE_SIZE; brd->brd_queue->limits.max_discard_sectors = UINT_MAX; brd->brd_queue->limits.discard_zeroes_data = 1; -- cgit v1.2.3-59-g8ed1b From 227290b4690510148d1f40f277c6c3a181fcb440 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Jan 2015 16:02:24 -0700 Subject: null_blk: suppress invalid partition info null_blk is partitionable, but it doesn't store any of the partition info. When it is loaded, you would normally see: [1226739.343608] nullb0: unknown partition table [1226739.343746] nullb1: unknown partition table which can confuse some people. Add the appropriate gendisk flag to suppress this info. 
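As a hypothetical sanity check (an illustrative session in the spirit of the mknod/fdisk examples above, not part of the patch), loading the module after this change should no longer print those per-disk lines:

    modprobe null_blk nr_devices=2
    dmesg | grep nullb    # no "nullbN: unknown partition table" lines expected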
Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index aa2224aa7caa..65cd61a4145e 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -579,7 +579,7 @@ static int null_add_dev(void) sector_div(size, bs); set_capacity(disk, size); - disk->flags |= GENHD_FL_EXT_DEVT; + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; disk->first_minor = nullb->index; disk->fops = &null_fops; -- cgit v1.2.3-59-g8ed1b From 121c7ad4ef3f3a9e697c4b8b78d74e4d6847d9e4 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Wed, 14 Jan 2015 21:01:58 -0700 Subject: NVMe: within nvme_free_queues(), delete RCU synchro/deferred free Converting to blk-mq got rid of the driver's RCU locking-on-queue, so remove the now-unnecessary RCU locking-on-queue artefacts. Reviewed-by: Keith Busch Signed-off-by: Kelly Nicole Kaoudis Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f7d083bb3bd5..f4aa64160838 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1108,21 +1108,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) static void nvme_free_queues(struct nvme_dev *dev, int lowest) { - LLIST_HEAD(q_list); - struct nvme_queue *nvmeq, *next; - struct llist_node *entry; int i; for (i = dev->queue_count - 1; i >= lowest; i--) { struct nvme_queue *nvmeq = dev->queues[i]; - llist_add(&nvmeq->node, &q_list); dev->queue_count--; dev->queues[i] = NULL; - } - synchronize_rcu(); - entry = llist_del_all(&q_list); - llist_for_each_entry_safe(nvmeq, next, entry, node) nvme_free_queue(nvmeq); + } } /** -- cgit v1.2.3-59-g8ed1b From 12cb5ce101abfaf74421f8cc9f196e708209eb79 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 15 Jan 2015 17:32:27 -0800 Subject: libata: use blk tagging libata uses its own tag management, which is duplicated effort, and the implementation is poor. And if we switch to blk-mq, tagging is built in. It's time to switch to generic tagging. The SAS driver has its own tag management, and it looks like we can't directly map the host controller tag to a SATA tag, so I just bypassed the SAS case. I changed the code/variable names for libata's tag management to make it self-contained. Only SAS will use it. Later, if libsas implements its own tag management, the tag management code in libata can be deleted easily. 
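A minimal sketch of the resulting tag flow, distilled from the diff below (no code beyond what the patch itself adds):

    /* SATA via SCSI: the block layer owns the tag, so the qc is
     * looked up directly from the request's tag. */
    qc = ata_qc_new_init(dev, cmd->request->tag);

    /* SAS (no scsi_host-backed queue): fall back to libata's own
     * bitmap allocator, ap->sas_tag_allocated. */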
Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 54 +++++++++++++++++++++++++++++++++++------------ drivers/ata/libata-scsi.c | 4 +++- drivers/ata/libata.h | 2 +- include/linux/libata.h | 5 +++-- 4 files changed, 48 insertions(+), 17 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 5c84fb5c3372..695d33df3df5 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1525,6 +1525,15 @@ static void ata_qc_complete_internal(struct ata_queued_cmd *qc) complete(waiting); } +static bool ata_valid_internal_tag(struct ata_port *ap, struct ata_device *dev, + unsigned int tag) +{ + if (!ap->scsi_host) + return !test_and_set_bit(tag, &ap->sas_tag_allocated); + return !dev->sdev || + !blk_queue_find_tag(dev->sdev->request_queue, tag); +} + /** * ata_exec_internal_sg - execute libata internal command * @dev: Device to which the command is sent @@ -1585,8 +1594,7 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, else tag = 0; - if (test_and_set_bit(tag, &ap->qc_allocated)) - BUG(); + BUG_ON(!ata_valid_internal_tag(ap, dev, tag)); qc = __ata_qc_from_tag(ap, tag); qc->tag = tag; @@ -4737,27 +4745,23 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words) * None. */ -static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) +static struct ata_queued_cmd *sas_ata_qc_new(struct ata_port *ap) { struct ata_queued_cmd *qc = NULL; unsigned int max_queue = ap->host->n_tags; unsigned int i, tag; - /* no command while frozen */ - if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) - return NULL; - - for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) { + for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) { tag = tag < max_queue ? tag : 0; /* the last tag is reserved for internal command. */ if (tag == ATA_TAG_INTERNAL) continue; - if (!test_and_set_bit(tag, &ap->qc_allocated)) { + if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) { qc = __ata_qc_from_tag(ap, tag); qc->tag = tag; - ap->last_tag = tag; + ap->sas_last_tag = tag; break; } } @@ -4765,6 +4769,24 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) return qc; } +static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap, int blktag) +{ + struct ata_queued_cmd *qc; + + /* no command while frozen */ + if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) + return NULL; + + /* SATA will directly use block tag. libsas need its own tag management */ + if (ap->scsi_host) { + qc = __ata_qc_from_tag(ap, blktag); + qc->tag = blktag; + return qc; + } + + return sas_ata_qc_new(ap); +} + /** * ata_qc_new_init - Request an available ATA command, and initialize it * @dev: Device from whom we request an available command structure @@ -4773,12 +4795,12 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) * None. 
*/ -struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev) +struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int blktag) { struct ata_port *ap = dev->link->ap; struct ata_queued_cmd *qc; - qc = ata_qc_new(ap); + qc = ata_qc_new(ap, blktag); if (qc) { qc->scsicmd = NULL; qc->ap = ap; @@ -4800,6 +4822,12 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev) * LOCKING: * spin_lock_irqsave(host lock) */ +static void sas_ata_qc_free(unsigned int tag, struct ata_port *ap) +{ + if (!ap->scsi_host) + clear_bit(tag, &ap->sas_tag_allocated); +} + void ata_qc_free(struct ata_queued_cmd *qc) { struct ata_port *ap; @@ -4812,7 +4840,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) tag = qc->tag; if (likely(ata_tag_valid(tag))) { qc->tag = ATA_TAG_POISON; - clear_bit(tag, &ap->qc_allocated); + sas_ata_qc_free(tag, ap); } } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index e364e86e84d7..94339c2aed1b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -756,7 +756,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev, { struct ata_queued_cmd *qc; - qc = ata_qc_new_init(dev); + qc = ata_qc_new_init(dev, cmd->request->tag); if (qc) { qc->scsicmd = cmd; qc->scsidone = cmd->scsi_done; @@ -3666,6 +3666,8 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht) */ shost->max_host_blocked = 1; + scsi_init_shared_tag_map(shost, host->n_tags); + rc = scsi_add_host_with_dma(ap->scsi_host, &ap->tdev, ap->host->dev); if (rc) diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 5f4e0cca56ec..40405135bbb6 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -63,7 +63,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev); extern void ata_force_cbl(struct ata_port *ap); extern u64 ata_tf_to_lba(const struct ata_taskfile *tf); extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf); -extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev); +extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag); extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev, u64 block, u32 n_block, unsigned int tf_flags, unsigned int tag); diff --git a/include/linux/libata.h b/include/linux/libata.h index 2d182413b1db..f23454762717 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -821,10 +821,10 @@ struct ata_port { unsigned int cbl; /* cable type; ATA_CBL_xxx */ struct ata_queued_cmd qcmd[ATA_MAX_QUEUE]; - unsigned long qc_allocated; + unsigned long sas_tag_allocated; /* for sas tag allocation only */ unsigned int qc_active; int nr_active_links; /* #links with active qcs */ - unsigned int last_tag; /* track next tag hw expects */ + unsigned int sas_last_tag; /* track next tag hw expects */ struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ @@ -1344,6 +1344,7 @@ extern struct device_attribute *ata_common_sdev_attrs[]; .ioctl = ata_scsi_ioctl, \ .queuecommand = ata_scsi_queuecmd, \ .can_queue = ATA_DEF_QUEUE, \ + .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ .this_id = ATA_SHT_THIS_ID, \ .cmd_per_lun = ATA_SHT_CMD_PER_LUN, \ .emulated = ATA_SHT_EMULATED, \ -- cgit v1.2.3-59-g8ed1b From 98bd4be1ba95f2fe7f543910792b7163a5de06eb Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 23 Jan 2015 19:52:07 -0800 Subject: libata: move sas ata tag allocation to libata-scsi.c Basically move the SAS ATA tag allocation to libata-scsi.c, to make it clear this stuff is just for 
sas. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 93 +++++++++-------------------------------------- drivers/ata/libata-scsi.c | 28 +++++++++++++- drivers/ata/libata.h | 2 + 3 files changed, 46 insertions(+), 77 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 695d33df3df5..d626605ac8f7 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1525,15 +1525,6 @@ static void ata_qc_complete_internal(struct ata_queued_cmd *qc) complete(waiting); } -static bool ata_valid_internal_tag(struct ata_port *ap, struct ata_device *dev, - unsigned int tag) -{ - if (!ap->scsi_host) - return !test_and_set_bit(tag, &ap->sas_tag_allocated); - return !dev->sdev || - !blk_queue_find_tag(dev->sdev->request_queue, tag); -} - /** * ata_exec_internal_sg - execute libata internal command * @dev: Device to which the command is sent @@ -1594,7 +1585,6 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, else tag = 0; - BUG_ON(!ata_valid_internal_tag(ap, dev, tag)); qc = __ata_qc_from_tag(ap, tag); qc->tag = tag; @@ -4734,80 +4724,36 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words) } /** - * ata_qc_new - Request an available ATA command, for queueing - * @ap: target port - * - * Some ATA host controllers may implement a queue depth which is less - * than ATA_MAX_QUEUE. So we shouldn't allocate a tag which is beyond - * the hardware limitation. + * ata_qc_new_init - Request an available ATA command, and initialize it + * @dev: Device from whom we request an available command structure * * LOCKING: * None. */ -static struct ata_queued_cmd *sas_ata_qc_new(struct ata_port *ap) -{ - struct ata_queued_cmd *qc = NULL; - unsigned int max_queue = ap->host->n_tags; - unsigned int i, tag; - - for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) { - tag = tag < max_queue ? tag : 0; - - /* the last tag is reserved for internal command. */ - if (tag == ATA_TAG_INTERNAL) - continue; - - if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) { - qc = __ata_qc_from_tag(ap, tag); - qc->tag = tag; - ap->sas_last_tag = tag; - break; - } - } - - return qc; -} - -static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap, int blktag) +struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag) { + struct ata_port *ap = dev->link->ap; struct ata_queued_cmd *qc; /* no command while frozen */ if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) return NULL; - /* SATA will directly use block tag. libsas need its own tag management */ - if (ap->scsi_host) { - qc = __ata_qc_from_tag(ap, blktag); - qc->tag = blktag; - return qc; + /* libsas case */ + if (!ap->scsi_host) { + tag = ata_sas_allocate_tag(ap); + if (tag < 0) + return NULL; } - return sas_ata_qc_new(ap); -} - -/** - * ata_qc_new_init - Request an available ATA command, and initialize it - * @dev: Device from whom we request an available command structure - * - * LOCKING: - * None. 
- */ - -struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int blktag) -{ - struct ata_port *ap = dev->link->ap; - struct ata_queued_cmd *qc; - - qc = ata_qc_new(ap, blktag); - if (qc) { - qc->scsicmd = NULL; - qc->ap = ap; - qc->dev = dev; + qc = __ata_qc_from_tag(ap, tag); + qc->tag = tag; + qc->scsicmd = NULL; + qc->ap = ap; + qc->dev = dev; - ata_qc_reinit(qc); - } + ata_qc_reinit(qc); return qc; } @@ -4822,12 +4768,6 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int blktag) * LOCKING: * spin_lock_irqsave(host lock) */ -static void sas_ata_qc_free(unsigned int tag, struct ata_port *ap) -{ - if (!ap->scsi_host) - clear_bit(tag, &ap->sas_tag_allocated); -} - void ata_qc_free(struct ata_queued_cmd *qc) { struct ata_port *ap; @@ -4840,7 +4780,8 @@ void ata_qc_free(struct ata_queued_cmd *qc) tag = qc->tag; if (likely(ata_tag_valid(tag))) { qc->tag = ATA_TAG_POISON; - sas_ata_qc_free(tag, ap); + if (!ap->scsi_host) + ata_sas_free_tag(tag, ap); } } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 94339c2aed1b..59c9d721b347 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -3666,7 +3666,8 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht) */ shost->max_host_blocked = 1; - scsi_init_shared_tag_map(shost, host->n_tags); + if (scsi_init_shared_tag_map(shost, host->n_tags)) + goto err_add; rc = scsi_add_host_with_dma(ap->scsi_host, &ap->tdev, ap->host->dev); @@ -4230,3 +4231,28 @@ int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap) return rc; } EXPORT_SYMBOL_GPL(ata_sas_queuecmd); + +int ata_sas_allocate_tag(struct ata_port *ap) +{ + unsigned int max_queue = ap->host->n_tags; + unsigned int i, tag; + + for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) { + tag = tag < max_queue ? tag : 0; + + /* the last tag is reserved for internal command. */ + if (tag == ATA_TAG_INTERNAL) + continue; + + if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) { + ap->sas_last_tag = tag; + return tag; + } + } + return -1; +} + +void ata_sas_free_tag(unsigned int tag, struct ata_port *ap) +{ + clear_bit(tag, &ap->sas_tag_allocated); +} diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 40405135bbb6..8c491cd8805b 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -145,6 +145,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work); extern int ata_bus_probe(struct ata_port *ap); extern int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel, unsigned int id, u64 lun); +int ata_sas_allocate_tag(struct ata_port *ap); +void ata_sas_free_tag(unsigned int tag, struct ata_port *ap); /* libata-eh.c */ -- cgit v1.2.3-59-g8ed1b From 9269e23496ddd83b974536019cf420520df0ee7f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 23 Jan 2015 20:17:59 -0800 Subject: libata: make sata_sil24 use fifo tag allocator libata uses block-layer tags now, so we can use BLK_TAG_ALLOC_FIFO to solve the sata_sil24 tag bug. 
https://bugzilla.kernel.org/show_bug.cgi?id=87101 Cc: Christoph Hellwig Signed-off-by: Shaohua Li Acked-by: Tejun Heo Acked-by: Dan Williams Signed-off-by: Jens Axboe --- drivers/ata/sata_sil24.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index d81b20ddb527..52428971ba94 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -388,6 +388,7 @@ static struct scsi_host_template sil24_sht = { .can_queue = SIL24_MAX_CMDS, .sg_tablesize = SIL24_MAX_SGE, .dma_boundary = ATA_DMA_BOUNDARY, + .tag_alloc_policy = BLK_TAG_ALLOC_FIFO, }; static struct ata_port_operations sil24_ops = { -- cgit v1.2.3-59-g8ed1b From 4ca5829ac8b1297715bf609443ade2c332f3fd0c Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Wed, 28 Jan 2015 19:35:38 +0100 Subject: MAINTAINERS: Update NBD maintainer Paul stops maintaining NBD and I will take his place from now on. Signed-off-by: Markus Pargmann Signed-off-by: Jens Axboe --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ddb9ac8d32b3..bb5e510e977e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6537,9 +6537,10 @@ F: include/uapi/linux/netrom.h F: net/netrom/ NETWORK BLOCK DEVICE (NBD) -M: Paul Clements +M: Markus Pargmann S: Maintained L: nbd-general@lists.sourceforge.net +T: git git://git.pengutronix.de/git/mpa/linux-nbd.git F: Documentation/blockdev/nbd.txt F: drivers/block/nbd.c F: include/linux/nbd.h -- cgit v1.2.3-59-g8ed1b From ac3dd5bd128b1d1ce2a037775766f39d06a4848a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Jan 2015 12:07:58 -0700 Subject: NVMe: avoid kmalloc/kfree for smaller IO Currently we allocate an nvme_iod for each IO, which holds the sg list, prps, and other IO related info. Set a threshold of 2 pages and/or 8KB of data, below which we can just embed this in the per-command pdu in blk-mq. For any IO at or below NVME_INT_PAGES and NVME_INT_BYTES, we save a kmalloc and kfree. For higher IOPS, this saves up to 1% of CPU time. Signed-off-by: Jens Axboe Reviewed-by: Keith Busch --- drivers/block/nvme-core.c | 119 ++++++++++++++++++++++++++++++++++------------ include/linux/nvme.h | 3 +- 2 files changed, 89 insertions(+), 33 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f4aa64160838..3eaa0becc52d 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -144,8 +144,37 @@ struct nvme_cmd_info { void *ctx; int aborted; struct nvme_queue *nvmeq; + struct nvme_iod iod[0]; }; +/* + * Max size of iod being embedded in the request payload + */ +#define NVME_INT_PAGES 2 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. 
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+	unsigned int ret = sizeof(struct nvme_cmd_info);
+
+	ret += sizeof(struct nvme_iod);
+	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+	return ret;
+}
+
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 				unsigned int hctx_idx)
 {
@@ -217,6 +246,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
 	cmd->aborted = 0;
 }
 
+static void *iod_get_private(struct nvme_iod *iod)
+{
+	return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+	return (iod->private & 0x01) == 0;
+}
+
 /* Special values must be less than 0x1000 */
 #define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
 #define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
@@ -360,35 +402,51 @@ static __le64 **iod_list(struct nvme_iod *iod)
 	return ((void *)iod) + iod->offset;
 }
 
-/*
- * Will slightly overestimate the number of pages needed. This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+			    unsigned nseg, unsigned long private)
 {
-	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
-	return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+	iod->private = private;
+	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+	iod->npages = -1;
+	iod->length = nbytes;
+	iod->nents = 0;
 }
 
 static struct nvme_iod *
-nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+		 unsigned long priv, gfp_t gfp)
 {
 	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
-				sizeof(__le64 *) * nvme_npages(nbytes, dev) +
+				sizeof(__le64 *) * nvme_npages(bytes, dev) +
 				sizeof(struct scatterlist) * nseg, gfp);
 
-	if (iod) {
-		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
-		iod->npages = -1;
-		iod->length = nbytes;
-		iod->nents = 0;
-		iod->first_dma = 0ULL;
-	}
+	if (iod)
+		iod_init(iod, bytes, nseg, priv);
 
 	return iod;
 }
 
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+				       gfp_t gfp)
+{
+	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+				sizeof(struct nvme_dsm_range);
+	struct nvme_iod *iod;
+
+	if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+	    size <= NVME_INT_BYTES(dev)) {
+		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+		iod = cmd->iod;
+		iod_init(iod, size, rq->nr_phys_segments,
+			 (unsigned long) rq | 0x01);
+		return iod;
+	}
+
+	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+				(unsigned long) rq, gfp);
+}
+
 void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
 	const int last_prp = dev->page_size / 8 - 1;
@@ -404,7 +462,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
 		prp_dma = next_prp_dma;
 	}
-	kfree(iod);
+
+	if (iod_should_kfree(iod))
+		kfree(iod);
 }
 
 static int nvme_error_status(u16 status)
@@ -423,7 +483,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
 	struct nvme_iod *iod = ctx;
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -579,7 +639,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
 							struct nvme_ns *ns)
 {
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_command *cmnd;
 	u16 control = 0;
 	u32 dsmgmt = 0;
@@ -620,17 +680,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *req = bd->rq;
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
-	int psegs = req->nr_phys_segments;
 	enum dma_data_direction dma_dir;
-	unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
-						sizeof(struct nvme_dsm_range);
 
-	iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
+	iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
 	if (!iod)
 		return BLK_MQ_RQ_QUEUE_BUSY;
 
-	iod->private = req;
-
 	if (req->cmd_flags & REQ_DISCARD) {
 		void *range;
 		/*
@@ -645,10 +700,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto retry_cmd;
 		iod_list(iod)[0] = (__le64 *)range;
 		iod->npages = 0;
-	} else if (psegs) {
+	} else if (req->nr_phys_segments) {
 		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 
-		sg_init_table(iod->sg, psegs);
+		sg_init_table(iod->sg, req->nr_phys_segments);
 		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
 		if (!iod->nents)
 			goto error_cmd;
@@ -1362,7 +1417,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
 		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
 		dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
-		dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
 		dev->admin_tagset.driver_data = dev;
 
 		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1483,7 +1538,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 	}
 
 	err = -ENOMEM;
-	iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
+	iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
 	if (!iod)
 		goto put_pages;
 
@@ -2109,7 +2164,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
 	dev->tagset.queue_depth =
 				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
-	dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+	dev->tagset.cmd_size = nvme_cmd_size(dev);
 	dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
 	dev->tagset.driver_data = dev;
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 258945fcabf1..19a5d4b23209 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -132,13 +132,12 @@ struct nvme_ns {
  * allocated to store the PRP list.
  */
 struct nvme_iod {
-	void *private;		/* For the use of the submitter of the I/O */
+	unsigned long private;	/* For the use of the submitter of the I/O */
 	int npages;		/* In the PRP list. 0 means small pool in use */
 	int offset;		/* Of PRP list */
 	int nents;		/* Used in scatterlist */
 	int length;		/* Of data, in bytes */
 	dma_addr_t first_dma;
-	struct list_head node;
 	struct scatterlist sg[0];
 };
-- 
cgit v1.2.3-59-g8ed1b

From b7f120b211510b80cb72c1d790d9a4531271edfa Mon Sep 17 00:00:00 2001
From: Takashi Iwai
Date: Mon, 2 Feb 2015 17:08:45 +0100
Subject: floppy: Avoid manual call of device_create_file()

Use the static attribute groups assigned to the device instead of
calling device_create_file() after device registration.

Signed-off-by: Takashi Iwai
Signed-off-by: Jiri Kosina
---
 drivers/block/floppy.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 56d46ffb08e1..a08cda955285 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev,
 
 static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
 
+static struct attribute *floppy_dev_attrs[] = {
+	&dev_attr_cmos.attr,
+	NULL
+};
+
+ATTRIBUTE_GROUPS(floppy_dev);
+
 static void floppy_device_release(struct device *dev)
 {
 }
@@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void)
 		floppy_device[drive].name = floppy_device_name;
 		floppy_device[drive].id = drive;
 		floppy_device[drive].dev.release = floppy_device_release;
+		floppy_device[drive].dev.groups = floppy_dev_groups;
 
 		err = platform_device_register(&floppy_device[drive]);
 		if (err)
 			goto out_remove_drives;
 
-		err = device_create_file(&floppy_device[drive].dev,
-					 &dev_attr_cmos);
-		if (err)
-			goto out_unreg_platform_dev;
-
 		/* to be cleaned up... */
 		disks[drive]->private_data = (void *)(long)drive;
 		disks[drive]->flags |= GENHD_FL_REMOVABLE;
@@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void)
 
 	return 0;
 
-out_unreg_platform_dev:
-	platform_device_unregister(&floppy_device[drive]);
 out_remove_drives:
 	while (drive--) {
 		if (floppy_available(drive)) {
 			del_gendisk(disks[drive]);
-			device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
 			platform_device_unregister(&floppy_device[drive]);
 		}
 	}
@@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void)
 
 		if (floppy_available(drive)) {
 			del_gendisk(disks[drive]);
-			device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
 			platform_device_unregister(&floppy_device[drive]);
 		}
 		blk_cleanup_queue(disks[drive]->queue);
-- 
cgit v1.2.3-59-g8ed1b

From 2c561246524c3319473bf47b558354f7ff47f0cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 3 Feb 2015 12:55:31 +0100
Subject: block: Simplify bsg complete all

It took me a few tries to figure out what this code did; let's rewrite
it into a more regular form.

The thing that makes this one 'special' is the BSG_F_BLOCK flag: if it
is not set, we're not supposed (or allowed) to block and should
spin-wait for completion.

In the spinning case, the (new) io_wait_event() will never see a false
condition, and we will therefore not block.

Cc: Linus Torvalds
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Jens Axboe
---
 block/bsg.c          | 72 ++++++++++++++++++----------------------------------
 include/linux/wait.h | 15 +++++++++++
 2 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/block/bsg.c b/block/bsg.c
index 276e869e686c..d214e929ce18 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -136,42 +136,6 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index)
 	return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
 }
 
-static int bsg_io_schedule(struct bsg_device *bd)
-{
-	DEFINE_WAIT(wait);
-	int ret = 0;
-
-	spin_lock_irq(&bd->lock);
-
-	BUG_ON(bd->done_cmds > bd->queued_cmds);
-
-	/*
-	 * -ENOSPC or -ENODATA? I'm going for -ENODATA, meaning "I have no
-	 * work to do", even though we return -ENOSPC after this same test
-	 * during bsg_write() -- there, it means our buffer can't have more
-	 * bsg_commands added to it, thus has no space left.
-	 */
-	if (bd->done_cmds == bd->queued_cmds) {
-		ret = -ENODATA;
-		goto unlock;
-	}
-
-	if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
-		ret = -EAGAIN;
-		goto unlock;
-	}
-
-	prepare_to_wait(&bd->wq_done, &wait, TASK_UNINTERRUPTIBLE);
-	spin_unlock_irq(&bd->lock);
-	io_schedule();
-	finish_wait(&bd->wq_done, &wait);
-
-	return ret;
-unlock:
-	spin_unlock_irq(&bd->lock);
-	return ret;
-}
-
 static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 				struct sg_io_v4 *hdr, struct bsg_device *bd,
 				fmode_t has_write_perm)
@@ -482,6 +446,30 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 	return ret;
 }
 
+static bool bsg_complete(struct bsg_device *bd)
+{
+	bool ret = false;
+	bool spin;
+
+	do {
+		spin_lock_irq(&bd->lock);
+
+		BUG_ON(bd->done_cmds > bd->queued_cmds);
+
+		/*
+		 * All commands consumed.
+		 */
+		if (bd->done_cmds == bd->queued_cmds)
+			ret = true;
+
+		spin = !test_bit(BSG_F_BLOCK, &bd->flags);
+
+		spin_unlock_irq(&bd->lock);
+	} while (!ret && spin);
+
+	return ret;
+}
+
 static int bsg_complete_all_commands(struct bsg_device *bd)
 {
 	struct bsg_command *bc;
@@ -492,17 +480,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
 	/*
 	 * wait for all commands to complete
 	 */
-	ret = 0;
-	do {
-		ret = bsg_io_schedule(bd);
-		/*
-		 * look for -ENODATA specifically -- we'll sometimes get
-		 * -ERESTARTSYS when we've taken a signal, but we can't
-		 * return until we're done freeing the queue, so ignore
-		 * it. The signal will get handled when we're done freeing
-		 * the bsg_device.
-		 */
-	} while (ret != -ENODATA);
+	io_wait_event(bd->wq_done, bsg_complete(bd));
 
 	/*
 	 * discard done commands
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2232ed16635a..71fc1d31e48d 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -267,6 +267,21 @@ do {									\
 		__wait_event(wq, condition);				\
 } while (0)
 
+#define __io_wait_event(wq, condition)					\
+	(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
+			    io_schedule())
+
+/*
+ * io_wait_event() -- like wait_event() but with io_schedule()
+ */
+#define io_wait_event(wq, condition)					\
+do {									\
+	might_sleep();							\
+	if (condition)							\
+		break;							\
+	__io_wait_event(wq, condition);					\
+} while (0)
+
 #define __wait_event_freezable(wq, condition)				\
 	___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0,		\
 		      schedule(); try_to_freeze())
-- 
cgit v1.2.3-59-g8ed1b

From bcadb699b05b828fb43c54abb75607db7e8c3e29 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne
Date: Fri, 23 Jan 2015 17:14:29 +0100
Subject: xen-blkback,xen-blkfront: add myself as maintainer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I've done quite a lot of work in blkfront/blkback, and I usually end up
looking at the patches, so add myself as maintainer together with
Konrad.

Signed-off-by: Roger Pau Monné
Acked-by: Konrad Rzeszutek Wilk
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index bb5e510e977e..6660beb96e61 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10574,6 +10574,7 @@ F:	drivers/pci/*xen*
 
 XEN BLOCK SUBSYSTEM
 M:	Konrad Rzeszutek Wilk
+M:	Roger Pau Monné
 L:	xen-devel@lists.xenproject.org (moderated for non-subscribers)
 S:	Supported
 F:	drivers/block/xen-blkback/*
-- 
cgit v1.2.3-59-g8ed1b

From 3bb8c98e5612f069010ad04e5f463389e2eb6563 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne
Date: Mon, 2 Feb 2015 11:28:21 +0000
Subject: xen-blkfront: fix accounting of reqs when migrating
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current migration code uses blk_put_request() to finish a request
before requeuing it. That function doesn't update the queue statistics,
which completely breaks the accounting. Use blk_end_request_all()
instead, which updates the statistics properly.
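For illustration, a minimal sketch of the salvage-and-complete pattern at
issue, assuming the v3.19-era block API; the helper name and the bio_list
bookkeeping here are made up for the example, not part of the patch:

#include <linux/blkdev.h>
#include <linux/bio.h>

/*
 * Hypothetical helper: detach the bios from a request so they can be
 * resubmitted later, then complete the request through the block layer.
 * blk_end_request_all() finishes the request via the normal completion
 * path, so the queue's in-flight statistics are updated;
 * blk_put_request() would only drop the reference and leave the
 * accounting inconsistent.
 */
static void salvage_and_finish(struct request *req, struct bio_list *bios)
{
	struct bio_list merge;

	if (req->bio) {
		merge.head = req->bio;
		merge.tail = req->biotail;
		bio_list_merge(bios, &merge);
		req->bio = NULL;
	}

	/* complete with success; the detached bios live on in *bios */
	blk_end_request_all(req, 0);
}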
Signed-off-by: Roger Pau Monné
Reported-and-Tested-by: Ouyang Zhaowei (Charles)
Cc: Konrad Rzeszutek Wilk
Cc: Boris Ostrovsky
Cc: David Vrabel
Cc: xen-devel@lists.xenproject.org
---
 drivers/block/xen-blkfront.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2236c6f31608..7f66d2e08f19 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info)
 		merge_bio.tail = copy[i].request->biotail;
 		bio_list_merge(&bio_list, &merge_bio);
 		copy[i].request->bio = NULL;
-		blk_put_request(copy[i].request);
+		blk_end_request_all(copy[i].request, 0);
 	}
 
 	kfree(copy);
@@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info)
 		req->bio = NULL;
 		if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
 			pr_alert("diskcache flush request found!\n");
-		__blk_put_request(info->rq, req);
+		__blk_end_request_all(req, 0);
 	}
 
 	spin_unlock_irq(&info->io_lock);
-- 
cgit v1.2.3-59-g8ed1b

From b042a3ca949053231950a1b15f31cccca9e305f3 Mon Sep 17 00:00:00 2001
From: David Vrabel
Date: Thu, 5 Feb 2015 17:09:56 +0000
Subject: xen-blkback: default to X86_32 ABI on x86
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prior to the existence of 64-bit backends using the X86_64 ABI,
frontends used the X86_32 ABI. These old frontends do not specify the
ABI and, when used with a 64-bit backend, do not work.

On x86, default to the X86_32 ABI if one is not specified. Backends on
ARM continue to default to their NATIVE ABI.

Signed-off-by: David Vrabel
Acked-by: Roger Pau Monné
---
 drivers/block/xen-blkback/common.h | 9 +++++++++
 drivers/block/xen-blkback/xenbus.c | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index f65b807e3236..78b0411e5ed5 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -214,6 +214,15 @@ enum blkif_protocol {
 	BLKIF_PROTOCOL_X86_64 = 3,
 };
 
+/*
+ * Default protocol if the frontend doesn't specify one.
+ */
+#ifdef CONFIG_X86
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
+#else
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
+#endif
+
 struct xen_vbd {
 	/* What the domain refers to this vbd as. */
 	blkif_vdev_t		handle;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 630a489e757d..e3afe97280b1 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be)
 		return err;
 	}
 
-	be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
 	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
 			    "%63s", protocol, NULL);
 	if (err)
-		strcpy(protocol, "unspecified, assuming native");
+		strcpy(protocol, "unspecified, assuming default");
 	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
 		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
 	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
-- 
cgit v1.2.3-59-g8ed1b
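As a closing usage note on the io_wait_event() helper introduced in the
bsg patch above, here is a hedged sketch of the intended pattern; the
device structure and function names are hypothetical:

#include <linux/wait.h>
#include <linux/spinlock.h>

/* Hypothetical device state, for illustration only. */
struct frob_dev {
	wait_queue_head_t wq;
	spinlock_t lock;
	unsigned int queued;
	unsigned int done;
};

/* Completion side: account the finished command and wake the waiter. */
static void frob_complete_one(struct frob_dev *d)
{
	unsigned long flags;

	spin_lock_irqsave(&d->lock, flags);
	d->done++;
	spin_unlock_irqrestore(&d->lock, flags);

	wake_up(&d->wq);
}

/*
 * Waiting side: like wait_event(), but the sleep goes through
 * io_schedule(), so the time is charged as iowait. The condition is
 * re-evaluated after every wakeup; an unlocked read is acceptable
 * here, as with any wait_event() condition paired with a wake_up().
 */
static void frob_drain(struct frob_dev *d)
{
	io_wait_event(d->wq, d->done == d->queued);
}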