From 531ed6261e7466907418b1a9971a5c71d7d250e4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 18 Jun 2014 11:21:08 -0400
Subject: blk-mq: fix a memory ordering bug in blk_mq_queue_enter()

blk-mq uses a percpu_counter to keep track of how many usages are in
flight.  The percpu_counter is drained while freezing to ensure that
no usage is left in-flight after freezing is complete.

blk_mq_queue_enter/exit() and blk_mq_[un]freeze_queue() implement this
per-cpu gating mechanism; unfortunately, it contains a subtle bug -
smp_wmb() in blk_mq_queue_enter() doesn't prevent the cpu from
fetching @q->bypass_depth before incrementing @q->mq_usage_counter and
if freezing happens in between, the caller can slip through and freezing
can complete while there are active users.

Use smp_mb() instead so that bypass_depth and mq_usage_counter
modifications and tests are properly interlocked.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Nicholas A. Bellinger <nab@linux-iscsi.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ad69ef657e85..9541f5111ba6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -81,7 +81,7 @@ static int blk_mq_queue_enter(struct request_queue *q)
 	int ret;
 
 	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
-	smp_wmb();
+	smp_mb();
 
 	/* we have problems freezing the queue if it's initializing */
 	if (!blk_queue_dying(q) &&
-- 
cgit v1.2.3-59-g8ed1b


From 776687bce42bb22cce48b5da950e48ebbb9a948f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Jul 2014 10:29:17 -0600
Subject: block, blk-mq: draining can't be skipped even if bypass_depth was
 non-zero

Currently, both blk_queue_bypass_start() and blk_mq_freeze_queue()
skip queue draining if bypass_depth was already above zero.  The
assumption is that the one which bumped the bypass_depth should have
performed draining already; however, there's nothing which prevents a
new instance of bypassing/freezing from starting before the previous
one finishes draining.  The current code may allow the later
bypassing/freezing instances to complete while there still are
in-flight requests which haven't finished draining.

Fix it by draining regardless of bypass_depth.  We still skip draining
from blk_queue_bypass_start() while the queue is initializing to avoid
introducing excessive delays during boot.  INIT_DONE setting is moved
above the initial blk_queue_bypass_end() so that bypassing attempts
can't slip in between.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Nicholas A. Bellinger <nab@linux-iscsi.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c  | 11 +++++++----
 block/blk-mq.c    |  7 ++-----
 block/blk-sysfs.c |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6f8dba161bfe..0d0bdd65b2d7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -438,14 +438,17 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
  */
 void blk_queue_bypass_start(struct request_queue *q)
 {
-	bool drain;
-
 	spin_lock_irq(q->queue_lock);
-	drain = !q->bypass_depth++;
+	q->bypass_depth++;
 	queue_flag_set(QUEUE_FLAG_BYPASS, q);
 	spin_unlock_irq(q->queue_lock);
 
-	if (drain) {
+	/*
+	 * Queues start drained.  Skip actual draining till init is
+	 * complete.  This avoids lengthy delays during queue init which
+	 * can happen many times during boot.
+	 */
+	if (blk_queue_init_done(q)) {
 		spin_lock_irq(q->queue_lock);
 		__blk_drain_queue(q, false);
 		spin_unlock_irq(q->queue_lock);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9541f5111ba6..f4bdddd7ed99 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -131,15 +131,12 @@ void blk_mq_drain_queue(struct request_queue *q)
  */
 static void blk_mq_freeze_queue(struct request_queue *q)
 {
-	bool drain;
-
 	spin_lock_irq(q->queue_lock);
-	drain = !q->bypass_depth++;
+	q->bypass_depth++;
 	queue_flag_set(QUEUE_FLAG_BYPASS, q);
 	spin_unlock_irq(q->queue_lock);
 
-	if (drain)
-		blk_mq_drain_queue(q);
+	blk_mq_drain_queue(q);
 }
 
 static void blk_mq_unfreeze_queue(struct request_queue *q)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 23321fbab293..4db5abf96b9e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -554,8 +554,8 @@ int blk_register_queue(struct gendisk *disk)
 	 * Initialization must be complete by now.  Finish the initial
 	 * bypass from queue allocation.
 	 */
-	blk_queue_bypass_end(q);
 	queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
+	blk_queue_bypass_end(q);
 
 	ret = blk_trace_init_sysfs(dev);
 	if (ret)
-- 
cgit v1.2.3-59-g8ed1b


From 780db2071ac4d167ee4154ad9c96088f1bba044b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Jul 2014 10:31:13 -0600
Subject: blk-mq: decouple blk-mq freezing from generic bypassing

blk_mq freezing is entangled with generic bypassing which bypasses
blkcg and io scheduler and lets IO requests fall through the block
layer to the drivers in FIFO order.  This allows forward progress on
IOs with the advanced features disabled so that those features can be
configured or altered without worrying about stalling IO which may
lead to deadlock through memory allocation.

However, generic bypassing doesn't quite fit blk-mq.  blk-mq currently
doesn't make use of blkcg or ioscheds and it maps bypassing to
freezing, which blocks request processing and drains all the in-flight
ones.  This causes problems as bypassing assumes that request
processing is online.  blk-mq works around this by conditionally
allowing request processing for the problem case - during queue
initialization.

Another oddity is that except for during queue cleanup, bypassing
started on the generic side prevents blk-mq from processing new
requests but doesn't drain the in-flight ones.  This shouldn't break
anything but again highlights that something isn't quite right here.

The root cause is conflating blk-mq freezing and generic bypassing
which are two different mechanisms.  The only intersecting purpose
that they serve is during queue cleanup.  Let's properly separate
blk-mq freezing from generic bypassing and simply use it where
necessary.

* request_queue->mq_freeze_depth is added and
  blk_mq_[un]freeze_queue() now operate on this counter instead of
  ->bypass_depth.  The replacement for QUEUE_FLAG_BYPASS isn't added
  but the counter is tested directly.  This will be further updated by
  later changes.

* blk_mq_drain_queue() is dropped and "__" prefix is dropped from
  blk_mq_freeze_queue().  Queue cleanup path now calls
  blk_mq_freeze_queue() directly.

* blk_mq_queue_enter()'s fast path condition is simplified to just
  check @q->mq_freeze_depth.  Previously, the condition was

	!blk_queue_dying(q) &&
	    (!blk_queue_bypass(q) || !blk_queue_init_done(q))

  mq_freeze_depth is incremented right after dying is set and
  blk_queue_init_done() exception isn't necessary as blk-mq doesn't
  start frozen, which only leaves the blk_queue_bypass() test which
  can be replaced by @q->mq_freeze_depth test.

This change simplifies the code and reduces confusion in the area.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Nicholas A. Bellinger <nab@linux-iscsi.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  2 +-
 block/blk-mq.c         | 17 ++++++-----------
 block/blk-mq.h         |  2 +-
 include/linux/blkdev.h |  1 +
 4 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 0d0bdd65b2d7..c359d72e9d76 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -514,7 +514,7 @@ void blk_cleanup_queue(struct request_queue *q)
 	 * prevent that q->request_fn() gets invoked after draining finished.
 	 */
 	if (q->mq_ops) {
-		blk_mq_drain_queue(q);
+		blk_mq_freeze_queue(q);
 		spin_lock_irq(lock);
 	} else {
 		spin_lock_irq(lock);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f4bdddd7ed99..1e324a123d40 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -84,15 +84,14 @@ static int blk_mq_queue_enter(struct request_queue *q)
 	smp_mb();
 
 	/* we have problems freezing the queue if it's initializing */
-	if (!blk_queue_dying(q) &&
-	    (!blk_queue_bypass(q) || !blk_queue_init_done(q)))
+	if (!q->mq_freeze_depth)
 		return 0;
 
 	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
 
 	spin_lock_irq(q->queue_lock);
 	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
-		!blk_queue_bypass(q) || blk_queue_dying(q),
+		!q->mq_freeze_depth || blk_queue_dying(q),
 		*q->queue_lock);
 	/* inc usage with lock hold to avoid freeze_queue runs here */
 	if (!ret && !blk_queue_dying(q))
@@ -129,11 +128,10 @@ void blk_mq_drain_queue(struct request_queue *q)
  * Guarantee no request is in use, so we can change any data structure of
  * the queue afterward.
  */
-static void blk_mq_freeze_queue(struct request_queue *q)
+void blk_mq_freeze_queue(struct request_queue *q)
 {
 	spin_lock_irq(q->queue_lock);
-	q->bypass_depth++;
-	queue_flag_set(QUEUE_FLAG_BYPASS, q);
+	q->mq_freeze_depth++;
 	spin_unlock_irq(q->queue_lock);
 
 	blk_mq_drain_queue(q);
@@ -144,11 +142,8 @@ static void blk_mq_unfreeze_queue(struct request_queue *q)
 	bool wake = false;
 
 	spin_lock_irq(q->queue_lock);
-	if (!--q->bypass_depth) {
-		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
-		wake = true;
-	}
-	WARN_ON_ONCE(q->bypass_depth < 0);
+	wake = !--q->mq_freeze_depth;
+	WARN_ON_ONCE(q->mq_freeze_depth < 0);
 	spin_unlock_irq(q->queue_lock);
 	if (wake)
 		wake_up_all(&q->mq_freeze_wq);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 26460884c6cd..ca4964a6295d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -28,7 +28,7 @@ struct blk_mq_ctx {
 void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_init_flush(struct request_queue *q);
-void blk_mq_drain_queue(struct request_queue *q);
+void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 void blk_mq_clone_flush_request(struct request *flush_rq,
 		struct request *orig_rq);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8699bcf5f099..c8f344ff74fe 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -470,6 +470,7 @@ struct request_queue {
 	struct mutex		sysfs_lock;
 
 	int			bypass_depth;
+	int			mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
 	bsg_job_fn		*bsg_job_fn;
-- 
cgit v1.2.3-59-g8ed1b


From 72d6f02a8d4e0dda74de3a541b1c4ae82f5f7b45 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Jul 2014 10:33:02 -0600
Subject: blk-mq: collapse __blk_mq_drain_queue() into blk_mq_freeze_queue()

Keeping __blk_mq_drain_queue() as a separate function doesn't buy us
anything and it's gonna be further simplified.  Let's flatten it into
its caller.

This patch doesn't make any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Nicholas A. Bellinger <nab@linux-iscsi.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1e324a123d40..22682fb4be65 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -108,8 +108,16 @@ static void blk_mq_queue_exit(struct request_queue *q)
 	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
 }
 
-void blk_mq_drain_queue(struct request_queue *q)
+/*
+ * Guarantee no request is in use, so we can change any data structure of
+ * the queue afterward.
+ */
+void blk_mq_freeze_queue(struct request_queue *q)
 {
+	spin_lock_irq(q->queue_lock);
+	q->mq_freeze_depth++;
+	spin_unlock_irq(q->queue_lock);
+
 	while (true) {
 		s64 count;
 
@@ -124,19 +132,6 @@ void blk_mq_drain_queue(struct request_queue *q)
 	}
 }
 
-/*
- * Guarantee no request is in use, so we can change any data structure of
- * the queue afterward.
- */
-void blk_mq_freeze_queue(struct request_queue *q)
-{
-	spin_lock_irq(q->queue_lock);
-	q->mq_freeze_depth++;
-	spin_unlock_irq(q->queue_lock);
-
-	blk_mq_drain_queue(q);
-}
-
 static void blk_mq_unfreeze_queue(struct request_queue *q)
 {
 	bool wake = false;
-- 
cgit v1.2.3-59-g8ed1b


From add703fda981b9719d37f371498b9f129acbd997 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Jul 2014 10:34:38 -0600
Subject: blk-mq: use percpu_ref for mq usage count

Currently, blk-mq uses a percpu_counter to keep track of how many
usages are in flight.  The percpu_counter is drained while freezing to
ensure that no usage is left in-flight after freezing is complete.
blk_mq_queue_enter/exit() and blk_mq_[un]freeze_queue() implement this
per-cpu gating mechanism.

This type of code has relatively high chance of subtle bugs which are
extremely difficult to trigger and it's way too hairy to be open coded
in blk-mq.  percpu_ref can serve the same purpose after the recent
changes.  This patch replaces the open-coded per-cpu usage counting
and draining mechanism with percpu_ref.

blk_mq_queue_enter() performs tryget_live on the ref and exit()
performs put.  blk_mq_freeze_queue() kills the ref and waits until the
reference count reaches zero.  blk_mq_unfreeze_queue() revives the ref
and wakes up the waiters.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Nicholas A. Bellinger <nab@linux-iscsi.org>
Cc: Kent Overstreet <kmo@daterainc.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 68 +++++++++++++++++++++-----------------------------
 include/linux/blkdev.h |  3 ++-
 2 files changed, 31 insertions(+), 40 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 22682fb4be65..5189cb1e478a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -78,34 +78,32 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 
 static int blk_mq_queue_enter(struct request_queue *q)
 {
-	int ret;
-
-	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
-	smp_mb();
-
-	/* we have problems freezing the queue if it's initializing */
-	if (!q->mq_freeze_depth)
-		return 0;
-
-	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+	while (true) {
+		int ret;
 
-	spin_lock_irq(q->queue_lock);
-	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
-		!q->mq_freeze_depth || blk_queue_dying(q),
-		*q->queue_lock);
-	/* inc usage with lock hold to avoid freeze_queue runs here */
-	if (!ret && !blk_queue_dying(q))
-		__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
-	else if (blk_queue_dying(q))
-		ret = -ENODEV;
-	spin_unlock_irq(q->queue_lock);
+		if (percpu_ref_tryget_live(&q->mq_usage_counter))
+			return 0;
 
-	return ret;
+		ret = wait_event_interruptible(q->mq_freeze_wq,
+				!q->mq_freeze_depth || blk_queue_dying(q));
+		if (blk_queue_dying(q))
+			return -ENODEV;
+		if (ret)
+			return ret;
+	}
 }
 
 static void blk_mq_queue_exit(struct request_queue *q)
 {
-	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+	percpu_ref_put(&q->mq_usage_counter);
+}
+
+static void blk_mq_usage_counter_release(struct percpu_ref *ref)
+{
+	struct request_queue *q =
+		container_of(ref, struct request_queue, mq_usage_counter);
+
+	wake_up_all(&q->mq_freeze_wq);
 }
 
 /*
@@ -118,18 +116,9 @@ void blk_mq_freeze_queue(struct request_queue *q)
 	q->mq_freeze_depth++;
 	spin_unlock_irq(q->queue_lock);
 
-	while (true) {
-		s64 count;
-
-		spin_lock_irq(q->queue_lock);
-		count = percpu_counter_sum(&q->mq_usage_counter);
-		spin_unlock_irq(q->queue_lock);
-
-		if (count == 0)
-			break;
-		blk_mq_start_hw_queues(q);
-		msleep(10);
-	}
+	percpu_ref_kill(&q->mq_usage_counter);
+	blk_mq_run_queues(q, false);
+	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 }
 
 static void blk_mq_unfreeze_queue(struct request_queue *q)
@@ -140,8 +129,10 @@ static void blk_mq_unfreeze_queue(struct request_queue *q)
 	wake = !--q->mq_freeze_depth;
 	WARN_ON_ONCE(q->mq_freeze_depth < 0);
 	spin_unlock_irq(q->queue_lock);
-	if (wake)
+	if (wake) {
+		percpu_ref_reinit(&q->mq_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
+	}
 }
 
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
@@ -1785,7 +1776,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	if (!q)
 		goto err_hctxs;
 
-	if (percpu_counter_init(&q->mq_usage_counter, 0))
+	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release))
 		goto err_map;
 
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
@@ -1878,7 +1869,7 @@ void blk_mq_free_queue(struct request_queue *q)
 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
 	blk_mq_free_hw_queues(q, set);
 
-	percpu_counter_destroy(&q->mq_usage_counter);
+	percpu_ref_exit(&q->mq_usage_counter);
 
 	free_percpu(q->queue_ctx);
 	kfree(q->queue_hw_ctx);
@@ -2037,8 +2028,7 @@ static int __init blk_mq_init(void)
 {
 	blk_mq_cpu_init();
 
-	/* Must be called after percpu_counter_hotcpu_callback() */
-	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
+	hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
 
 	return 0;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c8f344ff74fe..518b46555b80 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -21,6 +21,7 @@
 #include <linux/bsg.h>
 #include <linux/smp.h>
 #include <linux/rcupdate.h>
+#include <linux/percpu-refcount.h>
 
 #include <asm/scatterlist.h>
 
@@ -484,7 +485,7 @@ struct request_queue {
 #endif
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
-	struct percpu_counter	mq_usage_counter;
+	struct percpu_ref	mq_usage_counter;
 	struct list_head	all_q_node;
 
 	struct blk_mq_tag_set	*tag_set;
-- 
cgit v1.2.3-59-g8ed1b


From cbcd1054a1fd2aa980fc11ff28e436fc4aaa2d54 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Tue, 1 Jul 2014 10:36:47 -0600
Subject: bio-integrity: add "bip_max_vcnt" into struct bio_integrity_payload

Commit 08778795 ("block: Fix nr_vecs for inline integrity vectors") from
Martin introduced the function bip_integrity_vecs() (which returns the
number of usable vectors) to fix the nr_vecs issue for inline integrity
vectors that was reported by David Milburn.

But it seems that bip_integrity_vecs() returns the wrong number if the
bio is not based on any bio_set (bio->bi_pool == NULL), because in that
case bip_inline_vecs[] is allocated directly.  So add bip_max_vcnt to
record the number of vector slots, and remove the function
bip_integrity_vecs().

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Kent Overstreet <kmo@daterainc.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio-integrity.c | 12 +++---------
 include/linux/bio.h   |  1 +
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'block')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 9e241063a616..bc423f7b02da 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -70,8 +70,10 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 					  bs->bvec_integrity_pool);
 		if (!bip->bip_vec)
 			goto err;
+		bip->bip_max_vcnt = bvec_nr_vecs(idx);
 	} else {
 		bip->bip_vec = bip->bip_inline_vecs;
+		bip->bip_max_vcnt = inline_vecs;
 	}
 
 	bip->bip_slab = idx;
@@ -114,14 +116,6 @@ void bio_integrity_free(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_integrity_free);
 
-static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
-{
-	if (bip->bip_slab == BIO_POOL_NONE)
-		return BIP_INLINE_VECS;
-
-	return bvec_nr_vecs(bip->bip_slab);
-}
-
 /**
  * bio_integrity_add_page - Attach integrity metadata
  * @bio:	bio to update
@@ -137,7 +131,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct bio_vec *iv;
 
-	if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
+	if (bip->bip_vcnt >= bip->bip_max_vcnt) {
 		printk(KERN_ERR "%s: bip_vec full\n", __func__);
 		return 0;
 	}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d2633ee099d9..b39e5000ff58 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -308,6 +308,7 @@ struct bio_integrity_payload {
 
 	unsigned short		bip_slab;	/* slab the bip came from */
 	unsigned short		bip_vcnt;	/* # of integrity bio_vecs */
+	unsigned short		bip_max_vcnt;	/* integrity bio_vec slots */
 	unsigned		bip_owns_buf:1;	/* should free bip_buf */
 
 	struct work_struct	bip_work;	/* I/O completion */
-- 
cgit v1.2.3-59-g8ed1b


From 472d5e2af28cc6ae9749ce22f951ea426fdfc392 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Thu, 12 Jun 2014 19:45:17 +0200
Subject: block/partitions/aix.c: replace count*size kzalloc by kcalloc

kcalloc manages count*sizeof overflow.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/partitions/aix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/partitions/aix.c b/block/partitions/aix.c
index 43be471d9b1d..0a6ed546331d 100644
--- a/block/partitions/aix.c
+++ b/block/partitions/aix.c
@@ -215,7 +215,7 @@ int aix_partition(struct parsed_partitions *state)
 		numlvs = be16_to_cpu(p->numlvs);
 		put_dev_sector(sect);
 	}
-	lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL);
+	lvip = kcalloc(state->limit, sizeof(struct lv_info), GFP_KERNEL);
 	if (!lvip)
 		return 0;
 	if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) {
-- 
cgit v1.2.3-59-g8ed1b


From 600ffc5ead7fd0bc5b9950842b2aece26682b0e0 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Thu, 12 Jun 2014 20:04:52 +0200
Subject: block/partitions/amiga.c: replace nolevel printk by pr_err

Also add a no-prefix pr_fmt so that any future change to the default
format does not alter these messages.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/partitions/amiga.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'block')

diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c
index 70cbf44a1560..2b13533d60a2 100644
--- a/block/partitions/amiga.c
+++ b/block/partitions/amiga.c
@@ -7,6 +7,8 @@
  *  Re-organised Feb 1998 Russell King
  */
 
+#define pr_fmt(fmt) fmt
+
 #include <linux/types.h>
 #include <linux/affs_hardblocks.h>
 
@@ -40,7 +42,7 @@ int amiga_partition(struct parsed_partitions *state)
 		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
-				printk("Dev %s: unable to read RDB block %d\n",
+				pr_err("Dev %s: unable to read RDB block %d\n",
 				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
@@ -57,12 +59,12 @@ int amiga_partition(struct parsed_partitions *state)
 		*(__be32 *)(data+0xdc) = 0;
 		if (checksum_block((__be32 *)data,
 				be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
-			printk("Warning: Trashed word at 0xd0 in block %d "
-				"ignored in checksum calculation\n",blk);
+			pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n",
+			       blk);
 			break;
 		}
 
-		printk("Dev %s: RDB in block %d has bad checksum\n",
+		pr_err("Dev %s: RDB in block %d has bad checksum\n",
 		       bdevname(state->bdev, b), blk);
 	}
 
@@ -83,7 +85,7 @@ int amiga_partition(struct parsed_partitions *state)
 		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
-				printk("Dev %s: unable to read partition block %d\n",
+				pr_err("Dev %s: unable to read partition block %d\n",
 				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
-- 
cgit v1.2.3-59-g8ed1b


From dce14c239ad550816d07f7441fc10a2d5739fd29 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Thu, 12 Jun 2014 20:16:57 +0200
Subject: block/partitions/msdos.c: code clean-up

checkpatch fixing:
WARNING: Missing a blank line after declarations
WARNING: space prohibited between function name and open parenthesis '('
ERROR: spaces required around that '<' (ctx:VxV)

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/partitions/msdos.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'block')

diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 9123f250b425..93e7c1b32edd 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -159,8 +159,9 @@ static void parse_extended(struct parsed_partitions *state,
 		/*
 		 * First process the data partition(s)
 		 */
-		for (i=0; i<4; i++, p++) {
+		for (i = 0; i < 4; i++, p++) {
 			sector_t offs, size, next;
+
 			if (!nr_sects(p) || is_extended_partition(p))
 				continue;
 
@@ -194,7 +195,7 @@ static void parse_extended(struct parsed_partitions *state,
 		 * It should be a link to the next logical partition.
 		 */
 		p -= 4;
-		for (i=0; i<4; i++, p++)
+		for (i = 0; i < 4; i++, p++)
 			if (nr_sects(p) && is_extended_partition(p))
 				break;
 		if (i == 4)
@@ -243,8 +244,8 @@ static void parse_solaris_x86(struct parsed_partitions *state,
 		return;
 	}
 	/* Ensure we can handle previous case of VTOC with 8 entries gracefully */
-	max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
-	for (i=0; i<max_nparts && state->next<state->limit; i++) {
+	max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
+	for (i = 0; i < max_nparts && state->next < state->limit; i++) {
 		struct solaris_x86_slice *s = &v->v_slice[i];
 		char tmp[3 + 10 + 1 + 1];
 
@@ -409,7 +410,7 @@ static void parse_minix(struct parsed_partitions *state,
 	/* The first sector of a Minix partition can have either
 	 * a secondary MBR describing its subpartitions, or
 	 * the normal boot sector. */
-	if (msdos_magic_present (data + 510) &&
+	if (msdos_magic_present(data + 510) &&
 	    SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
 		char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
 
@@ -527,6 +528,7 @@ int msdos_partition(struct parsed_partitions *state)
 	for (slot = 1 ; slot <= 4 ; slot++, p++) {
 		sector_t start = start_sect(p)*sector_size;
 		sector_t size = nr_sects(p)*sector_size;
+
 		if (!size)
 			continue;
 		if (is_extended_partition(p)) {
@@ -537,6 +539,7 @@ int msdos_partition(struct parsed_partitions *state)
 			 * sector, although it may not be enough/proper.
 			 */
 			sector_t n = 2;
+
 			n = min(size, max(sector_size, n));
 			put_partition(state, slot, start, n);
 
-- 
cgit v1.2.3-59-g8ed1b


From 16e1556526241b893d40b01d1c1b14a4e83ee499 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Thu, 12 Jun 2014 20:26:01 +0200
Subject: block/partitions/efi.c: kerneldoc fixing

Adding function documentation and fixing kerneldoc warnings
('field: description' uniformization).

Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/partitions/efi.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

(limited to 'block')

diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index dc51f467a560..56d08fd75b1a 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -121,7 +121,7 @@ __setup("gpt", force_gpt_fn);
 /**
  * efi_crc32() - EFI version of crc32 function
  * @buf: buffer to calculate crc32 of
- * @len - length of buf
+ * @len: length of buf
  *
  * Description: Returns EFI-style CRC32 value for @buf
  * 
@@ -240,10 +240,10 @@ done:
 
 /**
  * read_lba(): Read bytes from disk, starting at given LBA
- * @state
- * @lba
- * @buffer
- * @size_t
+ * @state: disk parsed partitions
+ * @lba: the Logical Block Address of the partition table
+ * @buffer: destination buffer
+ * @count: bytes to read
  *
  * Description: Reads @count bytes from @state->bdev into @buffer.
  * Returns number of bytes read on success, 0 on error.
@@ -277,8 +277,8 @@ static size_t read_lba(struct parsed_partitions *state,
 
 /**
  * alloc_read_gpt_entries(): reads partition entries from disk
- * @state
- * @gpt - GPT header
+ * @state: disk parsed partitions
+ * @gpt: GPT header
  * 
  * Description: Returns ptes on success,  NULL on error.
  * Allocates space for PTEs based on information found in @gpt.
@@ -312,8 +312,8 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
 
 /**
  * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @state
- * @lba is the Logical Block Address of the partition table
+ * @state: disk parsed partitions
+ * @lba: the Logical Block Address of the partition table
  * 
  * Description: returns GPT header on success, NULL on error.   Allocates
  * and fills a GPT header starting at @ from @state->bdev.
@@ -340,10 +340,10 @@ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
 
 /**
  * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @state
- * @lba is the logical block address of the GPT header to test
- * @gpt is a GPT header ptr, filled on return.
- * @ptes is a PTEs ptr, filled on return.
+ * @state: disk parsed partitions
+ * @lba: logical block address of the GPT header to test
+ * @gpt: GPT header ptr, filled on return.
+ * @ptes: PTEs ptr, filled on return.
  *
  * Description: returns 1 if valid,  0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
@@ -461,8 +461,8 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
 
 /**
  * is_pte_valid() - tests one PTE for validity
- * @pte is the pte to check
- * @lastlba is last lba of the disk
+ * @pte: pte to check
+ * @lastlba: last lba of the disk
  *
  * Description: returns 1 if valid,  0 on error.
  */
@@ -478,9 +478,10 @@ is_pte_valid(const gpt_entry *pte, const u64 lastlba)
 
 /**
  * compare_gpts() - Search disk for valid GPT headers and PTEs
- * @pgpt is the primary GPT header
- * @agpt is the alternate GPT header
- * @lastlba is the last LBA number
+ * @pgpt: primary GPT header
+ * @agpt: alternate GPT header
+ * @lastlba: last LBA number
+ *
  * Description: Returns nothing.  Sanity checks pgpt and agpt fields
  * and prints warnings on discrepancies.
  * 
@@ -572,9 +573,10 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 
 /**
  * find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @state
- * @gpt is a GPT header ptr, filled on return.
- * @ptes is a PTEs ptr, filled on return.
+ * @state: disk parsed partitions
+ * @gpt: GPT header ptr, filled on return.
+ * @ptes: PTEs ptr, filled on return.
+ *
  * Description: Returns 1 if valid, 0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
  * Validity depends on PMBR being valid (or being overridden by the
@@ -663,7 +665,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
 
 /**
  * efi_partition(struct parsed_partitions *state)
- * @state
+ * @state: disk parsed partitions
  *
  * Description: called from check.c, if the disk contains GPT
  * partitions, sets up partition entries in the kernel.
-- 
cgit v1.2.3-59-g8ed1b


From 63f264965947ac6299452711f614f086955b2515 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Sun, 25 May 2014 21:43:33 +0900
Subject: block: fix BLKSECTGET ioctl when max_sectors is greater than
 USHRT_MAX

The BLKSECTGET ioctl stores the request queue's max_sectors as an
unsigned short value through the argument pointer.  So if max_sectors is
greater than USHRT_MAX, the upper 16 bits are simply discarded.

In such a case, USHRT_MAX is preferable to the lower 16 bits of
max_sectors.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: linux-scsi@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/compat_ioctl.c | 6 ++++--
 block/ioctl.c        | 5 ++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index fbd5a67cb773..e0393cd2ea7f 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -663,6 +663,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	fmode_t mode = file->f_mode;
 	struct backing_dev_info *bdi;
 	loff_t size;
+	unsigned int max_sectors;
 
 	/*
 	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
@@ -718,8 +719,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKSSZGET: /* get block device hardware sector size */
 		return compat_put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
-		return compat_put_ushort(arg,
-					 queue_max_sectors(bdev_get_queue(bdev)));
+		max_sectors = min_t(unsigned int, USHRT_MAX,
+				    queue_max_sectors(bdev_get_queue(bdev)));
+		return compat_put_ushort(arg, max_sectors);
 	case BLKROTATIONAL:
 		return compat_put_ushort(arg,
 					 !blk_queue_nonrot(bdev_get_queue(bdev)));
diff --git a/block/ioctl.c b/block/ioctl.c
index 7d5c3b20af45..d6cda8147c91 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -278,6 +278,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	struct backing_dev_info *bdi;
 	loff_t size;
 	int ret, n;
+	unsigned int max_sectors;
 
 	switch(cmd) {
 	case BLKFLSBUF:
@@ -375,7 +376,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKDISCARDZEROES:
 		return put_uint(arg, bdev_discard_zeroes_data(bdev));
 	case BLKSECTGET:
-		return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
+		max_sectors = min_t(unsigned int, USHRT_MAX,
+				    queue_max_sectors(bdev_get_queue(bdev)));
+		return put_ushort(arg, max_sectors);
 	case BLKROTATIONAL:
 		return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
 	case BLKRASET:
-- 
cgit v1.2.3-59-g8ed1b


From 9b4231bf995996d6459c57959ead5a1829ff2c57 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Sun, 25 May 2014 21:43:34 +0900
Subject: block: fix SG_[GS]ET_RESERVED_SIZE ioctl when max_sectors is huge

The SG_GET_RESERVED_SIZE and SG_SET_RESERVED_SIZE ioctls get and set the
reserved buffer size in bytes as an int.  The value needs to be capped at the request
queue's max_sectors.  But integer overflow is not correctly handled in
the calculation when converting max_sectors from sectors to bytes.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: linux-scsi@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/scsi_ioctl.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 14695c6221c8..bda1497add4c 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -82,9 +82,18 @@ static int sg_set_timeout(struct request_queue *q, int __user *p)
 	return err;
 }
 
+static int max_sectors_bytes(struct request_queue *q)
+{
+	unsigned int max_sectors = queue_max_sectors(q);
+
+	max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9);
+
+	return max_sectors << 9;
+}
+
 static int sg_get_reserved_size(struct request_queue *q, int __user *p)
 {
-	unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9);
+	int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q));
 
 	return put_user(val, p);
 }
@@ -98,10 +107,8 @@ static int sg_set_reserved_size(struct request_queue *q, int __user *p)
 
 	if (size < 0)
 		return -EINVAL;
-	if (size > (queue_max_sectors(q) << 9))
-		size = queue_max_sectors(q) << 9;
 
-	q->sg_reserved_size = size;
+	q->sg_reserved_size = min(size, max_sectors_bytes(q));
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 254c4407cb84a6dec90336054615b0f0e996bb7c Mon Sep 17 00:00:00 2001
From: Maurizio Lombardi <mlombard@redhat.com>
Date: Tue, 1 Jul 2014 10:55:15 -0600
Subject: bio: modify __bio_add_page() to accept pages that don't start a new
 segment

The original behaviour is to refuse to add a new page if the maximum
number of segments has been reached, regardless of whether the page we
are going to add can be merged into the last segment.

Unfortunately, when the system runs under heavy memory fragmentation
conditions, a driver may try to add multiple pages to the last segment.
The original code won't accept them and EBUSY will be reported to
userspace.

This patch modifies the function so it refuses to add a page only in case
the latter starts a new segment and the maximum number of segments has
already been reached.

The bug can be easily reproduced with the st driver:

1) set CONFIG_SCSI_MPT2SAS_MAX_SGE or CONFIG_SCSI_MPT3SAS_MAX_SGE  to 16
2) modprobe st buffer_kbs=1024
3) #dd if=/dev/zero of=/dev/st0 bs=1M count=10
   dd: error writing `/dev/st0': Device or resource busy

[ming.lei@canonical.com: update bi_iter.bi_size before recounting segments]
Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Tested-by: Dongsu Park <dongsu.park@profitbricks.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c | 52 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 23 deletions(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index 0ec61c9e536c..fb12df9af0fc 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -744,6 +744,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 				}
 			}
 
+			bio->bi_iter.bi_size += len;
 			goto done;
 		}
 
@@ -760,28 +761,31 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		return 0;
 
 	/*
-	 * we might lose a segment or two here, but rather that than
-	 * make this too complex.
+	 * setup the new entry, we might clear it again later if we
+	 * cannot add the page
+	 */
+	bvec = &bio->bi_io_vec[bio->bi_vcnt];
+	bvec->bv_page = page;
+	bvec->bv_len = len;
+	bvec->bv_offset = offset;
+	bio->bi_vcnt++;
+	bio->bi_phys_segments++;
+	bio->bi_iter.bi_size += len;
+
+	/*
+	 * Perform a recount if the number of segments is greater
+	 * than queue_max_segments(q).
 	 */
 
-	while (bio->bi_phys_segments >= queue_max_segments(q)) {
+	while (bio->bi_phys_segments > queue_max_segments(q)) {
 
 		if (retried_segments)
-			return 0;
+			goto failed;
 
 		retried_segments = 1;
 		blk_recount_segments(q, bio);
 	}
 
-	/*
-	 * setup the new entry, we might clear it again later if we
-	 * cannot add the page
-	 */
-	bvec = &bio->bi_io_vec[bio->bi_vcnt];
-	bvec->bv_page = page;
-	bvec->bv_len = len;
-	bvec->bv_offset = offset;
-
 	/*
 	 * if queue has other restrictions (eg varying max sector size
 	 * depending on offset), it can specify a merge_bvec_fn in the
@@ -799,23 +803,25 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 * merge_bvec_fn() returns number of bytes it can accept
 		 * at this offset
 		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
-			bvec->bv_page = NULL;
-			bvec->bv_len = 0;
-			bvec->bv_offset = 0;
-			return 0;
-		}
+		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len)
+			goto failed;
 	}
 
 	/* If we may be able to merge these biovecs, force a recount */
-	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
+	if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
 		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 
-	bio->bi_vcnt++;
-	bio->bi_phys_segments++;
  done:
-	bio->bi_iter.bi_size += len;
 	return len;
+
+ failed:
+	bvec->bv_page = NULL;
+	bvec->bv_len = 0;
+	bvec->bv_offset = 0;
+	bio->bi_vcnt--;
+	bio->bi_iter.bi_size -= len;
+	blk_recount_segments(q, bio);
+	return 0;
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 26a337944e73d88838642a09689fdf40daf00069 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 14 Jul 2014 22:04:47 +0200
Subject: Revert "bio: modify __bio_add_page() to accept pages that don't start
 a new segment"

This reverts commit 254c4407cb84a6dec90336054615b0f0e996bb7c.

It causes crashes with cryptsetup, even after a few iterations and
updates. Drop it for now.
---
 block/bio.c | 52 +++++++++++++++++++++++-----------------------------
 1 file changed, 23 insertions(+), 29 deletions(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index fb12df9af0fc..0ec61c9e536c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -744,7 +744,6 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 				}
 			}
 
-			bio->bi_iter.bi_size += len;
 			goto done;
 		}
 
@@ -761,31 +760,28 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		return 0;
 
 	/*
-	 * setup the new entry, we might clear it again later if we
-	 * cannot add the page
-	 */
-	bvec = &bio->bi_io_vec[bio->bi_vcnt];
-	bvec->bv_page = page;
-	bvec->bv_len = len;
-	bvec->bv_offset = offset;
-	bio->bi_vcnt++;
-	bio->bi_phys_segments++;
-	bio->bi_iter.bi_size += len;
-
-	/*
-	 * Perform a recount if the number of segments is greater
-	 * than queue_max_segments(q).
+	 * we might lose a segment or two here, but rather that than
+	 * make this too complex.
 	 */
 
-	while (bio->bi_phys_segments > queue_max_segments(q)) {
+	while (bio->bi_phys_segments >= queue_max_segments(q)) {
 
 		if (retried_segments)
-			goto failed;
+			return 0;
 
 		retried_segments = 1;
 		blk_recount_segments(q, bio);
 	}
 
+	/*
+	 * setup the new entry, we might clear it again later if we
+	 * cannot add the page
+	 */
+	bvec = &bio->bi_io_vec[bio->bi_vcnt];
+	bvec->bv_page = page;
+	bvec->bv_len = len;
+	bvec->bv_offset = offset;
+
 	/*
 	 * if queue has other restrictions (eg varying max sector size
 	 * depending on offset), it can specify a merge_bvec_fn in the
@@ -803,25 +799,23 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 * merge_bvec_fn() returns number of bytes it can accept
 		 * at this offset
 		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len)
-			goto failed;
+		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
+			bvec->bv_page = NULL;
+			bvec->bv_len = 0;
+			bvec->bv_offset = 0;
+			return 0;
+		}
 	}
 
 	/* If we may be able to merge these biovecs, force a recount */
-	if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
+	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
 		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 
+	bio->bi_vcnt++;
+	bio->bi_phys_segments++;
  done:
+	bio->bi_iter.bi_size += len;
 	return len;
-
- failed:
-	bvec->bv_page = NULL;
-	bvec->bv_len = 0;
-	bvec->bv_offset = 0;
-	bio->bi_vcnt--;
-	bio->bi_iter.bi_size -= len;
-	blk_recount_segments(q, bio);
-	return 0;
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 2a1b4cf2331d92bc009bf94fa02a24604cdaf24c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 5 Jul 2014 18:43:21 -0400
Subject: blkcg: don't call into policy draining if root_blkg is already gone

While a queue is being destroyed, all the blkgs are destroyed and its
->root_blkg pointer is set to NULL.  If someone else starts to drain
while the queue is in this state, the following oops happens.

  NULL pointer dereference at 0000000000000028
  IP: [<ffffffff8144e944>] blk_throtl_drain+0x84/0x230
  PGD e4a1067 PUD b773067 PMD 0
  Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
  Modules linked in: cfq_iosched(-) [last unloaded: cfq_iosched]
  CPU: 1 PID: 537 Comm: bash Not tainted 3.16.0-rc3-work+ #2
  Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  task: ffff88000e222250 ti: ffff88000efd4000 task.ti: ffff88000efd4000
  RIP: 0010:[<ffffffff8144e944>]  [<ffffffff8144e944>] blk_throtl_drain+0x84/0x230
  RSP: 0018:ffff88000efd7bf0  EFLAGS: 00010046
  RAX: 0000000000000000 RBX: ffff880015091450 RCX: 0000000000000001
  RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
  RBP: ffff88000efd7c10 R08: 0000000000000000 R09: 0000000000000001
  R10: ffff88000e222250 R11: 0000000000000000 R12: ffff880015091450
  R13: ffff880015092e00 R14: ffff880015091d70 R15: ffff88001508fc28
  FS:  00007f1332650740(0000) GS:ffff88001fa80000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: 0000000000000028 CR3: 0000000009446000 CR4: 00000000000006e0
  Stack:
   ffffffff8144e8f6 ffff880015091450 0000000000000000 ffff880015091d80
   ffff88000efd7c28 ffffffff8144ae2f ffff880015091450 ffff88000efd7c58
   ffffffff81427641 ffff880015091450 ffffffff82401f00 ffff880015091450
  Call Trace:
   [<ffffffff8144ae2f>] blkcg_drain_queue+0x1f/0x60
   [<ffffffff81427641>] __blk_drain_queue+0x71/0x180
   [<ffffffff81429b3e>] blk_queue_bypass_start+0x6e/0xb0
   [<ffffffff814498b8>] blkcg_deactivate_policy+0x38/0x120
   [<ffffffff8144ec44>] blk_throtl_exit+0x34/0x50
   [<ffffffff8144aea5>] blkcg_exit_queue+0x35/0x40
   [<ffffffff8142d476>] blk_release_queue+0x26/0xd0
   [<ffffffff81454968>] kobject_cleanup+0x38/0x70
   [<ffffffff81454848>] kobject_put+0x28/0x60
   [<ffffffff81427505>] blk_put_queue+0x15/0x20
   [<ffffffff817d07bb>] scsi_device_dev_release_usercontext+0x16b/0x1c0
   [<ffffffff810bc339>] execute_in_process_context+0x89/0xa0
   [<ffffffff817d064c>] scsi_device_dev_release+0x1c/0x20
   [<ffffffff817930e2>] device_release+0x32/0xa0
   [<ffffffff81454968>] kobject_cleanup+0x38/0x70
   [<ffffffff81454848>] kobject_put+0x28/0x60
   [<ffffffff817934d7>] put_device+0x17/0x20
   [<ffffffff817d11b9>] __scsi_remove_device+0xa9/0xe0
   [<ffffffff817d121b>] scsi_remove_device+0x2b/0x40
   [<ffffffff817d1257>] sdev_store_delete+0x27/0x30
   [<ffffffff81792ca8>] dev_attr_store+0x18/0x30
   [<ffffffff8126f75e>] sysfs_kf_write+0x3e/0x50
   [<ffffffff8126ea87>] kernfs_fop_write+0xe7/0x170
   [<ffffffff811f5e9f>] vfs_write+0xaf/0x1d0
   [<ffffffff811f69bd>] SyS_write+0x4d/0xc0
   [<ffffffff81d24692>] system_call_fastpath+0x16/0x1b

776687bce42b ("block, blk-mq: draining can't be skipped even if
bypass_depth was non-zero") made it easier to trigger this bug by
making blk_queue_bypass_start() drain even when it loses the first
bypass test to blk_cleanup_queue(); however, the bug has always been
there even before the commit as blk_queue_bypass_start() could race
against queue destruction, win the initial bypass test but perform the
actual draining after blk_cleanup_queue() already destroyed all blkgs.

Fix it by skipping the call into policy draining if all the blkgs are
already gone.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Shirish Pargaonkar <spargaonkar@suse.com>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Reported-by: Jet Chen <jet.chen@intel.com>
Cc: stable@vger.kernel.org
Tested-by: Shirish Pargaonkar <spargaonkar@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b9f4cc494ece..28d227c5ca77 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -872,6 +872,13 @@ void blkcg_drain_queue(struct request_queue *q)
 {
 	lockdep_assert_held(q->queue_lock);
 
+	/*
+	 * @q could be exiting and already have destroyed all blkgs as
+	 * indicated by NULL root_blkg.  If so, don't confuse policies.
+	 */
+	if (!q->root_blkg)
+		return;
+
 	blk_throtl_drain(q);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From d97a86c170b4e432f76db072a827fe30b4d6f659 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 5 Aug 2014 11:09:59 +0300
Subject: partitions: aix.c: off by one bug

The lvip[] array has "state->limit" elements so the condition here
should be >= instead of >.

Fixes: 6ceea22bbbc8 ('partitions: add aix lvm partition support files')
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Philippe De Muyter <phdm@macqel.be>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/partitions/aix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/partitions/aix.c b/block/partitions/aix.c
index 0a6ed546331d..f3ed7b2d89bf 100644
--- a/block/partitions/aix.c
+++ b/block/partitions/aix.c
@@ -253,7 +253,7 @@ int aix_partition(struct parsed_partitions *state)
 				continue;
 			}
 			lv_ix = be16_to_cpu(p->lv_ix) - 1;
-			if (lv_ix > state->limit) {
+			if (lv_ix >= state->limit) {
 				cur_lv_ix = -1;
 				continue;
 			}
-- 
cgit v1.2.3-59-g8ed1b