blk-mq: avoid starving tag allocation after allocating process migrates

When the allocation process is scheduled back and the mapped hw queue is changed, fake one extra wake up on previous queue for compensating wake up miss, so other allocations on the previous queue won't be starved. This patch fixes one request allocation hang issue, which can be triggered easily in case of very low nr_request. The race is as follows: 1) 2 hw queues, nr_requests are 2, and wake_batch is one 2) there are 3 waiters on hw queue 0 3) two in-flight requests in hw queue 0 are completed, and only two waiters of 3 are waken up because of wake_batch, but both the two waiters can be scheduled to another CPU and cause to switch to hw queue 1 4) then the 3rd waiter will wait for ever, since no in-flight request is in hw queue 0 any more. 5) this patch fixes it by the fake wakeup when waiter is scheduled to another hw queue Cc: <stable@vger.kernel.org> Reviewed-by: Omar Sandoval <osandov@fb.com> Signed-off-by: Ming Lei <ming.lei@redhat.com> Modified commit message to make it clearer, and make it apply on top of the 4.18 branch. Signed-off-by: Jens Axboe <axboe@kernel.dk>
author: Ming Lei <ming.lei@redhat.com> 2018-05-24 11:00:39 -0600
committer: Jens Axboe <axboe@kernel.dk> 2018-05-24 11:00:39 -0600
commit: e6fc46498784e799d3eb95d83079180e413c4e7d (patch)
tree: 63876dd6517d7d170d90d71441392ebdcdff3ea6 /lib
parent: bdi: Move cgroup bdi_writeback to a dedicated low concurrency workqueue (diff)
download: linux-dev-e6fc46498784e799d3eb95d83079180e413c4e7d.tar.xz
linux-dev-e6fc46498784e799d3eb95d83079180e413c4e7d.zip
1 files changed, 15 insertions, 14 deletions
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index e6d7d610778d..6fdc6267f4a8 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -352,8 +352,9 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
 	if (sbq->wake_batch != wake_batch) {
 		WRITE_ONCE(sbq->wake_batch, wake_batch);
 		/*
-		 * Pairs with the memory barrier in sbq_wake_up() to ensure that
-		 * the batch size is updated before the wait counts.
+		 * Pairs with the memory barrier in sbitmap_queue_wake_up()
+		 * to ensure that the batch size is updated before the wait
+		 * counts.
 		 */
 		smp_mb__before_atomic();
 		for (i = 0; i < SBQ_WAIT_QUEUES; i++)
@@ -463,15 +464,6 @@ static bool __sbq_wake_up(struct sbitmap_queue *sbq)
 	unsigned int wake_batch;
 	int wait_cnt;
 
-	/*
-	 * Pairs with the memory barrier in set_current_state() to ensure the
-	 * proper ordering of clear_bit()/waitqueue_active() in the waker and
-	 * test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
-	 * waiter. See the comment on waitqueue_active(). This is __after_atomic
-	 * because we just did clear_bit_unlock() in the caller.
-	 */
-	smp_mb__after_atomic();
-
 	ws = sbq_wake_ptr(sbq);
 	if (!ws)
 		return false;
@@ -507,17 +499,26 @@ static bool __sbq_wake_up(struct sbitmap_queue *sbq)
 	return false;
 }
 
-static void sbq_wake_up(struct sbitmap_queue *sbq)
+void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
 {
 	while (__sbq_wake_up(sbq))
 		;
 }
+EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
 	sbitmap_clear_bit_unlock(&sbq->sb, nr);
-	sbq_wake_up(sbq);
+	/*
+	 * Pairs with the memory barrier in set_current_state() to ensure the
+	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
+	 * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
+	 * waiter. See the comment on waitqueue_active().
+	 */
+	smp_mb__after_atomic();
+	sbitmap_queue_wake_up(sbq);
+
 	if (likely(!sbq->round_robin && nr < sbq->sb.depth))
 		*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
 }
@@ -529,7 +530,7 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
 
 	/*
 	 * Pairs with the memory barrier in set_current_state() like in
-	 * sbq_wake_up().
+	 * sbitmap_queue_wake_up().
 	 */
 	smp_mb();
 	wake_index = atomic_read(&sbq->wake_index);
author	Ming Lei <ming.lei@redhat.com>	2018-05-24 11:00:39 -0600
committer	Jens Axboe <axboe@kernel.dk>	2018-05-24 11:00:39 -0600
commit	e6fc46498784e799d3eb95d83079180e413c4e7d (patch)
tree	63876dd6517d7d170d90d71441392ebdcdff3ea6 /lib
parent	bdi: Move cgroup bdi_writeback to a dedicated low concurrency workqueue (diff)
download	linux-dev-e6fc46498784e799d3eb95d83079180e413c4e7d.tar.xz linux-dev-e6fc46498784e799d3eb95d83079180e413c4e7d.zip