aboutsummaryrefslogtreecommitdiffstats
path: root/block/blk-mq.c
diff options
context:
space:
mode:
authorJianchao Wang <jianchao.w.wang@oracle.com>2018-08-21 15:15:03 +0800
committerJens Axboe <axboe@kernel.dk>2018-08-21 09:02:55 -0600
commitd48ece209f82c9ce07be942441b53d3fa3664936 (patch)
treeddf957d9d4edf1915558bc528fee6696cc02f1ed /block/blk-mq.c
parentblock: remove duplicate initialization (diff)
downloadlinux-dev-d48ece209f82c9ce07be942441b53d3fa3664936.tar.xz
linux-dev-d48ece209f82c9ce07be942441b53d3fa3664936.zip
blk-mq: init hctx sched after update ctx and hctx mapping
Currently, when updating nr_hw_queues, the IO scheduler's init_hctx is invoked before the mapping between ctx and hctx has been updated correctly by blk_mq_map_swqueue. The IO scheduler's init_hctx (kyber) may depend on this mapping, get a wrong result, and finally panic. A simple way to fix this is to switch the IO scheduler to 'none' before updating nr_hw_queues, and then switch it back after nr_hw_queues has been updated. blk_mq_sched_init_/exit_hctx are removed because nobody uses them any more. Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--block/blk-mq.c92
1 files changed, 84 insertions, 8 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5efd789910e2..9c8c8c71a13f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2147,8 +2147,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
if (set->ops->exit_request)
set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
- blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
-
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@@ -2216,12 +2214,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
goto free_bitmap;
- if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
- goto exit_hctx;
-
hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
if (!hctx->fq)
- goto sched_exit_hctx;
+ goto exit_hctx;
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
goto free_fq;
@@ -2235,8 +2230,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
free_fq:
kfree(hctx->fq);
- sched_exit_hctx:
- blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@@ -2898,10 +2891,81 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return ret;
}
+/*
+ * A request_queue / elevator_type pair.
+ * It is used only by __blk_mq_update_nr_hw_queues, to remember the
+ * elevator_type a request_queue had before it was switched to 'none'.
+ */
+struct blk_mq_qe_pair {
+ struct list_head node; /* link in the list passed around as 'head' */
+ struct request_queue *q; /* queue whose elevator_type is cached */
+ struct elevator_type *type; /* q->elevator->type at the time of caching */
+};
+
+/*
+ * Cache q's elevator_type in a qe pair on the 'head' list and switch q's io
+ * scheduler to 'none'. Returns false only if allocating the pair fails.
+ */
+static bool blk_mq_elv_switch_none(struct list_head *head,
+ struct request_queue *q)
+{
+ struct blk_mq_qe_pair *qe;
+
+ if (!q->elevator) /* no elevator attached: nothing to cache or switch */
+ return true;
+
+ qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
+ if (!qe)
+ return false;
+
+ INIT_LIST_HEAD(&qe->node);
+ qe->q = q;
+ qe->type = q->elevator->type; /* NOTE(review): q->elevator is read before sysfs_lock is taken below - confirm it cannot change concurrently */
+ list_add(&qe->node, head);
+
+ mutex_lock(&q->sysfs_lock);
+ /*
+ * After elevator_switch_mq, the previous elevator_queue will be
+ * released by elevator_release, which also puts the io scheduler
+ * module reference taken by elevator_get. So we need to take a
+ * reference on the io scheduler module here to prevent it from
+ * being removed while its type is still cached in the qe pair.
+ */
+ __module_get(qe->type->elevator_owner);
+ elevator_switch_mq(q, NULL); /* a NULL type selects 'none' */
+ mutex_unlock(&q->sysfs_lock);
+
+ return true;
+}
+
+static void blk_mq_elv_switch_back(struct list_head *head,
+ struct request_queue *q)
+{ /* Switch q back to the elevator_type cached for it on 'head', if any, and free that pair. */
+ struct blk_mq_qe_pair *qe;
+ struct elevator_type *t = NULL; /* stays NULL when 'head' holds no pair for q */
+
+ list_for_each_entry(qe, head, node) /* look up the pair cached for q */
+ if (qe->q == q) {
+ t = qe->type;
+ break;
+ }
+
+ if (!t) /* nothing was cached for q, so there is nothing to switch back */
+ return;
+
+ list_del(&qe->node); /* the break above left qe on the matching pair */
+ kfree(qe);
+
+ mutex_lock(&q->sysfs_lock);
+ elevator_switch_mq(q, t); /* NOTE(review): presumably consumes the module reference taken in blk_mq_elv_switch_none - confirm */
+ mutex_unlock(&q->sysfs_lock);
+}
+
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
struct request_queue *q;
+ LIST_HEAD(head);
lockdep_assert_held(&set->tag_list_lock);
@@ -2912,6 +2976,14 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue(q);
+ /*
+ * Switch IO scheduler to 'none', cleaning up the data associated
+ * with the previous scheduler. We will switch back once we are done
+ * updating the new sw to hw queue mappings.
+ */
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ if (!blk_mq_elv_switch_none(&head, q))
+ goto switch_back;
set->nr_hw_queues = nr_hw_queues;
blk_mq_update_queue_map(set);
@@ -2920,6 +2992,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
blk_mq_queue_reinit(q);
}
+switch_back:
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_elv_switch_back(&head, q);
+
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_unfreeze_queue(q);
}