From 3aef3cae4342c1d8137a1c0782cbb66f1be3943c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 3 Mar 2019 09:14:01 -0700 Subject: block: add a req_bvec helper Return the currently active bvec segment, potentially spanning multiple pages. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- include/linux/blkdev.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c58a3b2bf00..84ce76f92d83 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -932,6 +932,17 @@ static inline unsigned int blk_rq_payload_bytes(struct request *rq) return blk_rq_bytes(rq); } +/* + * Return the first full biovec in the request. The caller needs to check that + * there are any bvecs before calling this helper. + */ +static inline struct bio_vec req_bvec(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return rq->special_vec; + return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); +} + static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, int op) { -- cgit v1.3-8-gc7d7 From 2a876f5e25e8ec9fa5777d36e5695ee33dd63f6f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 3 Mar 2019 08:38:29 -0700 Subject: block: add a rq_integrity_vec helper This provides a nice little shortcut to get the integrity data for drivers like NVMe that only support a single integrity segment. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- include/linux/blkdev.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 84ce76f92d83..3a13fbe13e08 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1559,6 +1559,17 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, return bio_integrity_intervals(bi, sectors) * bi->tuple_size; } +/* + * Return the first bvec that contains integrity data. Only drivers that are + * limited to a single integrity segment should use this helper. + */ +static inline struct bio_vec *rq_integrity_vec(struct request *rq) +{ + if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1)) + return NULL; + return rq->bio->bi_integrity->bip_vec; +} + #else /* CONFIG_BLK_DEV_INTEGRITY */ struct bio; @@ -1633,6 +1644,11 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, return 0; } +static inline struct bio_vec *rq_integrity_vec(struct request *rq) +{ + return NULL; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ struct block_device_operations { -- cgit v1.3-8-gc7d7 From 9d9de535f385a8b3ba0e88ca0abf386c5704bbfc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 3 Mar 2019 08:18:30 -0700 Subject: block: add a rq_dma_dir helper In a lot of places we want to know the DMA direction for a given struct request. Add a little helper to make it a littler easier. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3a13fbe13e08..74469a4dc0a1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -641,6 +641,9 @@ static inline bool blk_account_rq(struct request *rq) #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) +#define rq_dma_dir(rq) \ + (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) + static inline bool queue_is_mq(struct request_queue *q) { return q->mq_ops; -- cgit v1.3-8-gc7d7 From 3ab3a0313cb8c50391d74e40fd46a3408d8e4de9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 3 Mar 2019 08:40:36 -0700 Subject: block: add dma_map_bvec helper Provide a nice little shortcut for mapping a single bvec. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- include/linux/blkdev.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 74469a4dc0a1..4b85dc066264 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -644,6 +644,10 @@ static inline bool blk_account_rq(struct request *rq) #define rq_dma_dir(rq) \ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) +#define dma_map_bvec(dev, bv, dir, attrs) \ + dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ + (dir), (attrs)) + static inline bool queue_is_mq(struct request_queue *q) { return q->mq_ops; -- cgit v1.3-8-gc7d7 From 12adb7a013e318de553ccee4a006a718667972b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 30 Apr 2019 13:56:16 -0400 Subject: block: remove the unused blk_queue_dma_pad function Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-settings.c | 16 ---------------- include/linux/blkdev.h | 1 - 2 files changed, 17 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-settings.c b/block/blk-settings.c index ec150f88db09..3facc41476be 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -663,22 +663,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, } EXPORT_SYMBOL(disk_stack_limits); -/** - * blk_queue_dma_pad - set pad mask - * @q: the request queue for the device - * @mask: pad mask - * - * Set dma pad mask. - * - * Appending pad buffer to a request modifies the last entry of a - * scatter list such that it includes the pad buffer. - **/ -void blk_queue_dma_pad(struct request_queue *q, unsigned int mask) -{ - q->dma_pad_mask = mask; -} -EXPORT_SYMBOL(blk_queue_dma_pad); - /** * blk_queue_update_dma_pad - update pad mask * @q: the request queue for the device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99aa98f60b9e..bd3e3f09bfa0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1069,7 +1069,6 @@ extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); -extern void blk_queue_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, -- cgit v1.3-8-gc7d7 From 2f8f1336a48bd5186de3476da0a3e2ec06d0533a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 30 Apr 2019 09:52:27 +0800 Subject: blk-mq: always free hctx after request queue is freed In normal queue cleanup path, hctx is released after request queue is freed, see blk_mq_release(). However, in __blk_mq_update_nr_hw_queues(), hctx may be freed because of hw queues shrinking. This way is easy to cause use-after-free, because: one implicit rule is that it is safe to call almost all block layer APIs if the request queue is alive; and one hctx may be retrieved by one API, then the hctx can be freed by blk_mq_update_nr_hw_queues(); finally use-after-free is triggered. Fixes this issue by always freeing hctx after releasing request queue. If some hctxs are removed in blk_mq_update_nr_hw_queues(), introduce a per-queue list to hold them, then try to resuse these hctxs if numa node is matched. Cc: Dongli Zhang Cc: James Smart Cc: Bart Van Assche Cc: linux-scsi@vger.kernel.org, Cc: Martin K . Petersen , Cc: Christoph Hellwig , Cc: James E . J . Bottomley , Reviewed-by: Hannes Reinecke Tested-by: James Smart Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 46 +++++++++++++++++++++++++++++++++------------- include/linux/blk-mq.h | 2 ++ include/linux/blkdev.h | 7 +++++++ 3 files changed, 42 insertions(+), 13 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-mq.c b/block/blk-mq.c index 17e63d80b6d6..08a6248d8536 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2269,6 +2269,10 @@ static void blk_mq_exit_hctx(struct request_queue *q, set->ops->exit_hctx(hctx, hctx_idx); blk_mq_remove_cpuhp(hctx); + + spin_lock(&q->unused_hctx_lock); + list_add(&hctx->hctx_list, &q->unused_hctx_list); + spin_unlock(&q->unused_hctx_lock); } static void blk_mq_exit_hw_queues(struct request_queue *q, @@ -2351,6 +2355,8 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, hctx->queue = q; hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; + INIT_LIST_HEAD(&hctx->hctx_list); + /* * Allocate space for all possible cpus to avoid allocation at * runtime @@ -2664,15 +2670,17 @@ static int blk_mq_alloc_ctxs(struct request_queue *q) */ void blk_mq_release(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; + struct blk_mq_hw_ctx *hctx, *next; + int i; cancel_delayed_work_sync(&q->requeue_work); - /* hctx kobj stays in hctx */ - queue_for_each_hw_ctx(q, hctx, i) { - if (!hctx) - continue; + queue_for_each_hw_ctx(q, hctx, i) + WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); + + /* all hctx are in .unused_hctx_list now */ + list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { + list_del_init(&hctx->hctx_list); kobject_put(&hctx->kobj); } @@ -2739,9 +2747,22 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx = NULL, *tmp; - hctx = blk_mq_alloc_hctx(q, set, node); + /* reuse dead hctx first */ + spin_lock(&q->unused_hctx_lock); + list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { + if (tmp->numa_node == node) { + hctx = tmp; + break; + } + } + if (hctx) + list_del_init(&hctx->hctx_list); + spin_unlock(&q->unused_hctx_lock); + + if (!hctx) + hctx = blk_mq_alloc_hctx(q, set, node); if (!hctx) goto fail; @@ -2779,10 +2800,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); if (hctx) { - if (hctxs[i]) { + if (hctxs[i]) blk_mq_exit_hctx(q, set, hctxs[i], i); - kobject_put(&hctxs[i]->kobj); - } hctxs[i] = hctx; } else { if (hctxs[i]) @@ -2813,9 +2832,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, if (hctx->tags) blk_mq_free_map_and_requests(set, j); blk_mq_exit_hctx(q, set, hctx, j); - kobject_put(&hctx->kobj); hctxs[j] = NULL; - } } mutex_unlock(&q->sysfs_lock); @@ -2858,6 +2875,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (!q->queue_hw_ctx) goto err_sys_init; + INIT_LIST_HEAD(&q->unused_hctx_list); + spin_lock_init(&q->unused_hctx_lock); + blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index db29928de467..15d1aa53d96c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -70,6 +70,8 @@ struct blk_mq_hw_ctx { struct dentry *sched_debugfs_dir; #endif + struct list_head hctx_list; + /* Must be the last member - see also blk_mq_hw_ctx_size(). */ struct srcu_struct srcu[0]; }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bd3e3f09bfa0..1aafeb923e7b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -535,6 +535,13 @@ struct request_queue { struct mutex sysfs_lock; + /* + * for reusing dead hctx instance in case of updating + * nr_hw_queues + */ + struct list_head unused_hctx_list; + spinlock_t unused_hctx_lock; + atomic_t mq_freeze_depth; #if defined(CONFIG_BLK_DEV_BSG) -- cgit v1.3-8-gc7d7 From 7996a8b5511a72465b0b286763c2d8f412b8874a Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 21 May 2019 11:25:55 +0800 Subject: blk-mq: fix hang caused by freeze/unfreeze sequence The following is a description of a hang in blk_mq_freeze_queue_wait(). The hang happens on attempt to freeze a queue while another task does queue unfreeze. The root cause is an incorrect sequence of percpu_ref_resurrect() and percpu_ref_kill() and as a result those two can be swapped: CPU#0 CPU#1 ---------------- ----------------- q1 = blk_mq_init_queue(shared_tags) q2 = blk_mq_init_queue(shared_tags): blk_mq_add_queue_tag_set(shared_tags): blk_mq_update_tag_set_depth(shared_tags): list_for_each_entry() blk_mq_freeze_queue(q1) > percpu_ref_kill() > blk_mq_freeze_queue_wait() blk_cleanup_queue(q1) blk_mq_freeze_queue(q1) > percpu_ref_kill() ^^^^^^ freeze_depth can't guarantee the order blk_mq_unfreeze_queue() > percpu_ref_resurrect() > blk_mq_freeze_queue_wait() ^^^^^^ Hang here!!!! This wrong sequence raises kernel warning: percpu_ref_kill_and_confirm called more than once on blk_queue_usage_counter_release! WARNING: CPU: 0 PID: 11854 at lib/percpu-refcount.c:336 percpu_ref_kill_and_confirm+0x99/0xb0 But the most unpleasant effect is a hang of a blk_mq_freeze_queue_wait(), which waits for a zero of a q_usage_counter, which never happens because percpu-ref was reinited (instead of being killed) and stays in PERCPU state forever. How to reproduce: - "insmod null_blk.ko shared_tags=1 nr_devices=0 queue_mode=2" - cpu0: python Script.py 0; taskset the corresponding process running on cpu0 - cpu1: python Script.py 1; taskset the corresponding process running on cpu1 Script.py: ------ #!/usr/bin/python3 import os import sys while True: on = "echo 1 > /sys/kernel/config/nullb/%s/power" % sys.argv[1] off = "echo 0 > /sys/kernel/config/nullb/%s/power" % sys.argv[1] os.system(on) os.system(off) ------ This bug was first reported and fixed by Roman, previous discussion: [1] Message id: 1443287365-4244-7-git-send-email-akinobu.mita@gmail.com [2] Message id: 1443563240-29306-6-git-send-email-tj@kernel.org [3] https://patchwork.kernel.org/patch/9268199/ Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Roman Pen Signed-off-by: Bob Liu Signed-off-by: Jens Axboe --- block/blk-core.c | 3 ++- block/blk-mq.c | 19 ++++++++++--------- include/linux/blkdev.h | 7 ++++++- 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index 419d600e6637..1bf83a0df0f6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -413,7 +413,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) smp_rmb(); wait_event(q->mq_freeze_wq, - (atomic_read(&q->mq_freeze_depth) == 0 && + (!q->mq_freeze_depth && (pm || (blk_pm_request_resume(q), !blk_queue_pm_only(q)))) || blk_queue_dying(q)); @@ -503,6 +503,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); + mutex_init(&q->mq_freeze_lock); /* * Init percpu_ref in atomic mode so that it's faster to shutdown. diff --git a/block/blk-mq.c b/block/blk-mq.c index 08a6248d8536..32b8ad3d341b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -144,13 +144,14 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, void blk_freeze_queue_start(struct request_queue *q) { - int freeze_depth; - - freeze_depth = atomic_inc_return(&q->mq_freeze_depth); - if (freeze_depth == 1) { + mutex_lock(&q->mq_freeze_lock); + if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); + mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); + } else { + mutex_unlock(&q->mq_freeze_lock); } } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -199,14 +200,14 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { - int freeze_depth; - - freeze_depth = atomic_dec_return(&q->mq_freeze_depth); - WARN_ON_ONCE(freeze_depth < 0); - if (!freeze_depth) { + mutex_lock(&q->mq_freeze_lock); + q->mq_freeze_depth--; + WARN_ON_ONCE(q->mq_freeze_depth < 0); + if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } + mutex_unlock(&q->mq_freeze_lock); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1aafeb923e7b..592669bcc536 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -542,7 +542,7 @@ struct request_queue { struct list_head unused_hctx_list; spinlock_t unused_hctx_lock; - atomic_t mq_freeze_depth; + int mq_freeze_depth; #if defined(CONFIG_BLK_DEV_BSG) struct bsg_class_device bsg_dev; @@ -554,6 +554,11 @@ struct request_queue { #endif struct rcu_head rcu_head; wait_queue_head_t mq_freeze_wq; + /* + * Protect concurrent access to q_usage_counter by + * percpu_ref_kill() and percpu_ref_reinit(). + */ + struct mutex mq_freeze_lock; struct percpu_ref q_usage_counter; struct blk_mq_tag_set *tag_set; -- cgit v1.3-8-gc7d7