From b93af3055d6f32d3b0361cfdb110c9399c1241ba Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 27 Jul 2021 17:32:53 +0800 Subject: blk-mq-sched: Fix blk_mq_sched_alloc_tags() error handling If the blk_mq_sched_alloc_tags() -> blk_mq_alloc_rqs() call fails, then we call blk_mq_sched_free_tags() -> blk_mq_free_rqs(). It is incorrect to do so, as any rqs would have already been freed in the blk_mq_alloc_rqs() call. Fix by calling blk_mq_free_rq_map() only directly. Fixes: 6917ff0b5bd41 ("blk-mq-sched: refactor scheduler initialization") Signed-off-by: John Garry Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1627378373-148090-1-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c838d81ac058..0f006cabfd91 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -515,17 +515,6 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, percpu_ref_put(&q->q_usage_counter); } -static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - if (hctx->sched_tags) { - blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); - blk_mq_free_rq_map(hctx->sched_tags, set->flags); - hctx->sched_tags = NULL; - } -} - static int blk_mq_sched_alloc_tags(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) @@ -539,8 +528,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, return -ENOMEM; ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); - if (ret) - blk_mq_sched_free_tags(set, hctx, hctx_idx); + if (ret) { + blk_mq_free_rq_map(hctx->sched_tags, set->flags); + hctx->sched_tags = NULL; + } return ret; } -- cgit v1.2.3-59-g8ed1b From 5ab189cf3abbc9994bae3be524c5b88589ed56e2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Jul 2021 14:38:09 -1000 Subject: blk-iocost: fix operation ordering in iocg_wake_fn() iocg_wake_fn() open-codes wait_queue_entry removal and wakeup because it wants the wq_entry to be always removed whether it ended up waking the task or not. finish_wait() tests whether wq_entry needs removal without grabbing the wait_queue lock and expects the waker to use list_del_init_careful() after all waking operations are complete, which iocg_wake_fn() didn't do. The operation order was wrong and the regular list_del_init() was used. The result is that if a waiter wakes up racing the waker, it can free pop the wq_entry off stack before the waker is still looking at it, which can lead to a backtrace like the following. [7312084.588951] general protection fault, probably for non-canonical address 0x586bf4005b2b88: 0000 [#1] SMP ... [7312084.647079] RIP: 0010:queued_spin_lock_slowpath+0x171/0x1b0 ... [7312084.858314] Call Trace: [7312084.863548] _raw_spin_lock_irqsave+0x22/0x30 [7312084.872605] try_to_wake_up+0x4c/0x4f0 [7312084.880444] iocg_wake_fn+0x71/0x80 [7312084.887763] __wake_up_common+0x71/0x140 [7312084.895951] iocg_kick_waitq+0xe8/0x2b0 [7312084.903964] ioc_rqos_throttle+0x275/0x650 [7312084.922423] __rq_qos_throttle+0x20/0x30 [7312084.930608] blk_mq_make_request+0x120/0x650 [7312084.939490] generic_make_request+0xca/0x310 [7312084.957600] submit_bio+0x173/0x200 [7312084.981806] swap_readpage+0x15c/0x240 [7312084.989646] read_swap_cache_async+0x58/0x60 [7312084.998527] swap_cluster_readahead+0x201/0x320 [7312085.023432] swapin_readahead+0x2df/0x450 [7312085.040672] do_swap_page+0x52f/0x820 [7312085.058259] handle_mm_fault+0xa16/0x1420 [7312085.066620] do_page_fault+0x2c6/0x5c0 [7312085.074459] page_fault+0x2f/0x40 Fix it by switching to list_del_init_careful() and putting it at the end. Signed-off-by: Tejun Heo Reported-by: Rik van Riel Fixes: 7caa47151ab2 ("blkcg: implement blk-iocost") Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Jens Axboe --- block/blk-iocost.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index c2d6bc88d3f1..5fac3757e6e0 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1440,16 +1440,17 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, return -1; iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); + wait->committed = true; /* * autoremove_wake_function() removes the wait entry only when it - * actually changed the task state. We want the wait always - * removed. Remove explicitly and use default_wake_function(). + * actually changed the task state. We want the wait always removed. + * Remove explicitly and use default_wake_function(). Note that the + * order of operations is important as finish_wait() tests whether + * @wq_entry is removed without grabbing the lock. */ - list_del_init(&wq_entry->entry); - wait->committed = true; - default_wake_function(wq_entry, mode, flags, key); + list_del_init_careful(&wq_entry->entry); return 0; } -- cgit v1.2.3-59-g8ed1b From 340e84573878b2b9d63210482af46883366361b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:54 +0200 Subject: block: delay freeing the gendisk blkdev_get_no_open acquires a reference to the block_device through the block device inode and then tries to acquire a device model reference to the gendisk. But at this point the disk migh already be freed (although the race is free). Fix this by only freeing the gendisk from the whole device bdevs ->free_inode callback as well. Fixes: 22ae8ce8b892 ("block: simplify bdev/disk lookup in blkdev_get") Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210722075402.983367-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 3 +-- fs/block_dev.c | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index af4d2ab4a633..298ee78c1bda 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1079,10 +1079,9 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); - bdput(disk->part0); if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) blk_put_queue(disk->queue); - kfree(disk); + bdput(disk->part0); /* frees the disk */ } struct class block_class = { .name = "block", diff --git a/fs/block_dev.c b/fs/block_dev.c index ca8bf1869ca8..a38b0f33211c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -812,6 +812,8 @@ static void bdev_free_inode(struct inode *inode) free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); + if (!bdev_is_partition(bdev)) + kfree(bdev->bd_disk); kmem_cache_free(bdev_cachep, BDEV_I(inode)); } -- cgit v1.2.3-59-g8ed1b