From aa261f20589d894eb08b9a2b11c9672c548387cd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 25 Oct 2022 12:17:54 -0700 Subject: block: Constify most queue limits pointers Document which functions do not modify the queue limits. Reviewed-by: Ming Lei Cc: Christoph Hellwig Cc: Keith Busch Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20221025191755.1711437-3-bvanassche@acm.org Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'block/blk.h') diff --git a/block/blk.h b/block/blk.h index d6ea0d1a6db0..7f9e089ab1f7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -104,7 +104,7 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, return true; } -static inline bool __bvec_gap_to_prev(struct queue_limits *lim, +static inline bool __bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { return (offset & lim->virt_boundary_mask) || @@ -115,7 +115,7 @@ static inline bool __bvec_gap_to_prev(struct queue_limits *lim, * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. */ -static inline bool bvec_gap_to_prev(struct queue_limits *lim, +static inline bool bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { if (!lim->virt_boundary_mask) @@ -297,7 +297,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); static inline bool bio_may_exceed_limits(struct bio *bio, - struct queue_limits *lim) + const struct queue_limits *lim) { switch (bio_op(bio)) { case REQ_OP_DISCARD: @@ -320,8 +320,9 @@ static inline bool bio_may_exceed_limits(struct bio *bio, bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; } -struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim, - unsigned int *nr_segs); +struct bio *__bio_split_to_limits(struct bio *bio, + const struct queue_limits *lim, + unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, -- cgit v1.2.3-59-g8ed1b From 64b36075eb0e50af6f59047b5f698a9f2bb2b4fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2022 11:07:14 +0100 Subject: block: split elevator_switch Split an elevator_disable helper from elevator_switch for the case where we want to switch to no scheduler at all. This includes removing the pointless elevator_switch_mq helper and removing the switch to no schedule logic from blk_mq_init_sched. 
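In caller terms the split looks roughly like this (editorial sketch, not part of the patch; "want_none" is a hypothetical flag standing in for the "none" string check in elevator_change()):

	/*
	 * Editorial sketch of the resulting caller contract: a NULL
	 * elevator_type no longer flows through elevator_switch().  Both
	 * helpers expect q->sysfs_lock to be held and freeze/quiesce the
	 * queue internally.
	 */
	if (want_none) {
		if (q->elevator)
			elevator_disable(q);	/* tear down, leave no scheduler */
	} else {
		ret = elevator_switch(q, new_e);	/* new_e is never NULL */
	}
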
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221030100714.876891-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 7 ----- block/blk-mq.c | 2 +- block/blk.h | 1 + block/elevator.c | 77 ++++++++++++++++++++++++++-------------------------- 4 files changed, 40 insertions(+), 47 deletions(-) (limited to 'block/blk.h') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 68227240fdea..23d1a90fec42 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -564,13 +564,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) unsigned long i; int ret; - if (!e) { - blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); - q->elevator = NULL; - q->nr_requests = q->tag_set->queue_depth; - return 0; - } - /* * Default to double of smaller one between hw queue_depth and 128, * since we don't split into sync/async like the old code did. diff --git a/block/blk-mq.c b/block/blk-mq.c index 623e8a506539..a78538586a40 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4588,7 +4588,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head, __elevator_get(qe->type); qe->type = q->elevator->type; list_add(&qe->node, head); - elevator_switch(q, NULL); + elevator_disable(q); mutex_unlock(&q->sysfs_lock); return true; diff --git a/block/blk.h b/block/blk.h index 7f9e089ab1f7..f1398fb96cec 100644 --- a/block/blk.h +++ b/block/blk.h @@ -278,6 +278,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, void blk_insert_flush(struct request *rq); int elevator_switch(struct request_queue *q, struct elevator_type *new_e); +void elevator_disable(struct request_queue *q); void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index 5781f5d50bb8..800e0038be0d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -554,39 +554,6 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -static int elevator_switch_mq(struct request_queue *q, - struct elevator_type *new_e) -{ - int ret; - - lockdep_assert_held(&q->sysfs_lock); - - if (q->elevator) { - elv_unregister_queue(q); - elevator_exit(q); - } - - ret = blk_mq_init_sched(q, new_e); - if (ret) - goto out; - - if (new_e) { - ret = elv_register_queue(q, true); - if (ret) { - elevator_exit(q); - goto out; - } - } - - if (new_e) - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); - else - blk_add_trace_msg(q, "elv switch: none"); - -out: - return ret; -} - static inline bool elv_support_iosched(struct request_queue *q) { if (!queue_is_mq(q) || @@ -691,19 +658,51 @@ void elevator_init_mq(struct request_queue *q) */ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { - int err; + int ret; lockdep_assert_held(&q->sysfs_lock); blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); - err = elevator_switch_mq(q, new_e); + if (q->elevator) { + elv_unregister_queue(q); + elevator_exit(q); + } + ret = blk_mq_init_sched(q, new_e); + if (ret) + goto out_unfreeze; + + ret = elv_register_queue(q, true); + if (ret) { + elevator_exit(q); + goto out_unfreeze; + } + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + +out_unfreeze: blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); + return ret; +} + +void elevator_disable(struct request_queue *q) +{ + lockdep_assert_held(&q->sysfs_lock); - return err; + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + 
elv_unregister_queue(q); + elevator_exit(q); + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); + q->elevator = NULL; + q->nr_requests = q->tag_set->queue_depth; + blk_add_trace_msg(q, "elv switch: none"); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); } /* @@ -722,9 +721,9 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) * Special case for mq, turn off scheduling */ if (!strncmp(elevator_name, "none", 4)) { - if (!q->elevator) - return 0; - return elevator_switch(q, NULL); + if (q->elevator) + elevator_disable(q); + return 0; } if (q->elevator && elevator_match(q->elevator->type, elevator_name)) -- cgit v1.2.3-59-g8ed1b From 80bd4a7aab4c9ce59bf5e35fdf52aa23d8a3c9f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:47 +0100 Subject: blk-mq: move the srcu_struct used for quiescing to the tagset All I/O submissions have fairly similar latencies, and a tagset-wide quiesce is a fairly common operation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Ming Lei Reviewed-by: Chao Leng Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-12-hch@lst.de [axboe: fix whitespace] Signed-off-by: Jens Axboe --- block/blk-core.c | 27 +++++---------------------- block/blk-mq.c | 33 +++++++++++++++++++++++++-------- block/blk-mq.h | 14 +++++++------- block/blk-sysfs.c | 9 ++------- block/blk.h | 9 +-------- block/genhd.c | 2 +- include/linux/blk-mq.h | 4 ++++ include/linux/blkdev.h | 9 --------- 8 files changed, 45 insertions(+), 62 deletions(-) (limited to 'block/blk.h') diff --git a/block/blk-core.c b/block/blk-core.c index 5d50dd16e2a5..e9e2bf15cd90 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -65,7 +65,6 @@ DEFINE_IDA(blk_queue_ida); * For queue allocation */ struct kmem_cache *blk_requestq_cachep; -struct kmem_cache *blk_requestq_srcu_cachep; /* * Controlling structure to kblockd @@ -373,26 +372,20 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) +struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; - q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu), - GFP_KERNEL | __GFP_ZERO, node_id); + q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, + node_id); if (!q) return NULL; - if (alloc_srcu) { - blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q); - if (init_srcu_struct(q->srcu) != 0) - goto fail_q; - } - q->last_merge = NULL; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); if (q->id < 0) - goto fail_srcu; + goto fail_q; q->stats = blk_alloc_queue_stats(); if (!q->stats) @@ -435,11 +428,8 @@ fail_stats: blk_free_queue_stats(q->stats); fail_id: ida_free(&blk_queue_ida, q->id); -fail_srcu: - if (alloc_srcu) - cleanup_srcu_struct(q->srcu); fail_q: - kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q); + kmem_cache_free(blk_requestq_cachep, q); return NULL; } @@ -1172,9 +1162,6 @@ int __init blk_dev_init(void) sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); - BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu), - __alignof__(struct request_queue)) != - sizeof(struct request_queue)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", @@ -1185,10 +1172,6 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", 
sizeof(struct request_queue), 0, SLAB_PANIC, NULL); - blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu", - sizeof(struct request_queue) + - sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL); - blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; diff --git a/block/blk-mq.c b/block/blk-mq.c index a03abadfe4c6..bee728dac9cd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -261,8 +261,8 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); */ void blk_mq_wait_quiesce_done(struct request_queue *q) { - if (blk_queue_has_srcu(q)) - synchronize_srcu(q->srcu); + if (q->tag_set->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(q->tag_set->srcu); else synchronize_rcu(); } @@ -4003,7 +4003,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING); + q = blk_alloc_queue(set->numa_node); if (!q) return ERR_PTR(-ENOMEM); q->queuedata = queuedata; @@ -4168,9 +4168,6 @@ static void blk_mq_update_poll_flag(struct request_queue *q) int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { - WARN_ON_ONCE(blk_queue_has_srcu(q) != - !!(set->flags & BLK_MQ_F_BLOCKING)); - /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -4429,8 +4426,18 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) - return -ENOMEM; + if (set->flags & BLK_MQ_F_BLOCKING) { + set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); + if (!set->srcu) + return -ENOMEM; + ret = init_srcu_struct(set->srcu); + if (ret) + goto out_free_srcu; + } + + ret = blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues); + if (ret) + goto out_cleanup_srcu; ret = -ENOMEM; for (i = 0; i < set->nr_maps; i++) { @@ -4460,6 +4467,12 @@ out_free_mq_map: } kfree(set->tags); set->tags = NULL; +out_cleanup_srcu: + if (set->flags & BLK_MQ_F_BLOCKING) + cleanup_srcu_struct(set->srcu); +out_free_srcu: + if (set->flags & BLK_MQ_F_BLOCKING) + kfree(set->srcu); return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); @@ -4499,6 +4512,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) kfree(set->tags); set->tags = NULL; + if (set->flags & BLK_MQ_F_BLOCKING) { + cleanup_srcu_struct(set->srcu); + kfree(set->srcu); + } } EXPORT_SYMBOL(blk_mq_free_tag_set); diff --git a/block/blk-mq.h b/block/blk-mq.h index 0b2870839cdd..ef59fee62780 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -377,17 +377,17 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ - if (!blk_queue_has_srcu(q)) { \ - rcu_read_lock(); \ - (dispatch_ops); \ - rcu_read_unlock(); \ - } else { \ + if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ - srcu_idx = srcu_read_lock((q)->srcu); \ + srcu_idx = srcu_read_lock((q)->tag_set->srcu); \ (dispatch_ops); \ - srcu_read_unlock((q)->srcu, srcu_idx); \ + srcu_read_unlock((q)->tag_set->srcu, srcu_idx); \ + } else { \ + rcu_read_lock(); \ + (dispatch_ops); \ + rcu_read_unlock(); \ } \ } while (0) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7b98c7074771..02e94c4beff1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -742,10 +742,8 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, static void 
blk_free_queue_rcu(struct rcu_head *rcu_head) { - struct request_queue *q = container_of(rcu_head, struct request_queue, - rcu_head); - - kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q); + kmem_cache_free(blk_requestq_cachep, + container_of(rcu_head, struct request_queue, rcu_head)); } /** @@ -782,9 +780,6 @@ static void blk_release_queue(struct kobject *kobj) if (queue_is_mq(q)) blk_mq_release(q); - if (blk_queue_has_srcu(q)) - cleanup_srcu_struct(q->srcu); - ida_free(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); } diff --git a/block/blk.h b/block/blk.h index f1398fb96cec..e85703ae81dd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -27,7 +27,6 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; -extern struct kmem_cache *blk_requestq_srcu_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; @@ -429,13 +428,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); -static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu) -{ - if (srcu) - return blk_requestq_srcu_cachep; - return blk_requestq_cachep; -} -struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu); +struct request_queue *blk_alloc_queue(int node_id); int disk_scan_partitions(struct gendisk *disk, fmode_t mode); diff --git a/block/genhd.c b/block/genhd.c index e7bd036024fa..09cde914e054 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1414,7 +1414,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(node, false); + q = blk_alloc_queue(node); if (!q) return NULL; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 569053ed959d..f059edebb11d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -7,6 +7,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -500,6 +501,8 @@ enum hctx_type { * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. + * @srcu: Use as lock when type of the request queue is blocking + * (BLK_MQ_F_BLOCKING). */ struct blk_mq_tag_set { struct blk_mq_queue_map map[HCTX_MAX_TYPES]; @@ -520,6 +523,7 @@ struct blk_mq_tag_set { struct mutex tag_list_lock; struct list_head tag_list; + struct srcu_struct *srcu; }; /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 32137d85c9ad..6a6fa167fc82 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -543,18 +542,11 @@ struct request_queue { struct mutex debugfs_mutex; bool mq_sysfs_init_done; - - /** - * @srcu: Sleepable RCU. Use as lock when type of the request queue - * is blocking (BLK_MQ_F_BLOCKING). 
Must be the last member - */ - struct srcu_struct srcu[]; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ -#define QUEUE_FLAG_HAS_SRCU 2 /* SRCU is allocated */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ @@ -590,7 +582,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) -#define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ -- cgit v1.2.3-59-g8ed1b From 2bd85221a625b316114bafaab527770b607095d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:26:36 +0100 Subject: block: untangle request_queue refcounting from sysfs The kobject embedded into the request_queue is used for the queue directory in sysfs, but that is a child of the gendisks directory and is intimately tied to it. Move this kobject to the gendisk and use a refcount_t in the request_queue for the actual request_queue refcounting that is completely unrelated to the device model. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221114042637.1009333-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 42 ++++++++++++++++++----- block/blk-crypto-sysfs.c | 4 +-- block/blk-ia-ranges.c | 3 +- block/blk-sysfs.c | 86 +++++++++++++----------------------------------- block/blk.h | 4 --- block/bsg.c | 11 ++++--- block/elevator.c | 2 +- include/linux/blkdev.h | 6 ++-- 8 files changed, 71 insertions(+), 87 deletions(-) (limited to 'block/blk.h') diff --git a/block/blk-core.c b/block/blk-core.c index e9e2bf15cd90..d14317bfdf65 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -59,12 +59,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert); -DEFINE_IDA(blk_queue_ida); +static DEFINE_IDA(blk_queue_ida); /* * For queue allocation */ -struct kmem_cache *blk_requestq_cachep; +static struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd @@ -252,19 +252,46 @@ void blk_clear_pm_only(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_clear_pm_only); +static void blk_free_queue_rcu(struct rcu_head *rcu_head) +{ + kmem_cache_free(blk_requestq_cachep, + container_of(rcu_head, struct request_queue, rcu_head)); +} + +static void blk_free_queue(struct request_queue *q) +{ + might_sleep(); + + percpu_ref_exit(&q->q_usage_counter); + + if (q->poll_stat) + blk_stat_remove_callback(q, q->poll_cb); + blk_stat_free_callback(q->poll_cb); + + blk_free_queue_stats(q->stats); + kfree(q->poll_stat); + + if (queue_is_mq(q)) + blk_mq_release(q); + + ida_free(&blk_queue_ida, q->id); + call_rcu(&q->rcu_head, blk_free_queue_rcu); +} + /** * blk_put_queue - decrement the request_queue refcount * @q: the request_queue structure to decrement the refcount for * - * Decrements the refcount of the request_queue kobject. When this reaches 0 - * we'll have blk_release_queue() called. + * Decrements the refcount of the request_queue and free it when the refcount + * reaches 0. 
* * Context: Any context, but the last reference must not be dropped from * atomic context. */ void blk_put_queue(struct request_queue *q) { - kobject_put(&q->kobj); + if (refcount_dec_and_test(&q->refs)) + blk_free_queue(q); } EXPORT_SYMBOL(blk_put_queue); @@ -399,8 +426,7 @@ struct request_queue *blk_alloc_queue(int node_id) INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); - kobject_init(&q->kobj, &blk_queue_ktype); - + refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); @@ -445,7 +471,7 @@ bool blk_get_queue(struct request_queue *q) { if (unlikely(blk_queue_dying(q))) return false; - kobject_get(&q->kobj); + refcount_inc(&q->refs); return true; } EXPORT_SYMBOL(blk_get_queue); diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index e05f145cd797..55268edc0625 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -140,8 +140,8 @@ int blk_crypto_sysfs_register(struct gendisk *disk) return -ENOMEM; obj->profile = q->crypto_profile; - err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj, - "crypto"); + err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, + &disk->queue_kobj, "crypto"); if (err) { kobject_put(&obj->kobj); return err; diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index 2bd1d311033b..2141931ddd37 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -123,7 +123,8 @@ int disk_register_independent_access_ranges(struct gendisk *disk) */ WARN_ON(iars->sysfs_registered); ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype, - &q->kobj, "%s", "independent_access_ranges"); + &disk->queue_kobj, "%s", + "independent_access_ranges"); if (ret) { disk->ia_ranges = NULL; kobject_put(&iars->kobj); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index abd1784ff05e..93d9e9c9a6ea 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -683,8 +683,8 @@ static struct attribute *queue_attrs[] = { static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; if (attr == &queue_io_timeout_entry.attr && (!q->mq_ops || !q->mq_ops->timeout)) @@ -710,8 +710,8 @@ static ssize_t queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; ssize_t res; if (!entry->show) @@ -727,63 +727,19 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct queue_sysfs_entry *entry = to_queue(attr); - struct request_queue *q; + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; ssize_t res; if (!entry->store) return -EIO; - q = container_of(kobj, struct request_queue, kobj); mutex_lock(&q->sysfs_lock); res = entry->store(q, page, length); mutex_unlock(&q->sysfs_lock); return res; } -static void blk_free_queue_rcu(struct rcu_head *rcu_head) -{ - kmem_cache_free(blk_requestq_cachep, - container_of(rcu_head, struct request_queue, rcu_head)); -} - -/** - * blk_release_queue - releases all allocated resources of the request_queue - * @kobj: pointer to a kobject, whose 
container is a request_queue - * - * This function releases all allocated resources of the request queue. - * - * The struct request_queue refcount is incremented with blk_get_queue() and - * decremented with blk_put_queue(). Once the refcount reaches 0 this function - * is called. - * - * Drivers exist which depend on the release of the request_queue to be - * synchronous, it should not be deferred. - * - * Context: can sleep - */ -static void blk_release_queue(struct kobject *kobj) -{ - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); - - might_sleep(); - - percpu_ref_exit(&q->q_usage_counter); - - if (q->poll_stat) - blk_stat_remove_callback(q, q->poll_cb); - blk_stat_free_callback(q->poll_cb); - - blk_free_queue_stats(q->stats); - kfree(q->poll_stat); - - if (queue_is_mq(q)) - blk_mq_release(q); - - ida_free(&blk_queue_ida, q->id); - call_rcu(&q->rcu_head, blk_free_queue_rcu); -} - static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, @@ -794,10 +750,15 @@ static const struct attribute_group *blk_queue_attr_groups[] = { NULL }; -struct kobj_type blk_queue_ktype = { +static void blk_queue_release(struct kobject *kobj) +{ + /* nothing to do here, all data is associated with the parent gendisk */ +} + +static struct kobj_type blk_queue_ktype = { .default_groups = blk_queue_attr_groups, .sysfs_ops = &queue_sysfs_ops, - .release = blk_release_queue, + .release = blk_queue_release, }; static void blk_debugfs_remove(struct gendisk *disk) @@ -823,20 +784,20 @@ int blk_register_queue(struct gendisk *disk) int ret; mutex_lock(&q->sysfs_dir_lock); - ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue"); + kobject_init(&disk->queue_kobj, &blk_queue_ktype); + ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) - goto out_unlock_dir; + goto out_put_queue_kobj; if (queue_is_mq(q)) { ret = blk_mq_sysfs_register(disk); if (ret) - goto out_del_queue_kobj; + goto out_put_queue_kobj; } mutex_lock(&q->sysfs_lock); mutex_lock(&q->debugfs_mutex); - q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), - blk_debugfs_root); + q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root); if (queue_is_mq(q)) blk_mq_debugfs_register(q); mutex_unlock(&q->debugfs_mutex); @@ -860,7 +821,7 @@ int blk_register_queue(struct gendisk *disk) blk_throtl_register(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ - kobject_uevent(&q->kobj, KOBJ_ADD); + kobject_uevent(&disk->queue_kobj, KOBJ_ADD); if (q->elevator) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); @@ -889,9 +850,8 @@ out_unregister_ia_ranges: out_debugfs_remove: blk_debugfs_remove(disk); mutex_unlock(&q->sysfs_lock); -out_del_queue_kobj: - kobject_del(&q->kobj); -out_unlock_dir: +out_put_queue_kobj: + kobject_put(&disk->queue_kobj); mutex_unlock(&q->sysfs_dir_lock); return ret; } @@ -938,8 +898,8 @@ void blk_unregister_queue(struct gendisk *disk) mutex_unlock(&q->sysfs_lock); /* Now that we've deleted all child objects, we can delete the queue. 
*/ - kobject_uevent(&q->kobj, KOBJ_REMOVE); - kobject_del(&q->kobj); + kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE); + kobject_del(&disk->queue_kobj); mutex_unlock(&q->sysfs_dir_lock); blk_debugfs_remove(disk); diff --git a/block/blk.h b/block/blk.h index e85703ae81dd..a8ac9803fcb3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -26,10 +26,6 @@ struct blk_flush_queue { spinlock_t mq_flush_lock; }; -extern struct kmem_cache *blk_requestq_cachep; -extern struct kobj_type blk_queue_ktype; -extern struct ida blk_queue_ida; - bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, diff --git a/block/bsg.c b/block/bsg.c index 2ab1351eb082..8eba57b9bb46 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -175,8 +175,10 @@ static void bsg_device_release(struct device *dev) void bsg_unregister_queue(struct bsg_device *bd) { - if (bd->queue->kobj.sd) - sysfs_remove_link(&bd->queue->kobj, "bsg"); + struct gendisk *disk = bd->queue->disk; + + if (disk && disk->queue_kobj.sd) + sysfs_remove_link(&disk->queue_kobj, "bsg"); cdev_device_del(&bd->cdev, &bd->device); put_device(&bd->device); } @@ -216,8 +218,9 @@ struct bsg_device *bsg_register_queue(struct request_queue *q, if (ret) goto out_put_device; - if (q->kobj.sd) { - ret = sysfs_create_link(&q->kobj, &bd->device.kobj, "bsg"); + if (q->disk && q->disk->queue_kobj.sd) { + ret = sysfs_create_link(&q->disk->queue_kobj, &bd->device.kobj, + "bsg"); if (ret) goto out_device_del; } diff --git a/block/elevator.c b/block/elevator.c index 14e03632b5b5..adee58e48e2d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -467,7 +467,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) lockdep_assert_held(&q->sysfs_lock); - error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); + error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; if (attr) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 516e45246868..469299ea0660 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -155,6 +155,7 @@ struct gendisk { unsigned open_partitions; /* number of open partitions */ struct backing_dev_info *bdi; + struct kobject queue_kobj; /* the queue/ directory */ struct kobject *slave_dir; #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED struct list_head slave_bdevs; @@ -430,10 +431,7 @@ struct request_queue { struct gendisk *disk; - /* - * queue kobject - */ - struct kobject kobj; + refcount_t refs; /* * mq queue kobject -- cgit v1.2.3-59-g8ed1b From 36369f46e91785688a5f39d7a5590e3f07981316 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 30 Nov 2022 18:56:53 +0100 Subject: block: Do not reread partition table on exclusively open device Since commit 10c70d95c0f2 ("block: remove the bd_openers checks in blk_drop_partitions") we allow rereading of the partition table although there are users of the block device. This has an undesirable consequence that e.g. if sda and sdb are assembled to a RAID1 device md0 with partitions, a BLKRRPART ioctl on sda will rescan the partition table and create an sda1 device. This partition device under a RAID device confuses some programs (such as libstorage-ng, used for initial partitioning during distribution installation), leading to failures. Fix the problem by refusing to rescan partitions if there is another user that has the block device exclusively open. 
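The user-visible change can be probed from userspace with a BLKRRPART ioctl; a minimal illustration (editorial, not part of the patch; /dev/sdX is a hypothetical test disk, run as root):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/fs.h>	/* BLKRRPART */

	int main(void)
	{
		int fd = open("/dev/sdX", O_RDONLY);	/* hypothetical test disk */

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/*
		 * After this fix the rescan fails with EBUSY while another
		 * context (e.g. an md array holding sdX as a member) has the
		 * disk open exclusively, instead of creating partition
		 * devices under it.
		 */
		if (ioctl(fd, BLKRRPART) < 0)
			perror("BLKRRPART");
		close(fd);
		return 0;
	}
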
Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20221130135344.2ul4cyfstfs3znxg@quack3 Fixes: 10c70d95c0f2 ("block: remove the bd_openers checks in blk_drop_partitions") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221130175653.24299-1-jack@suse.cz [axboe: fold in followup fix] Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/genhd.c | 7 +++++-- block/ioctl.c | 12 +++++++----- 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'block/blk.h') diff --git a/block/blk.h b/block/blk.h index a8ac9803fcb3..8900001946c7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -426,7 +426,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct request_queue *blk_alloc_queue(int node_id); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode); +int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); diff --git a/block/genhd.c b/block/genhd.c index 075d8da284f5..52d71a94a809 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -356,7 +356,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action) } EXPORT_SYMBOL_GPL(disk_uevent); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode) +int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner) { struct block_device *bdev; @@ -366,6 +366,9 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) return -EINVAL; if (disk->open_partitions) return -EBUSY; + /* Someone else has bdev exclusively open? */ + if (disk->part0->bd_holder && disk->part0->bd_holder != owner) + return -EBUSY; set_bit(GD_NEED_PART_SCAN, &disk->state); bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL); @@ -495,7 +498,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, bdev_add(disk->part0, ddev->devt); if (get_capacity(disk)) - disk_scan_partitions(disk, FMODE_READ); + disk_scan_partitions(disk, FMODE_READ, NULL); /* * Announce the disk and partitions after all partitions are diff --git a/block/ioctl.c b/block/ioctl.c index 60121e89052b..96617512982e 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -467,9 +467,10 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, * user space. Note the separate arg/argp parameters that are needed * to deal with the compat_ptr() conversion. 
*/ -static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg, void __user *argp) +static int blkdev_common_ioctl(struct file *file, fmode_t mode, unsigned cmd, + unsigned long arg, void __user *argp) { + struct block_device *bdev = I_BDEV(file->f_mapping->host); unsigned int max_sectors; switch (cmd) { @@ -527,7 +528,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; - return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL); + return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL, + file); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: @@ -605,7 +607,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; } - ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); + ret = blkdev_common_ioctl(file, mode, cmd, arg, argp); if (ret != -ENOIOCTLCMD) return ret; @@ -674,7 +676,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; } - ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); + ret = blkdev_common_ioctl(file, mode, cmd, arg, argp); if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl) ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); -- cgit v1.2.3-59-g8ed1b
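Taken together, the final patch works because, for an exclusive open(2) of a block device, the block layer records the opening struct file as the holder; threading the ioctl caller's file into blkdev_common_ioctl() therefore lets disk_scan_partitions() tell "the exclusive holder itself requested the rescan" apart from "someone else holds the device". A condensed sketch (editorial, kernel context assumed, not applicable code):

	/*
	 * Editorial sketch, not part of the patch.  At exclusive open time
	 * the opener's struct file is recorded as the holder, conceptually:
	 *
	 *	bdev->bd_holder = filp;
	 *
	 * disk_scan_partitions(disk, mode, owner) then receives the ioctl
	 * caller's file as "owner" (NULL for the initial scan in
	 * device_add_disk()) and refuses the rescan for anyone else:
	 */
	if (disk->part0->bd_holder && disk->part0->bd_holder != owner)
		return -EBUSY;	/* another context holds the disk exclusively */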