From 98e5440271172ca5c654784a31e46453464a3ca9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:40 -0700 Subject: block: Fix three kernel-doc warnings Fix the following kernel-doc warnings: block/t10-pi.c:242: warning: Function parameter or member 'rq' not described in 't10_pi_type3_prepare' block/t10-pi.c:249: warning: Function parameter or member 'rq' not described in 't10_pi_type3_complete' block/t10-pi.c:249: warning: Function parameter or member 'nr_bytes' not described in 't10_pi_type3_complete' Cc: Max Gurtovoy Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Fixes: 54d4e6ab91eb ("block: centralize PI remapping logic to the block layer") Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/t10-pi.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/t10-pi.c b/block/t10-pi.c index 9803c7e0376e..f4907d941f03 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -235,16 +235,12 @@ static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); } -/** - * Type 3 does not have a reference tag so no remapping is required. - */ +/* Type 3 does not have a reference tag so no remapping is required. */ static void t10_pi_type3_prepare(struct request *rq) { } -/** - * Type 3 does not have a reference tag so no remapping is required. - */ +/* Type 3 does not have a reference tag so no remapping is required. */ static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) { } -- cgit v1.2.3-59-g8ed1b From 9566256518de0520c964bdf23140eac324b136af Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:42 -0700 Subject: block: Remove request_queue.nr_queues Commit 897bb0c7f1ea ("blk-mq: Use proper cpumask iterator"; v4.6) removed the last use of request_queue.nr_queues from outside blk_mq_init_allocate_queue(). Remove this member variable to make struct request_queue smaller. This patch does not change any functionality. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- include/linux/blkdev.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ec791156e9cc..9d932939b576 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2876,9 +2876,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); - q->nr_queues = nr_hw_queues(set); - q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)), - GFP_KERNEL, set->numa_node); + q->queue_hw_ctx = kcalloc_node(nr_hw_queues(set), + sizeof(*(q->queue_hw_ctx)), GFP_KERNEL, + set->numa_node); if (!q->queue_hw_ctx) goto err_sys_init; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3ea78b0c91c..d4051acb92a1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -411,7 +411,6 @@ struct request_queue { /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; - unsigned int nr_queues; unsigned int queue_depth; -- cgit v1.2.3-59-g8ed1b From bae85c156f619939ef6261f1bd4fabbe24361b50 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:43 -0700 Subject: block: Remove "dying" checks from sysfs callbacks Block drivers must call del_gendisk() before blk_cleanup_queue(). del_gendisk() calls kobject_del() and kobject_del() waits until any ongoing sysfs callback functions have finished. In other words, the sysfs callback functions won't be called for a queue in the dying state. Hence remove the "dying" checks from the sysfs callback functions. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk-mq-sysfs.c | 16 ++++------------ block/blk-sysfs.c | 8 -------- 3 files changed, 6 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d5e668ec751b..8b51d9ec8ae3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -336,6 +336,8 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying); */ void blk_cleanup_queue(struct request_queue *q) { + WARN_ON_ONCE(blk_queue_registered(q)); + /* mark @q DYING, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); blk_set_queue_dying(q); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index a0d3ce30fa08..81a273b43329 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -74,10 +74,8 @@ static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, if (!entry->show) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->show(ctx, page); + res = entry->show(ctx, page); mutex_unlock(&q->sysfs_lock); return res; } @@ -97,10 +95,8 @@ static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, if (!entry->store) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->store(ctx, page, length); + res = entry->store(ctx, page, length); mutex_unlock(&q->sysfs_lock); return res; } @@ -120,10 +116,8 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, if (!entry->show) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->show(hctx, page); + res = entry->show(hctx, page); mutex_unlock(&q->sysfs_lock); return res; } @@ -144,10 +138,8 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, if (!entry->store) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->store(hctx, page, length); + res = entry->store(hctx, page, length); mutex_unlock(&q->sysfs_lock); return res; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 46f5198be017..fca9b158f4a0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -801,10 +801,6 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; mutex_lock(&q->sysfs_lock); - if (blk_queue_dying(q)) { - mutex_unlock(&q->sysfs_lock); - return -ENOENT; - } res = entry->show(q, page); mutex_unlock(&q->sysfs_lock); return res; @@ -823,10 +819,6 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, q = container_of(kobj, struct request_queue, kobj); mutex_lock(&q->sysfs_lock); - if (blk_queue_dying(q)) { - mutex_unlock(&q->sysfs_lock); - return -ENOENT; - } res = entry->store(q, page, length); mutex_unlock(&q->sysfs_lock); return res; -- cgit v1.2.3-59-g8ed1b From 73f1c77e65117e8f44074402e7cf5f4934505bfb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:44 -0700 Subject: block: Reduce sysfs_lock locking inside blk_cleanup_queue() Since blk_cleanup_queue() is called after blk_unregister_queue() and since that last function removes all sysfs attributes, serializing any code in blk_cleanup_queue() against sysfs callback methods nor against I/O scheduler changes is necessary. Hence remove the syfs_lock locking calls from the start of blk_cleanup_queue(). Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 8b51d9ec8ae3..ae506ac2dd48 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -339,13 +339,11 @@ void blk_cleanup_queue(struct request_queue *q) WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ - mutex_lock(&q->sysfs_lock); blk_set_queue_dying(q); blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); blk_queue_flag_set(QUEUE_FLAG_DYING, q); - mutex_unlock(&q->sysfs_lock); /* * Drain all requests queued before DYING marking. Set DEAD flag to -- cgit v1.2.3-59-g8ed1b From 27a46989a82c71028f2ba15a3f2c8f30451fda33 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Sep 2019 11:25:49 +0300 Subject: blk-mq: Inline status checkers blk_mq_request_completed() and blk_mq_request_started() are short, inline it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ------------ block/blk-mq.h | 9 --------- include/linux/blk-mq.h | 20 ++++++++++++++++++-- 3 files changed, 18 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 9d932939b576..268a95e899b7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -663,18 +663,6 @@ bool blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -int blk_mq_request_started(struct request *rq) -{ - return blk_mq_rq_state(rq) != MQ_RQ_IDLE; -} -EXPORT_SYMBOL_GPL(blk_mq_request_started); - -int blk_mq_request_completed(struct request *rq) -{ - return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; -} -EXPORT_SYMBOL_GPL(blk_mq_request_completed); - void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; diff --git a/block/blk-mq.h b/block/blk-mq.h index 32c62c64e6c2..eaaca8fc1c28 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -128,15 +128,6 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_release(struct request_queue *q); -/** - * blk_mq_rq_state() - read the current MQ_RQ_* state of a request - * @rq: target request. - */ -static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) -{ - return READ_ONCE(rq->state); -} - static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a96b5cc957ab..e0fce93ac127 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -333,9 +333,25 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; } +/** + * blk_mq_rq_state() - read the current MQ_RQ_* state of a request + * @rq: target request. + */ +static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) +{ + return READ_ONCE(rq->state); +} + +static inline int blk_mq_request_started(struct request *rq) +{ + return blk_mq_rq_state(rq) != MQ_RQ_IDLE; +} + +static inline int blk_mq_request_completed(struct request *rq) +{ + return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; +} -int blk_mq_request_started(struct request *rq); -int blk_mq_request_completed(struct request *rq); void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); -- cgit v1.2.3-59-g8ed1b From bb4e6b149103c285aeeba43a8141ea3b7009c0fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Sep 2019 21:55:33 +0300 Subject: blk-mq: Reuse callback in blk_mq_in_flight*() Reuse a more generic callback in both blk_mq_in_flight() and blk_mq_in_flight_rw(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-mq.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 268a95e899b7..19560a16a774 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -102,11 +102,8 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - /* - * index[0] counts the specific partition that was asked for. - */ if (rq->part == mi->part) - mi->inflight[0]++; + mi->inflight[rq_data_dir(rq)]++; return true; } @@ -119,19 +116,7 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - return inflight[0]; -} - -static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, - bool reserved) -{ - struct mq_inflight *mi = priv; - - if (rq->part == mi->part) - mi->inflight[rq_data_dir(rq)]++; - - return true; + return inflight[0] + inflight[1]; } void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, @@ -140,7 +125,7 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, struct mq_inflight mi = { .part = part, .inflight = inflight, }; inflight[0] = inflight[1] = 0; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); } void blk_freeze_queue_start(struct request_queue *q) -- cgit v1.2.3-59-g8ed1b From a2e80f6f044526e00a788aa8e1ed72b1911f4534 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Sep 2019 21:55:34 +0300 Subject: blk-mq: Embed counters into struct mq_inflight Store inflight counters immediately in struct mq_inflight. That's type-safer and removes extra indirection. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 19560a16a774..8538dc415499 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -93,7 +93,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct mq_inflight { struct hd_struct *part; - unsigned int *inflight; + unsigned int inflight[2]; }; static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, @@ -110,22 +110,21 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) { - unsigned inflight[2]; - struct mq_inflight mi = { .part = part, .inflight = inflight, }; + struct mq_inflight mi = { .part = part }; - inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - return inflight[0] + inflight[1]; + return mi.inflight[0] + mi.inflight[1]; } void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - struct mq_inflight mi = { .part = part, .inflight = inflight, }; + struct mq_inflight mi = { .part = part }; - inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + inflight[0] = mi.inflight[0]; + inflight[1] = mi.inflight[1]; } void blk_freeze_queue_start(struct request_queue *q) -- cgit v1.2.3-59-g8ed1b From 8148f0b5647a831c5d94b59da240c8e76dbacae9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 8 Oct 2019 00:16:51 +0300 Subject: blk-stat: Optimise blk_stat_add() blk_stat_add() calls {get,put}_cpu_ptr() in a loop, which entails overhead of disabling/enabling preemption. The loop is under RCU (i.e.short) anyway, so do get_cpu() in advance. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-stat.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-stat.c b/block/blk-stat.c index 940f15d600f8..7da302ff88d0 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -53,7 +53,7 @@ void blk_stat_add(struct request *rq, u64 now) struct request_queue *q = rq->q; struct blk_stat_callback *cb; struct blk_rq_stat *stat; - int bucket; + int bucket, cpu; u64 value; value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; @@ -61,6 +61,7 @@ void blk_stat_add(struct request *rq, u64 now) blk_throtl_stat_add(rq, value); rcu_read_lock(); + cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { if (!blk_stat_is_active(cb)) continue; @@ -69,10 +70,10 @@ void blk_stat_add(struct request *rq, u64 now) if (bucket < 0) continue; - stat = &get_cpu_ptr(cb->cpu_stat)[bucket]; + stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket]; blk_rq_stat_add(stat, value); - put_cpu_ptr(cb->cpu_stat); } + put_cpu(); rcu_read_unlock(); } -- cgit v1.2.3-59-g8ed1b From 48d9b0d43105e0da2b7c135eedd24e51234fb5e4 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 10 Oct 2019 17:36:26 -0600 Subject: block: account statistics for passthrough requests Presently, passthrough requests are not accounted for because blk_do_io_stat() expressly rejects them. Based on some digging in the history, this doesn't seem like a concious decision but one that evolved from the change from blk_fs_request() to blk_rq_is_passthrough(). To support this, call blk_account_io_start() in blk_execute_rq_nowait() and remove the passthrough check in blk_do_io_stat(). Link: https://lore.kernel.org/linux-block/20191010100526.GA27209@lst.de/ Signed-off-by: Logan Gunthorpe Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-exec.c | 2 ++ block/blk.h | 7 ++----- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-exec.c b/block/blk-exec.c index 1db44ca0f4a6..e20a852ae432 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -55,6 +55,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq->rq_disk = bd_disk; rq->end_io = done; + blk_account_io_start(rq, true); + /* * don't check dying flag for MQ because the request won't * be reused after dying flag is set diff --git a/block/blk.h b/block/blk.h index 47fba9362e60..2bea40180b6f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -242,14 +242,11 @@ int blk_dev_init(void); * Contribute to IO statistics IFF: * * a) it's attached to a gendisk, and - * b) the queue had IO stats enabled when this request was started, and - * c) it's a file system request + * b) the queue had IO stats enabled when this request was started */ static inline bool blk_do_io_stat(struct request *rq) { - return rq->rq_disk && - (rq->rq_flags & RQF_IO_STAT) && - !blk_rq_is_passthrough(rq); + return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT); } static inline void req_set_nomerge(struct request_queue *q, struct request *req) -- cgit v1.2.3-59-g8ed1b From a9a808084d6ac015ac68b4223ac5f6ef31f8076e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Oct 2019 09:50:08 -0700 Subject: block: Remove the synchronize_rcu() call from __blk_mq_update_nr_hw_queues() Since the blk_mq_{,un}freeze_queue() calls in __blk_mq_update_nr_hw_queues() already serialize __blk_mq_update_nr_hw_queues() against blk_mq_queue_tag_busy_iter(), the synchronize_rcu() call in __blk_mq_update_nr_hw_queues() is not necessary. Hence remove it. Note: the synchronize_rcu() call in __blk_mq_update_nr_hw_queues() was introduced by commit f5bbbbe4d635 ("blk-mq: sync the update nr_hw_queues with blk_mq_queue_tag_busy_iter"). Commit 530ca2c9bd69 ("blk-mq: Allow blocking queue tag iter callbacks") removed the rcu_read_{,un}lock() calls that correspond to the synchronize_rcu() call in __blk_mq_update_nr_hw_queues(). Reviewed-by: Ming Lei Cc: Jianchao Wang Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8538dc415499..7528678ef41f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3242,10 +3242,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); - /* - * Sync with blk_mq_queue_tag_busy_iter. - */ - synchronize_rcu(); /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done -- cgit v1.2.3-59-g8ed1b From ac0d6b926e741f328b23c8af0134312af7c032d9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Oct 2019 09:50:09 -0700 Subject: block: Reduce the amount of memory required per request queue Instead of always allocating at least nr_cpu_ids hardware queues per request queue, reallocate q->queue_hw_ctx if it has to grow. This patch improves behavior that was introduced by commit 868f2f0b7206 ("blk-mq: dynamic h/w context count"). Cc: Keith Busch Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 7528678ef41f..ba09cda49953 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2761,6 +2761,23 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, int i, j, end; struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + if (q->nr_hw_queues < set->nr_hw_queues) { + struct blk_mq_hw_ctx **new_hctxs; + + new_hctxs = kcalloc_node(set->nr_hw_queues, + sizeof(*new_hctxs), GFP_KERNEL, + set->numa_node); + if (!new_hctxs) + return; + if (hctxs) + memcpy(new_hctxs, hctxs, q->nr_hw_queues * + sizeof(*hctxs)); + q->queue_hw_ctx = new_hctxs; + q->nr_hw_queues = set->nr_hw_queues; + kfree(hctxs); + hctxs = new_hctxs; + } + /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { @@ -2848,12 +2865,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); - q->queue_hw_ctx = kcalloc_node(nr_hw_queues(set), - sizeof(*(q->queue_hw_ctx)), GFP_KERNEL, - set->numa_node); - if (!q->queue_hw_ctx) - goto err_sys_init; - INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); @@ -2901,7 +2912,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, err_hctxs: kfree(q->queue_hw_ctx); q->nr_hw_queues = 0; -err_sys_init: blk_mq_sysfs_deinit(q); err_poll: blk_stat_free_callback(q->poll_cb); -- cgit v1.2.3-59-g8ed1b From f7e76dbc24df695f1b8e88ed3201be22215ec969 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Oct 2019 09:50:10 -0700 Subject: block: Reduce the amount of memory used for tag sets Instead of allocating an array of size nr_cpu_ids for set->tags, allocate an array of size set->nr_hw_queues. This patch improves behavior that was introduced by commit 868f2f0b7206 ("blk-mq: dynamic h/w context count"). Reallocating tag sets from inside __blk_mq_update_nr_hw_queues() is safe because: - All request queues that share the tag sets are frozen before the tag sets are reallocated. - blk_mq_queue_tag_busy_iter() holds q->q_usage_counter while active and hence is serialized against __blk_mq_update_nr_hw_queues(). Cc: Keith Busch Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ba09cda49953..df41b2d16261 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2833,19 +2833,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } -/* - * Maximum number of hardware queues we support. For single sets, we'll never - * have more than the CPUs (software queues). For multiple sets, the tag_set - * user may have set ->nr_hw_queues larger. - */ -static unsigned int nr_hw_queues(struct blk_mq_tag_set *set) -{ - if (set->nr_maps == 1) - return nr_cpu_ids; - - return max(set->nr_hw_queues, nr_cpu_ids); -} - struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q, bool elevator_init) @@ -3012,6 +2999,29 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) } } +static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, + int cur_nr_hw_queues, int new_nr_hw_queues) +{ + struct blk_mq_tags **new_tags; + + if (cur_nr_hw_queues >= new_nr_hw_queues) + return 0; + + new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), + GFP_KERNEL, set->numa_node); + if (!new_tags) + return -ENOMEM; + + if (set->tags) + memcpy(new_tags, set->tags, cur_nr_hw_queues * + sizeof(*set->tags)); + kfree(set->tags); + set->tags = new_tags; + set->nr_hw_queues = new_nr_hw_queues; + + return 0; +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -3065,9 +3075,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *), - GFP_KERNEL, set->numa_node); - if (!set->tags) + if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) return -ENOMEM; ret = -ENOMEM; @@ -3108,7 +3116,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; - for (i = 0; i < nr_hw_queues(set); i++) + for (i = 0; i < set->nr_hw_queues; i++) blk_mq_free_map_and_requests(set, i); for (j = 0; j < set->nr_maps; j++) { @@ -3266,6 +3274,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, blk_mq_sysfs_unregister(q); } + if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < + 0) + goto reregister; + prev_nr_hw_queues = set->nr_hw_queues; set->nr_hw_queues = nr_hw_queues; blk_mq_update_queue_map(set); @@ -3282,6 +3294,7 @@ fallback: blk_mq_map_swqueue(q); } +reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register(q); blk_mq_debugfs_register_hctxs(q); -- cgit v1.2.3-59-g8ed1b From 1fead7182f381ab0ebab5eaf1a060a15550da994 Mon Sep 17 00:00:00 2001 From: André Almeida Date: Fri, 25 Oct 2019 14:16:51 -0600 Subject: blk-mq: remove needless goto from blk_mq_get_driver_tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only usage of the label "done" is when (rq->tag != -1) at the beginning of the function. Rather than jumping to label, we can just remove this label and execute the code at the "if". Besides that, the code that would be executed after the label "done" is the return of the logical expression (rq->tag != -1) but since we are already inside the if, we now that this is true. Remove the label and replace the goto with the proper result of the label. Signed-off-by: André Almeida Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index df41b2d16261..c0f3357a2050 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1036,7 +1036,7 @@ bool blk_mq_get_driver_tag(struct request *rq) bool shared; if (rq->tag != -1) - goto done; + return true; if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) data.flags |= BLK_MQ_REQ_RESERVED; @@ -1051,7 +1051,6 @@ bool blk_mq_get_driver_tag(struct request *rq) data.hctx->tags->rqs[rq->tag] = rq; } -done: return rq->tag != -1; } -- cgit v1.2.3-59-g8ed1b From 626fb735a43ddcb7b2c58c27cb03b098acc03339 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 30 Oct 2019 00:59:30 +0800 Subject: blk-mq: Make blk_mq_run_hw_queue() return void Since commit 97889f9ac24f ("blk-mq: remove synchronize_rcu() from blk_mq_del_queue_tag_set()"), the return value of blk_mq_run_hw_queue() is never checked, so make it return void, which very marginally simplifies the code. Reviewed-by: Bob Liu Signed-off-by: John Garry Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++------ include/linux/blk-mq.h | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c0f3357a2050..5c9adcaa27ac 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1457,7 +1457,7 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); -bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { int srcu_idx; bool need_run; @@ -1475,12 +1475,8 @@ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) blk_mq_hctx_has_pending(hctx); hctx_unlock(hctx, srcu_idx); - if (need_run) { + if (need_run) __blk_mq_delay_run_hw_queue(hctx, async, 0); - return true; - } - - return false; } EXPORT_SYMBOL(blk_mq_run_hw_queue); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4919d22e1aff..dc03e059fdff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -502,7 +502,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); -- cgit v1.2.3-59-g8ed1b From 8962842ca5abdcf98e22ab3b2b45a103f0408b95 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 2 Nov 2019 16:02:15 +0800 Subject: blk-mq: avoid sysfs buffer overflow with too many CPU cores It is reported that sysfs buffer overflow can be triggered if the system has too many CPU cores(>841 on 4K PAGE_SIZE) when showing CPUs of hctx via /sys/block/$DEV/mq/$N/cpu_list. Use snprintf to avoid the potential buffer overflow. This version doesn't change the attribute format, and simply stops showing CPU numbers if the buffer is going to overflow. Cc: stable@vger.kernel.org Fixes: 676141e48af7("blk-mq: don't dump CPU -> hw queue map on driver load") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 81a273b43329..4caa56d1bc85 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -158,20 +158,25 @@ static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { + const size_t size = PAGE_SIZE - 1; unsigned int i, first = 1; - ssize_t ret = 0; + int ret = 0, pos = 0; for_each_cpu(i, hctx->cpumask) { if (first) - ret += sprintf(ret + page, "%u", i); + ret = snprintf(pos + page, size - pos, "%u", i); else - ret += sprintf(ret + page, ", %u", i); + ret = snprintf(pos + page, size - pos, ", %u", i); + + if (ret >= size - pos) + break; first = 0; + pos += ret; } - ret += sprintf(ret + page, "\n"); - return ret; + ret = snprintf(pos + page, size - pos, "\n"); + return pos + ret; } static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { -- cgit v1.2.3-59-g8ed1b From 3495ea1b5f6083b03af062095eecb37283a2cc8f Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 31 Oct 2019 10:13:20 -0600 Subject: block: sed-opal: Generalizing write data to any opal table This patch refactors the existing "write_shadowmbr" func and creates a new generalized function "generic_table_write_data", to write data to any opal table. Also, a few cleanups are included in this patch. Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/sed-opal.c | 138 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 74 insertions(+), 64 deletions(-) (limited to 'block') diff --git a/block/sed-opal.c b/block/sed-opal.c index b4c761973ac1..d6e2ec0d8a3a 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1139,11 +1139,11 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, * * the result is provided in dev->resp->tok[4] */ -static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table, +static int generic_get_table_info(struct opal_dev *dev, const u8 *table_uid, u64 column) { u8 uid[OPAL_UID_LENGTH]; - const unsigned int half = OPAL_UID_LENGTH/2; + const unsigned int half = OPAL_UID_LENGTH_HALF; /* sed-opal UIDs can be split in two halves: * first: actual table index @@ -1152,7 +1152,7 @@ static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table, * first part of the target table as relative index into that table */ memcpy(uid, opaluid[OPAL_TABLE_TABLE], half); - memcpy(uid+half, opaluid[table], half); + memcpy(uid + half, table_uid, half); return generic_get_column(dev, uid, column); } @@ -1221,6 +1221,75 @@ static int get_active_key(struct opal_dev *dev, void *data) return get_active_key_cont(dev); } +static int generic_table_write_data(struct opal_dev *dev, const u64 data, + u64 offset, u64 size, const u8 *uid) +{ + const u8 __user *src = (u8 __user *)(uintptr_t)data; + u8 *dst; + u64 len; + size_t off = 0; + int err; + + /* do we fit in the available space? */ + err = generic_get_table_info(dev, uid, OPAL_TABLE_ROWS); + if (err) { + pr_debug("Couldn't get the table size\n"); + return err; + } + + len = response_get_u64(&dev->parsed, 4); + if (size > len || offset > len - size) { + pr_debug("Does not fit in the table (%llu vs. %llu)\n", + offset + size, len); + return -ENOSPC; + } + + /* do the actual transmission(s) */ + while (off < size) { + err = cmd_start(dev, uid, opalmethod[OPAL_SET]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WHERE); + add_token_u64(&err, dev, offset + off); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + /* + * The bytestring header is either 1 or 2 bytes, so assume 2. + * There also needs to be enough space to accommodate the + * trailing OPAL_ENDNAME (1 byte) and tokens added by + * cmd_finalize. + */ + len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), + (size_t)(size - off)); + pr_debug("Write bytes %zu+%llu/%llu\n", off, len, size); + + dst = add_bytestring_header(&err, dev, len); + if (!dst) + break; + + if (copy_from_user(dst, src + off, len)) { + err = -EFAULT; + break; + } + + dev->pos += len; + + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) + break; + + err = finalize_and_send(dev, parse_and_check_status); + if (err) + break; + + off += len; + } + + return err; +} + static int generic_lr_enable_disable(struct opal_dev *dev, u8 *uid, bool rle, bool wle, bool rl, bool wl) @@ -1583,68 +1652,9 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data) static int write_shadow_mbr(struct opal_dev *dev, void *data) { struct opal_shadow_mbr *shadow = data; - const u8 __user *src; - u8 *dst; - size_t off = 0; - u64 len; - int err = 0; - - /* do we fit in the available shadow mbr space? */ - err = generic_get_table_info(dev, OPAL_MBR, OPAL_TABLE_ROWS); - if (err) { - pr_debug("MBR: could not get shadow size\n"); - return err; - } - - len = response_get_u64(&dev->parsed, 4); - if (shadow->size > len || shadow->offset > len - shadow->size) { - pr_debug("MBR: does not fit in shadow (%llu vs. %llu)\n", - shadow->offset + shadow->size, len); - return -ENOSPC; - } - - /* do the actual transmission(s) */ - src = (u8 __user *)(uintptr_t)shadow->data; - while (off < shadow->size) { - err = cmd_start(dev, opaluid[OPAL_MBR], opalmethod[OPAL_SET]); - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_WHERE); - add_token_u64(&err, dev, shadow->offset + off); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_VALUES); - - /* - * The bytestring header is either 1 or 2 bytes, so assume 2. - * There also needs to be enough space to accommodate the - * trailing OPAL_ENDNAME (1 byte) and tokens added by - * cmd_finalize. - */ - len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), - (size_t)(shadow->size - off)); - pr_debug("MBR: write bytes %zu+%llu/%llu\n", - off, len, shadow->size); - - dst = add_bytestring_header(&err, dev, len); - if (!dst) - break; - if (copy_from_user(dst, src + off, len)) - err = -EFAULT; - dev->pos += len; - add_token_u8(&err, dev, OPAL_ENDNAME); - if (err) - break; - - err = finalize_and_send(dev, parse_and_check_status); - if (err) - break; - - off += len; - } - - return err; + return generic_table_write_data(dev, shadow->data, shadow->offset, + shadow->size, opaluid[OPAL_MBR]); } static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, -- cgit v1.2.3-59-g8ed1b From 51f421c85c880dcb37df11e672b384eaa4444328 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 31 Oct 2019 10:13:21 -0600 Subject: block: sed-opal: Add support to read/write opal tables generically This feature gives the user RW access to any opal table with admin1 authority. The flags described in the new structure determines if the user wants to read/write the data. Flags are checked for valid values in order to allow future features to be added to the ioctl. The user can provide the desired table's UID. Also, the ioctl provides a size and offset field and internally will loop data accesses to return the full data block. Read overrun is prevented by the initiator's sec_send_recv() backend. The ioctl provides a private field with the intention to accommodate any future expansions to the ioctl. Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 - block/sed-opal.c | 172 ++++++++++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 20 +++++ 4 files changed, 193 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/opal_proto.h b/block/opal_proto.h index 5532412d567c..710911864e1d 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -76,7 +76,6 @@ enum opal_response_token { * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Section: 6.3 Assigned UIDs */ -#define OPAL_UID_LENGTH 8 #define OPAL_METHOD_LENGTH 8 #define OPAL_MSID_KEYLEN 15 #define OPAL_UID_LENGTH_HALF 4 diff --git a/block/sed-opal.c b/block/sed-opal.c index d6e2ec0d8a3a..1d5afaaf0826 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1967,6 +1967,113 @@ static int get_msid_cpin_pin(struct opal_dev *dev, void *data) return 0; } +static int write_table_data(struct opal_dev *dev, void *data) +{ + struct opal_read_write_table *write_tbl = data; + + return generic_table_write_data(dev, write_tbl->data, write_tbl->offset, + write_tbl->size, write_tbl->table_uid); +} + +static int read_table_data_cont(struct opal_dev *dev) +{ + int err; + const char *data_read; + + err = parse_and_check_status(dev); + if (err) + return err; + + dev->prev_d_len = response_get_string(&dev->parsed, 1, &data_read); + dev->prev_data = (void *)data_read; + if (!dev->prev_data) { + pr_debug("%s: Couldn't read data from the table.\n", __func__); + return OPAL_INVAL_PARAM; + } + + return 0; +} + +/* + * IO_BUFFER_LENGTH = 2048 + * sizeof(header) = 56 + * No. of Token Bytes in the Response = 11 + * MAX size of data that can be carried in response buffer + * at a time is : 2048 - (56 + 11) = 1981 = 0x7BD. + */ +#define OPAL_MAX_READ_TABLE (0x7BD) + +static int read_table_data(struct opal_dev *dev, void *data) +{ + struct opal_read_write_table *read_tbl = data; + int err; + size_t off = 0, max_read_size = OPAL_MAX_READ_TABLE; + u64 table_len, len; + u64 offset = read_tbl->offset, read_size = read_tbl->size - 1; + u8 __user *dst; + + err = generic_get_table_info(dev, read_tbl->table_uid, OPAL_TABLE_ROWS); + if (err) { + pr_debug("Couldn't get the table size\n"); + return err; + } + + table_len = response_get_u64(&dev->parsed, 4); + + /* Check if the user is trying to read from the table limits */ + if (read_size > table_len || offset > table_len - read_size) { + pr_debug("Read size exceeds the Table size limits (%llu vs. %llu)\n", + offset + read_size, table_len); + return -EINVAL; + } + + while (off < read_size) { + err = cmd_start(dev, read_tbl->table_uid, opalmethod[OPAL_GET]); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_STARTROW); + add_token_u64(&err, dev, offset + off); /* start row value */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_ENDROW); + + len = min(max_read_size, (size_t)(read_size - off)); + add_token_u64(&err, dev, offset + off + len); /* end row value + */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_debug("Error building read table data command.\n"); + break; + } + + err = finalize_and_send(dev, read_table_data_cont); + if (err) + break; + + /* len+1: This includes the NULL terminator at the end*/ + if (dev->prev_d_len > len + 1) { + err = -EOVERFLOW; + break; + } + + dst = (u8 __user *)(uintptr_t)read_tbl->data; + if (copy_to_user(dst + off, dev->prev_data, dev->prev_d_len)) { + pr_debug("Error copying data to userspace\n"); + err = -EFAULT; + break; + } + dev->prev_data = NULL; + + off += len; + } + + return err; +} + static int end_opal_session(struct opal_dev *dev, void *data) { int err = 0; @@ -2453,6 +2560,68 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) } EXPORT_SYMBOL(opal_unlock_from_suspend); +static int opal_read_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + const struct opal_step read_table_steps[] = { + { start_admin1LSP_opal_session, &rw_tbl->key }, + { read_table_data, rw_tbl }, + { end_opal_session, } + }; + int ret = 0; + + if (!rw_tbl->size) + return ret; + + return execute_steps(dev, read_table_steps, + ARRAY_SIZE(read_table_steps)); +} + +static int opal_write_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + const struct opal_step write_table_steps[] = { + { start_admin1LSP_opal_session, &rw_tbl->key }, + { write_table_data, rw_tbl }, + { end_opal_session, } + }; + int ret = 0; + + if (!rw_tbl->size) + return ret; + + return execute_steps(dev, write_table_steps, + ARRAY_SIZE(write_table_steps)); +} + +static int opal_generic_read_write_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + int ret, bit_set; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + + bit_set = fls64(rw_tbl->flags) - 1; + switch (bit_set) { + case OPAL_READ_TABLE: + ret = opal_read_table(dev, rw_tbl); + break; + case OPAL_WRITE_TABLE: + ret = opal_write_table(dev, rw_tbl); + break; + default: + pr_debug("Invalid bit set in the flag (%016llx).\n", + rw_tbl->flags); + ret = -EINVAL; + break; + } + + mutex_unlock(&dev->dev_lock); + + return ret; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -2515,6 +2684,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_PSID_REVERT_TPR: ret = opal_reverttper(dev, p, true); break; + case IOC_OPAL_GENERIC_TABLE_RW: + ret = opal_generic_read_write_table(dev, p); + break; default: break; } diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 53c28d750a45..1ac0d712a9c3 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -42,6 +42,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_PSID_REVERT_TPR: case IOC_OPAL_MBR_DONE: case IOC_OPAL_WRITE_SHADOW_MBR: + case IOC_OPAL_GENERIC_TABLE_RW: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index c6d035fa1b6c..6f5af1a84213 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -113,6 +113,25 @@ struct opal_shadow_mbr { __u64 size; }; +/* Opal table operations */ +enum opal_table_ops { + OPAL_READ_TABLE, + OPAL_WRITE_TABLE, +}; + +#define OPAL_UID_LENGTH 8 +struct opal_read_write_table { + struct opal_key key; + const __u64 data; + const __u8 table_uid[OPAL_UID_LENGTH]; + __u64 offset; + __u64 size; +#define OPAL_TABLE_READ (1 << OPAL_READ_TABLE) +#define OPAL_TABLE_WRITE (1 << OPAL_WRITE_TABLE) + __u64 flags; + __u64 priv; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -128,5 +147,6 @@ struct opal_shadow_mbr { #define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key) #define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) #define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) +#define IOC_OPAL_GENERIC_TABLE_RW _IOW('p', 235, struct opal_read_write_table) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3-59-g8ed1b From 62c441c6ae054a0f9ff2944908ed09603b035fd3 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 31 Oct 2019 10:13:22 -0600 Subject: block: sed-opal: Introduce Opal Datastore UID This patch introduces Opal Datastore UID. The generic read/write table ioctl can use this UID to access the Opal Datastore. Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + block/sed-opal.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'block') diff --git a/block/opal_proto.h b/block/opal_proto.h index 710911864e1d..736e67c3e7c5 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -107,6 +107,7 @@ enum opal_uid { OPAL_C_PIN_TABLE, OPAL_LOCKING_INFO_TABLE, OPAL_ENTERPRISE_LOCKING_INFO_TABLE, + OPAL_DATASTORE, /* C_PIN_TABLE object ID's */ OPAL_C_PIN_MSID, OPAL_C_PIN_SID, diff --git a/block/sed-opal.c b/block/sed-opal.c index 1d5afaaf0826..b2cacc9ddd11 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -149,6 +149,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, [OPAL_ENTERPRISE_LOCKING_INFO_TABLE] = { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_DATASTORE] = + { 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, 0x00, 0x00 }, /* C_PIN_TABLE object ID's */ [OPAL_C_PIN_MSID] = -- cgit v1.2.3-59-g8ed1b From d2c9be89f8ebe7ebcc97676ac40f8dec1cf9b43a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 4 Nov 2019 16:26:53 +0800 Subject: blk-mq: make sure that line break can be printed 8962842ca5ab ("blk-mq: avoid sysfs buffer overflow with too many CPU cores") avoids sysfs buffer overflow, and reserves one character for line break. However, the last snprintf() doesn't get correct 'size' parameter passed in, so fixed it. Fixes: 8962842ca5ab ("blk-mq: avoid sysfs buffer overflow with too many CPU cores") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 4caa56d1bc85..062229395a50 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -175,7 +175,7 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) pos += ret; } - ret = snprintf(pos + page, size - pos, "\n"); + ret = snprintf(pos + page, size + 1 - pos, "\n"); return pos + ret; } -- cgit v1.2.3-59-g8ed1b From fa53228721876515adabc7bc74368490bd97aa3b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2019 10:05:43 -0800 Subject: block: avoid blk_bio_segment_split for small I/O operations __blk_queue_split() adds significant overhead for small I/O operations. Add a shortcut to avoid it for cases where we know we never need to split. Based on a patch from Ming Lei. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 48e6725b32ee..f22cb6251d06 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -293,7 +293,7 @@ split: void __blk_queue_split(struct request_queue *q, struct bio **bio, unsigned int *nr_segs) { - struct bio *split; + struct bio *split = NULL; switch (bio_op(*bio)) { case REQ_OP_DISCARD: @@ -309,6 +309,20 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, nr_segs); break; default: + /* + * All drivers must accept single-segments bios that are <= + * PAGE_SIZE. This is a quick and dirty check that relies on + * the fact that bi_io_vec[0] is always valid if a bio has data. + * The check might lead to occasional false negatives when bios + * are cloned, but compared to the performance impact of cloned + * bios themselves the loop below doesn't matter anyway. + */ + if (!q->limits.chunk_sectors && + (*bio)->bi_vcnt == 1 && + (*bio)->bi_io_vec[0].bv_len <= PAGE_SIZE) { + *nr_segs = 1; + break; + } split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); break; } -- cgit v1.2.3-59-g8ed1b From f8db383507d658c5a729b062c97710efda876cd4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Nov 2019 11:48:57 +0100 Subject: block: Warn if elevator= parameter is used With transition to blk-mq, the elevator= kernel argument was removed as it makes less and less sense with the current variety of devices. Since this may surprise some users and there are advices on the Internet that still suggest to use it, let's at least warn if the parameter is used. Reviewed-by: Jeff Moyer Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/elevator.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 5437059c9261..0b1db9afb586 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -831,3 +831,12 @@ struct request *elv_rb_latter_request(struct request_queue *q, return NULL; } EXPORT_SYMBOL(elv_rb_latter_request); + +static int __init elevator_setup(char *str) +{ + pr_warn("Kernel parameter elevator= does not have any effect anymore.\n" + "Please use sysfs to set IO scheduler for individual devices.\n"); + return 1; +} + +__setup("elevator=", elevator_setup); -- cgit v1.2.3-59-g8ed1b From a84324d2ed05c06b46fb6079d39a0736bde6e16a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 27 Oct 2019 23:05:42 +0900 Subject: block: Remove REQ_OP_ZONE_RESET plugging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit REQ_OP_ZONE_RESET operations cannot be merged as these bios and requests do not have a size and are never sequential due to the zone start sector position required for their execution. As a result, there is no point in using a plug around blkdev_reset_zones() bio issuing loop. This patch removes this unnecessary plugging. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-zoned.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 4bc5f260248a..7fe376eede86 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -258,7 +258,6 @@ int blkdev_reset_zones(struct block_device *bdev, sector_t zone_sectors; sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; - struct blk_plug plug; int ret; if (!blk_queue_is_zoned(q)) @@ -283,7 +282,6 @@ int blkdev_reset_zones(struct block_device *bdev, end_sector != bdev->bd_part->nr_sects) return -EINVAL; - blk_start_plug(&plug); while (sector < end_sector) { bio = blk_next_bio(bio, 0, gfp_mask); @@ -301,8 +299,6 @@ int blkdev_reset_zones(struct block_device *bdev, ret = submit_bio_wait(bio); bio_put(bio); - blk_finish_plug(&plug); - return ret; } EXPORT_SYMBOL_GPL(blkdev_reset_zones); -- cgit v1.2.3-59-g8ed1b From c7a1d926dc4076aadad187614500afcd8de78818 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 27 Oct 2019 23:05:43 +0900 Subject: block: Simplify REQ_OP_ZONE_RESET_ALL handling There is no need for the function __blkdev_reset_all_zones() as REQ_OP_ZONE_RESET_ALL can be handled directly in blkdev_reset_zones() bio loop with an early break from the loop. This patch removes this function and modifies blkdev_reset_zones(), simplifying the code. Reviewed-by: Christoph Hellwig Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-zoned.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 7fe376eede86..14785011e798 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -202,32 +202,14 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(blkdev_report_zones); -/* - * Special case of zone reset operation to reset all zones in one command, - * useful for applications like mkfs. - */ -static int __blkdev_reset_all_zones(struct block_device *bdev, gfp_t gfp_mask) -{ - struct bio *bio = bio_alloc(gfp_mask, 0); - int ret; - - /* across the zones operations, don't need any sectors */ - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_ZONE_RESET_ALL, 0); - - ret = submit_bio_wait(bio); - bio_put(bio); - - return ret; -} - static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, + sector_t sector, sector_t nr_sectors) { if (!blk_queue_zone_resetall(bdev_get_queue(bdev))) return false; - if (nr_sectors != part_nr_sects_read(bdev->bd_part)) + if (sector || nr_sectors != part_nr_sects_read(bdev->bd_part)) return false; /* * REQ_OP_ZONE_RESET_ALL can be executed only if the block device is @@ -270,9 +252,6 @@ int blkdev_reset_zones(struct block_device *bdev, /* Out of range */ return -EINVAL; - if (blkdev_allow_reset_all_zones(bdev, nr_sectors)) - return __blkdev_reset_all_zones(bdev, gfp_mask); - /* Check alignment (handle eventual smaller last zone) */ zone_sectors = blk_queue_zone_sectors(q); if (sector & (zone_sectors - 1)) @@ -283,17 +262,24 @@ int blkdev_reset_zones(struct block_device *bdev, return -EINVAL; while (sector < end_sector) { - bio = blk_next_bio(bio, 0, gfp_mask); - bio->bi_iter.bi_sector = sector; bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); + /* + * Special case for the zone reset operation that reset all + * zones, this is useful for applications like mkfs. + */ + if (blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { + bio->bi_opf = REQ_OP_ZONE_RESET_ALL; + break; + } + + bio->bi_opf = REQ_OP_ZONE_RESET; + bio->bi_iter.bi_sector = sector; sector += zone_sectors; /* This may take a while, so be nice to others */ cond_resched(); - } ret = submit_bio_wait(bio); -- cgit v1.2.3-59-g8ed1b From 6c1b1da58f8c7a697a88ae35afeba196fc7b701e Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Sun, 27 Oct 2019 23:05:45 +0900 Subject: block: add zone open, close and finish operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zoned block devices (ZBC and ZAC devices) allow an explicit control over the condition (state) of zones. The operations allowed are: * Open a zone: Transition to open condition to indicate that a zone will actively be written * Close a zone: Transition to closed condition to release the drive resources used for writing to a zone * Finish a zone: Transition an open or closed zone to the full condition to prevent write operations To enable this control for in-kernel zoned block device users, define the new request operations REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH as well as the generic function blkdev_zone_mgmt() for submitting these operations on a range of zones. This results in blkdev_reset_zones() removal and replacement with this new zone magement function. Users of blkdev_reset_zones() (f2fs and dm-zoned) are updated accordingly. Contains contributions from Matias Bjorling, Hans Holmberg, Dmitry Fomichev, Keith Busch, Damien Le Moal and Christoph Hellwig. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Keith Busch Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-core.c | 12 +++++++++--- block/blk-zoned.c | 35 +++++++++++++++++++++-------------- drivers/md/dm-zoned-metadata.c | 6 +++--- fs/f2fs/segment.c | 3 ++- include/linux/blk_types.h | 25 +++++++++++++++++++++++++ include/linux/blkdev.h | 5 +++-- 6 files changed, 63 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index ae506ac2dd48..f0d82227a2fc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -132,6 +132,9 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), REQ_OP_NAME(ZONE_RESET_ALL), + REQ_OP_NAME(ZONE_OPEN), + REQ_OP_NAME(ZONE_CLOSE), + REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(SCSI_IN), @@ -849,10 +852,10 @@ static inline int blk_partition_remap(struct bio *bio) goto out; /* - * Zone reset does not include bi_size so bio_sectors() is always 0. - * Include a test for the reset op code and perform the remap if needed. + * Zone management bios do not have a sector count but they do have + * a start sector filled out and need to be remapped. */ - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) { + if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) { if (bio_check_eod(bio, part_nr_sects_read(p))) goto out; bio->bi_iter.bi_sector += p->start_sect; @@ -936,6 +939,9 @@ generic_make_request_checks(struct bio *bio) goto not_supported; break; case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: if (!blk_queue_is_zoned(q)) goto not_supported; break; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 14785011e798..dab34dc48fb6 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -221,23 +221,27 @@ static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, } /** - * blkdev_reset_zones - Reset zones write pointer + * blkdev_zone_mgmt - Execute a zone management operation on a range of zones * @bdev: Target block device - * @sector: Start sector of the first zone to reset - * @nr_sectors: Number of sectors, at least the length of one zone + * @op: Operation to be performed on the zones + * @sector: Start sector of the first zone to operate on + * @nr_sectors: Number of sectors, should be at least the length of one zone and + * must be zone size aligned. * @gfp_mask: Memory allocation flags (for bio_alloc) * * Description: - * Reset the write pointer of the zones contained in the range + * Perform the specified operation on the range of zones specified by * @sector..@sector+@nr_sectors. Specifying the entire disk sector range * is valid, but the specified range should not contain conventional zones. + * The operation to execute on each zone can be a zone reset, open, close + * or finish request. */ -int blkdev_reset_zones(struct block_device *bdev, - sector_t sector, sector_t nr_sectors, - gfp_t gfp_mask) +int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, + sector_t sector, sector_t nr_sectors, + gfp_t gfp_mask) { struct request_queue *q = bdev_get_queue(bdev); - sector_t zone_sectors; + sector_t zone_sectors = blk_queue_zone_sectors(q); sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; int ret; @@ -248,12 +252,14 @@ int blkdev_reset_zones(struct block_device *bdev, if (bdev_read_only(bdev)) return -EPERM; + if (!op_is_zone_mgmt(op)) + return -EOPNOTSUPP; + if (!nr_sectors || end_sector > bdev->bd_part->nr_sects) /* Out of range */ return -EINVAL; /* Check alignment (handle eventual smaller last zone) */ - zone_sectors = blk_queue_zone_sectors(q); if (sector & (zone_sectors - 1)) return -EINVAL; @@ -269,12 +275,13 @@ int blkdev_reset_zones(struct block_device *bdev, * Special case for the zone reset operation that reset all * zones, this is useful for applications like mkfs. */ - if (blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { + if (op == REQ_OP_ZONE_RESET && + blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { bio->bi_opf = REQ_OP_ZONE_RESET_ALL; break; } - bio->bi_opf = REQ_OP_ZONE_RESET; + bio->bi_opf = op; bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -287,7 +294,7 @@ int blkdev_reset_zones(struct block_device *bdev, return ret; } -EXPORT_SYMBOL_GPL(blkdev_reset_zones); +EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); /* * BLKREPORTZONE ioctl processing. @@ -379,8 +386,8 @@ int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) return -EFAULT; - return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors, - GFP_KERNEL); + return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + zrange.sector, zrange.nr_sectors, GFP_KERNEL); } static inline unsigned long *blk_alloc_zone_bitmap(int node, diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 595a73110e17..feb4718ce6a6 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1312,9 +1312,9 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { struct dmz_dev *dev = zmd->dev; - ret = blkdev_reset_zones(dev->bdev, - dmz_start_sect(zmd, zone), - dev->zone_nr_sectors, GFP_NOIO); + ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, + dmz_start_sect(zmd, zone), + dev->zone_nr_sectors, GFP_NOIO); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", dmz_id(zmd, zone), ret); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 808709581481..2c997f94a3b2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1771,7 +1771,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, return -EIO; } trace_f2fs_issue_reset_zone(bdev, blkstart); - return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); + return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sector, nr_sects, GFP_NOFS); } /* For conventional zones, use regular discard if supported */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1e7eeec16458..23a2fd534817 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -290,6 +290,12 @@ enum req_opf { REQ_OP_ZONE_RESET_ALL = 8, /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = 9, + /* Open a zone */ + REQ_OP_ZONE_OPEN = 10, + /* Close a zone */ + REQ_OP_ZONE_CLOSE = 11, + /* Transition a zone to full */ + REQ_OP_ZONE_FINISH = 12, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, @@ -417,6 +423,25 @@ static inline bool op_is_discard(unsigned int op) return (op & REQ_OP_MASK) == REQ_OP_DISCARD; } +/* + * Check if a bio or request operation is a zone management operation, with + * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case + * due to its different handling in the block layer and device response in + * case of command failure. + */ +static inline bool op_is_zone_mgmt(enum req_opf op) +{ + switch (op & REQ_OP_MASK) { + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + return true; + default: + return false; + } +} + static inline int op_stat_group(unsigned int op) { if (op_is_discard(op)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d4051acb92a1..9cfafff86f66 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -360,8 +360,9 @@ extern unsigned int blkdev_nr_zones(struct block_device *bdev); extern int blkdev_report_zones(struct block_device *bdev, sector_t sector, struct blk_zone *zones, unsigned int *nr_zones); -extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, - sector_t nr_sectors, gfp_t gfp_mask); +extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, + sector_t sectors, sector_t nr_sectors, + gfp_t gfp_mask); extern int blk_revalidate_disk_zones(struct gendisk *disk); extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, -- cgit v1.2.3-59-g8ed1b From e876df1fe0ad1b191284ee6ed2db7960bd322d00 Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Sun, 27 Oct 2019 23:05:46 +0900 Subject: block: add zone open, close and finish ioctl support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce three new ioctl commands BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE to allow applications to control the condition of zones on a zoned block device through the execution of the REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH operations. Contains contributions from Matias Bjorling, Hans Holmberg, Dmitry Fomichev, Keith Busch, Damien Le Moal and Christoph Hellwig. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Keith Busch Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-zoned.c | 28 +++++++++++++++++++++++----- block/ioctl.c | 5 ++++- include/linux/blkdev.h | 10 +++++----- include/uapi/linux/blkzoned.h | 17 ++++++++++++++--- 4 files changed, 46 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index dab34dc48fb6..481eaf7d04d4 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -357,15 +357,16 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, } /* - * BLKRESETZONE ioctl processing. + * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. * Called from blkdev_ioctl. */ -int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct request_queue *q; struct blk_zone_range zrange; + enum req_opf op; if (!argp) return -EINVAL; @@ -386,8 +387,25 @@ int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) return -EFAULT; - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - zrange.sector, zrange.nr_sectors, GFP_KERNEL); + switch (cmd) { + case BLKRESETZONE: + op = REQ_OP_ZONE_RESET; + break; + case BLKOPENZONE: + op = REQ_OP_ZONE_OPEN; + break; + case BLKCLOSEZONE: + op = REQ_OP_ZONE_CLOSE; + break; + case BLKFINISHZONE: + op = REQ_OP_ZONE_FINISH; + break; + default: + return -ENOTTY; + } + + return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); } static inline unsigned long *blk_alloc_zone_bitmap(int node, diff --git a/block/ioctl.c b/block/ioctl.c index 15a0eb80ada9..8756efb1419e 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -532,7 +532,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); case BLKRESETZONE: - return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg); + case BLKOPENZONE: + case BLKCLOSEZONE: + case BLKFINISHZONE: + return blkdev_zone_mgmt_ioctl(bdev, mode, cmd, arg); case BLKGETZONESZ: return put_uint(arg, bdev_zone_sectors(bdev)); case BLKGETNRZONES: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9cfafff86f66..6a4f7abbdcf7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -367,8 +367,8 @@ extern int blk_revalidate_disk_zones(struct gendisk *disk); extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); -extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); +extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ @@ -389,9 +389,9 @@ static inline int blkdev_report_zones_ioctl(struct block_device *bdev, return -ENOTTY; } -static inline int blkdev_reset_zones_ioctl(struct block_device *bdev, - fmode_t mode, unsigned int cmd, - unsigned long arg) +static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) { return -ENOTTY; } diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 498eec813494..0cdef67135f0 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -120,9 +120,11 @@ struct blk_zone_report { }; /** - * struct blk_zone_range - BLKRESETZONE ioctl request - * @sector: starting sector of the first zone to issue reset write pointer - * @nr_sectors: Total number of sectors of 1 or more zones to reset + * struct blk_zone_range - BLKRESETZONE/BLKOPENZONE/ + * BLKCLOSEZONE/BLKFINISHZONE ioctl + * requests + * @sector: Starting sector of the first zone to operate on. + * @nr_sectors: Total number of sectors of all zones to operate on. */ struct blk_zone_range { __u64 sector; @@ -139,10 +141,19 @@ struct blk_zone_range { * sector range. The sector range must be zone aligned. * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors. * @BLKGETNRZONES: Get the total number of zones of the device. + * @BLKOPENZONE: Open the zones in the specified sector range. + * The 512 B sector range must be zone aligned. + * @BLKCLOSEZONE: Close the zones in the specified sector range. + * The 512 B sector range must be zone aligned. + * @BLKFINISHZONE: Mark the zones as full in the specified sector range. + * The 512 B sector range must be zone aligned. */ #define BLKREPORTZONE _IOWR(0x12, 130, struct blk_zone_report) #define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range) #define BLKGETZONESZ _IOR(0x12, 132, __u32) #define BLKGETNRZONES _IOR(0x12, 133, __u32) +#define BLKOPENZONE _IOW(0x12, 134, struct blk_zone_range) +#define BLKCLOSEZONE _IOW(0x12, 135, struct blk_zone_range) +#define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range) #endif /* _UAPI_BLKZONED_H */ -- cgit v1.2.3-59-g8ed1b From a557f1c7fee2f2059234647fea32ed1f3c07dce2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:17:59 -0800 Subject: bfq-iosched: relocate bfqg_*rwstat*() helpers Collect them right under #ifdef CONFIG_BFQ_CGROUP_DEBUG. The next patch will use them from !DEBUG path and this makes it easy to move them out of the ifdef block. This is pure code reorganization. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 86a607cf19a1..d4755d4ad009 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -1058,17 +1058,34 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, } #ifdef CONFIG_BFQ_CGROUP_DEBUG -static int bfqg_print_stat(struct seq_file *sf, void *v) +static int bfqg_print_rwstat(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - &blkcg_policy_bfq, seq_cft(sf)->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_bfq, seq_cft(sf)->private, true); return 0; } -static int bfqg_print_rwstat(struct seq_file *sf, void *v) +static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_bfq, seq_cft(sf)->private, true); + struct blkg_rwstat_sample sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, true); + return 0; +} + +static int bfqg_print_stat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_bfq, seq_cft(sf)->private, false); return 0; } @@ -1097,15 +1114,6 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, sum); } -static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat_sample sum; - - blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); - return __blkg_prfill_rwstat(sf, pd, &sum); -} - static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), @@ -1114,14 +1122,6 @@ static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) return 0; } -static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, true); - return 0; -} - static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, int off) { -- cgit v1.2.3-59-g8ed1b From fd41e60331b13b8fb35cc5048185a46de98db77c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:00 -0800 Subject: bfq-iosched: stop using blkg->stat_bytes and ->stat_ios When used on cgroup1, bfq uses the blkg->stat_bytes and ->stat_ios from blk-cgroup core to populate six stat knobs. blk-cgroup core is moving away from blkg_rwstat to improve scalability and won't be able to support this usage. It isn't like the sharing gains all that much. Let's break it out to dedicated rwstat counters which are updated when on cgroup1. This makes use of bfqg_*rwstat*() helpers outside of CONFIG_BFQ_CGROUP_DEBUG. Move them out. v2: Compile fix when !CONFIG_BFQ_CGROUP_DEBUG. Signed-off-by: Tejun Heo Cc: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 39 +++++++++++++++++++++++++++------------ block/bfq-iosched.c | 4 ++++ block/bfq-iosched.h | 4 ++++ 3 files changed, 35 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index d4755d4ad009..cea0ae12f937 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -347,6 +347,14 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg) bfqg_put(bfqg); } +void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) +{ + struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); + + blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); + blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); +} + /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { @@ -431,6 +439,8 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) static void bfqg_stats_exit(struct bfqg_stats *stats) { + blkg_rwstat_exit(&stats->bytes); + blkg_rwstat_exit(&stats->ios); #ifdef CONFIG_BFQ_CGROUP_DEBUG blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); @@ -448,6 +458,10 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { + if (blkg_rwstat_init(&stats->bytes, gfp) || + blkg_rwstat_init(&stats->ios, gfp)) + return -ENOMEM; + #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || @@ -1057,7 +1071,6 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, return bfq_io_set_device_weight(of, buf, nbytes, off); } -#ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfqg_print_rwstat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, @@ -1082,6 +1095,7 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +#ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfqg_print_stat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, @@ -1125,7 +1139,8 @@ static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg); + u64 sum = blkg_rwstat_total(&bfqg->stats.bytes); return __blkg_prfill_u64(sf, pd, sum >> 9); } @@ -1142,8 +1157,8 @@ static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, { struct blkg_rwstat_sample tmp; - blkg_rwstat_recursive_sum(pd->blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes), &tmp); + blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq, + offsetof(struct bfq_group, stats.bytes), &tmp); return __blkg_prfill_u64(sf, pd, (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9); @@ -1226,13 +1241,13 @@ struct cftype bfq_blkcg_legacy_files[] = { /* statistics, covers only the tasks in the bfqg */ { .name = "bfq.io_service_bytes", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, + .private = offsetof(struct bfq_group, stats.bytes), + .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_serviced", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, + .private = offsetof(struct bfq_group, stats.ios), + .seq_show = bfqg_print_rwstat, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { @@ -1269,13 +1284,13 @@ struct cftype bfq_blkcg_legacy_files[] = { /* the same statistics which cover the bfqg and its descendants */ { .name = "bfq.io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, + .private = offsetof(struct bfq_group, stats.bytes), + .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, + .private = offsetof(struct bfq_group, stats.ios), + .seq_show = bfqg_print_rwstat_recursive, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0319d6339822..41d2d83c919b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5464,6 +5464,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool idle_timer_disabled = false; unsigned int cmd_flags; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) + bfqg_stats_update_legacy_io(q, rq); +#endif spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { spin_unlock_irq(&bfqd->lock); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 5d1a519640f6..2676c06218f1 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -809,6 +809,9 @@ struct bfq_stat { }; struct bfqg_stats { + /* basic stats */ + struct blkg_rwstat bytes; + struct blkg_rwstat ios; #ifdef CONFIG_BFQ_CGROUP_DEBUG /* number of ios merged */ struct blkg_rwstat merged; @@ -956,6 +959,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); /* ---------------- cgroups-support interface ---------------- */ +void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, unsigned int op); void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op); -- cgit v1.2.3-59-g8ed1b From 7ca464383aecef5c2e32b987030187ee1e4848fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:01 -0800 Subject: blk-throtl: stop using blkg->stat_bytes and ->stat_ios When used on cgroup1, blk-throtl uses the blkg->stat_bytes and ->stat_ios from blk-cgroup core to populate four stat knobs. blk-cgroup core is moving away from blkg_rwstat to improve scalability and won't be able to support this usage. It isn't like the sharing gains all that much. Let's break them out to dedicated rwstat counters which are updated when on cgroup1. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 70 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 18f773e52dfb..2d0fc73d9781 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -176,6 +176,9 @@ struct throtl_grp { unsigned int bio_cnt; /* total bios */ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; + + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; }; /* We measure latency for request size from <= 4k to >= 1M */ @@ -489,6 +492,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, if (!tg) return NULL; + if (blkg_rwstat_init(&tg->stat_bytes, gfp)) + goto err_free_tg; + + if (blkg_rwstat_init(&tg->stat_ios, gfp)) + goto err_exit_stat_bytes; + throtl_service_queue_init(&tg->service_queue); for (rw = READ; rw <= WRITE; rw++) { @@ -513,6 +522,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; return &tg->pd; + +err_exit_stat_bytes: + blkg_rwstat_exit(&tg->stat_bytes); +err_free_tg: + kfree(tg); + return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) @@ -611,6 +626,8 @@ static void throtl_pd_free(struct blkg_policy_data *pd) struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); + blkg_rwstat_exit(&tg->stat_bytes); + blkg_rwstat_exit(&tg->stat_ios); kfree(tg); } @@ -1464,6 +1481,32 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } +static int tg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + +static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat_sample sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, + &sum); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + tg_prfill_rwstat_recursive, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", @@ -1491,23 +1534,23 @@ static struct cftype throtl_legacy_files[] = { }, { .name = "throttle.io_service_bytes", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_bytes, + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_bytes_recursive, + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat_recursive, }, { .name = "throttle.io_serviced", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_ios, + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_ios_recursive, + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat_recursive, }, { } /* terminate */ }; @@ -2127,7 +2170,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, WARN_ON_ONCE(!rcu_read_lock_held()); /* see throtl_charge_bio() */ - if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) + if (bio_flagged(bio, BIO_THROTTLED)) + goto out; + + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); + } + + if (!tg->has_rules[rw]) goto out; spin_lock_irq(&q->queue_lock); -- cgit v1.2.3-59-g8ed1b From 8a80d5d6638b7d58480a83aef49d587de63d4cbb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:02 -0800 Subject: blk-cgroup: remove now unused blkg_print_stat_{bytes|ios}_recursive() These don't have users anymore. Remove them. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 83 ---------------------------------------------- include/linux/blk-cgroup.h | 5 --- 2 files changed, 88 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1eb8895be4c6..e7e93377e320 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -615,89 +615,6 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); -static u64 blkg_prfill_rwstat_field(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat_sample rwstat = { }; - - blkg_rwstat_read((void *)pd->blkg + off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -/** - * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes - * @sf: seq_file to print to - * @v: unused - * - * To be used as cftype->seq_show to print blkg->stat_bytes. - * cftype->private must be set to the blkcg_policy. - */ -int blkg_print_stat_bytes(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_bytes), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); - -/** - * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios - * @sf: seq_file to print to - * @v: unused - * - * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private - * must be set to the blkcg_policy. - */ -int blkg_print_stat_ios(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_ios), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_ios); - -static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat_sample rwstat; - - blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -/** - * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes - * @sf: seq_file to print to - * @v: unused - */ -int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field_recursive, - (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_bytes), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); - -/** - * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios - * @sf: seq_file to print to - * @v: unused - */ -int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field_recursive, - (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_ios), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); - /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat * @blkg: blkg of interest diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index bed9e43f9426..914ce55fa8c2 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -220,11 +220,6 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, const struct blkg_rwstat_sample *rwstat); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -int blkg_print_stat_bytes(struct seq_file *sf, void *v); -int blkg_print_stat_ios(struct seq_file *sf, void *v); -int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); -int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); - void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off, struct blkg_rwstat_sample *sum); -- cgit v1.2.3-59-g8ed1b From f73316482977ac401ac37245c9df48079d4e11f3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:03 -0800 Subject: blk-cgroup: reimplement basic IO stats using cgroup rstat blk-cgroup has been using blkg_rwstat to track basic IO stats. Unfortunately, reading recursive stats scales badly as itinvolves walking all descendants. On systems with a huge number of cgroups (dead or alive), this can lead to substantial CPU cost when reading IO stats. This patch reimplements basic IO stats using cgroup rstat which uses more memory but makes recursive stat reading O(# descendants which have been active since last reading) instead of O(# descendants). * blk-cgroup core no longer uses sync/async stats. Introduce new stat enums - BLKG_IOSTAT_{READ|WRITE|DISCARD}. * Add blkg_iostat[_set] which encapsulates byte and io stats, last values for propagation delta calculation and u64_stats_sync for correctness on 32bit archs. * Update the new percpu stat counters directly and implement blkcg_rstat_flush() to implement propagation. * blkg_print_stat() can now bring the stats up to date by calling cgroup_rstat_flush() and print them instead of directly summing up all descendants. * It now allocates 96 bytes per cpu. It used to be 40 bytes. Signed-off-by: Tejun Heo Cc: Dan Schatzberg Cc: Daniel Xu Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 124 ++++++++++++++++++++++++++++++++++++--------- include/linux/blk-cgroup.h | 48 ++++++++++++++++-- 2 files changed, 142 insertions(+), 30 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e7e93377e320..b3429be62057 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -80,8 +80,7 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - blkg_rwstat_exit(&blkg->stat_ios); - blkg_rwstat_exit(&blkg->stat_bytes); + free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); } @@ -146,7 +145,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, gfp_t gfp_mask) { struct blkcg_gq *blkg; - int i; + int i, cpu; /* alloc and init base part */ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); @@ -156,8 +155,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) goto err_free; - if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || - blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); + if (!blkg->iostat_cpu) goto err_free; blkg->q = q; @@ -167,6 +166,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); blkg->blkcg = blkcg; + u64_stats_init(&blkg->iostat.sync); + for_each_possible_cpu(cpu) + u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; @@ -393,7 +396,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; - struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(&blkg->q->queue_lock); @@ -410,11 +412,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) pol->pd_offline_fn(blkg->pd[i]); } - if (parent) { - blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); - blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); - } - blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -464,7 +461,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; - int i; + int i, cpu; mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); @@ -475,8 +472,12 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - blkg_rwstat_reset(&blkg->stat_bytes); - blkg_rwstat_reset(&blkg->stat_ios); + for_each_possible_cpu(cpu) { + struct blkg_iostat_set *bis = + per_cpu_ptr(blkg->iostat_cpu, cpu); + memset(bis, 0, sizeof(*bis)); + } + memset(&blkg->iostat, 0, sizeof(blkg->iostat)); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -840,16 +841,18 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; + cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkg_iostat_set *bis = &blkg->iostat; const char *dname; char *buf; - struct blkg_rwstat_sample rwstat; u64 rbytes, wbytes, rios, wios, dbytes, dios; size_t size = seq_get_buf(sf, &buf), off = 0; int i; bool has_stats = false; + unsigned seq; spin_lock_irq(&blkg->q->queue_lock); @@ -868,17 +871,16 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) */ off += scnprintf(buf+off, size-off, "%s ", dname); - blkg_rwstat_recursive_sum(blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes), &rwstat); - rbytes = rwstat.cnt[BLKG_RWSTAT_READ]; - wbytes = rwstat.cnt[BLKG_RWSTAT_WRITE]; - dbytes = rwstat.cnt[BLKG_RWSTAT_DISCARD]; + do { + seq = u64_stats_fetch_begin(&bis->sync); - blkg_rwstat_recursive_sum(blkg, NULL, - offsetof(struct blkcg_gq, stat_ios), &rwstat); - rios = rwstat.cnt[BLKG_RWSTAT_READ]; - wios = rwstat.cnt[BLKG_RWSTAT_WRITE]; - dios = rwstat.cnt[BLKG_RWSTAT_DISCARD]; + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { has_stats = true; @@ -1214,6 +1216,77 @@ static int blkcg_can_attach(struct cgroup_taskset *tset) return ret; } +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] += src->bytes[i]; + dst->ios[i] += src->ios[i]; + } +} + +static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] -= src->bytes[i]; + dst->ios[i] -= src->ios[i]; + } +} + +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkcg_gq *parent = blkg->parent; + struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); + struct blkg_iostat cur, delta; + unsigned seq; + + /* fetch the current per-cpu values */ + do { + seq = u64_stats_fetch_begin(&bisc->sync); + blkg_iostat_set(&cur, &bisc->cur); + } while (u64_stats_fetch_retry(&bisc->sync, seq)); + + /* propagate percpu delta to global */ + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&delta, &cur); + blkg_iostat_sub(&delta, &bisc->last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(&bisc->last, &delta); + u64_stats_update_end(&blkg->iostat.sync); + + /* propagate global delta to parent */ + if (parent) { + u64_stats_update_begin(&parent->iostat.sync); + blkg_iostat_set(&delta, &blkg->iostat.cur); + blkg_iostat_sub(&delta, &blkg->iostat.last); + blkg_iostat_add(&parent->iostat.cur, &delta); + blkg_iostat_add(&blkg->iostat.last, &delta); + u64_stats_update_end(&parent->iostat.sync); + } + } + + rcu_read_unlock(); +} + static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; @@ -1246,6 +1319,7 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, + .css_rstat_flush = blkcg_rstat_flush, .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 914ce55fa8c2..867ab391e409 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -15,7 +15,9 @@ */ #include +#include #include +#include #include #include #include @@ -31,6 +33,14 @@ #ifdef CONFIG_BLK_CGROUP +enum blkg_iostat_type { + BLKG_IOSTAT_READ, + BLKG_IOSTAT_WRITE, + BLKG_IOSTAT_DISCARD, + + BLKG_IOSTAT_NR, +}; + enum blkg_rwstat_type { BLKG_RWSTAT_READ, BLKG_RWSTAT_WRITE, @@ -61,6 +71,17 @@ struct blkcg { #endif }; +struct blkg_iostat { + u64 bytes[BLKG_IOSTAT_NR]; + u64 ios[BLKG_IOSTAT_NR]; +}; + +struct blkg_iostat_set { + struct u64_stats_sync sync; + struct blkg_iostat cur; + struct blkg_iostat last; +}; + /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for * recursive. Used to carry stats of dead children. @@ -127,8 +148,8 @@ struct blkcg_gq { /* is this blkg online? protected by both blkcg and q locks */ bool online; - struct blkg_rwstat stat_bytes; - struct blkg_rwstat stat_ios; + struct blkg_iostat_set __percpu *iostat_cpu; + struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; @@ -740,15 +761,32 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, throtl = blk_throtl_bio(q, blkg, bio); if (!throtl) { + struct blkg_iostat_set *bis; + int rwd, cpu; + + if (op_is_discard(bio->bi_opf)) + rwd = BLKG_IOSTAT_DISCARD; + else if (op_is_write(bio->bi_opf)) + rwd = BLKG_IOSTAT_WRITE; + else + rwd = BLKG_IOSTAT_READ; + + cpu = get_cpu(); + bis = per_cpu_ptr(blkg->iostat_cpu, cpu); + u64_stats_update_begin(&bis->sync); + /* * If the bio is flagged with BIO_QUEUE_ENTERED it means this * is a split bio and we would have already accounted for the * size of the bio. */ if (!bio_flagged(bio, BIO_QUEUE_ENTERED)) - blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); - blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); + bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + bis->cur.ios[rwd]++; + + u64_stats_update_end(&bis->sync); + cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu); + put_cpu(); } blkcg_bio_issue_init(bio); -- cgit v1.2.3-59-g8ed1b From 1d156646e0d8ec390e5d5ac288137df02d4207be Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:04 -0800 Subject: blk-cgroup: separate out blkg_rwstat under CONFIG_BLK_CGROUP_RWSTAT blkg_rwstat is now only used by bfq-iosched and blk-throtl when on cgroup1. Let's move it into its own files and gate it behind a config option. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/Kconfig | 4 ++ block/Kconfig.iosched | 1 + block/Makefile | 1 + block/bfq-iosched.h | 2 + block/blk-cgroup-rwstat.c | 129 ++++++++++++++++++++++++++++++++++++ block/blk-cgroup-rwstat.h | 149 ++++++++++++++++++++++++++++++++++++++++++ block/blk-cgroup.c | 97 --------------------------- block/blk-throttle.c | 1 + include/linux/blk-cgroup.h | 159 --------------------------------------------- 9 files changed, 287 insertions(+), 256 deletions(-) create mode 100644 block/blk-cgroup-rwstat.c create mode 100644 block/blk-cgroup-rwstat.h (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 41c0917ce622..c23094a14a2b 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -32,6 +32,9 @@ config BLK_RQ_ALLOC_TIME config BLK_SCSI_REQUEST bool +config BLK_CGROUP_RWSTAT + bool + config BLK_DEV_BSG bool "Block layer SG support v4" default y @@ -86,6 +89,7 @@ config BLK_DEV_ZONED config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y + select BLK_CGROUP_RWSTAT ---help--- Block layer bio throttling support. It can be used to limit the IO rate to a device. IO rate policies are per cgroup and diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index b89310a022ad..7df14133adc8 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -31,6 +31,7 @@ config IOSCHED_BFQ config BFQ_GROUP_IOSCHED bool "BFQ hierarchical scheduling support" depends on IOSCHED_BFQ && BLK_CGROUP + select BLK_CGROUP_RWSTAT ---help--- Enable hierarchical scheduling in BFQ, using the blkio diff --git a/block/Makefile b/block/Makefile index 9ef57ace90d4..205a5f2fef17 100644 --- a/block/Makefile +++ b/block/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o +obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 2676c06218f1..9c82c1f35716 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -10,6 +10,8 @@ #include #include +#include "blk-cgroup-rwstat.h" + #define BFQ_IOPRIO_CLASSES 3 #define BFQ_CL_IDLE_TIMEOUT (HZ/5) diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c new file mode 100644 index 000000000000..85d5790ac49b --- /dev/null +++ b/block/blk-cgroup-rwstat.c @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. + * Do not use in new code. + */ +#include "blk-cgroup-rwstat.h" + +int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) +{ + int i, ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); + if (ret) { + while (--i >= 0) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); + return ret; + } + atomic64_set(&rwstat->aux_cnt[i], 0); + } + return 0; +} +EXPORT_SYMBOL_GPL(blkg_rwstat_init); + +void blkg_rwstat_exit(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); +} +EXPORT_SYMBOL_GPL(blkg_rwstat_exit); + +/** + * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @rwstat: rwstat to print + * + * Print @rwstat to @sf for the device assocaited with @pd. + */ +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat_sample *rwstat) +{ + static const char *rwstr[] = { + [BLKG_RWSTAT_READ] = "Read", + [BLKG_RWSTAT_WRITE] = "Write", + [BLKG_RWSTAT_SYNC] = "Sync", + [BLKG_RWSTAT_ASYNC] = "Async", + [BLKG_RWSTAT_DISCARD] = "Discard", + }; + const char *dname = blkg_dev_name(pd->blkg); + u64 v; + int i; + + if (!dname) + return 0; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], + rwstat->cnt[i]); + + v = rwstat->cnt[BLKG_RWSTAT_READ] + + rwstat->cnt[BLKG_RWSTAT_WRITE] + + rwstat->cnt[BLKG_RWSTAT_DISCARD]; + seq_printf(sf, "%s Total %llu\n", dname, v); + return v; +} +EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); + +/** + * blkg_prfill_rwstat - prfill callback for blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @off: offset to the blkg_rwstat in @pd + * + * prfill callback for printing a blkg_rwstat. + */ +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat_sample rwstat = { }; + + blkg_rwstat_read((void *)pd + off, &rwstat); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} +EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); + +/** + * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg + * @sum: blkg_rwstat_sample structure containing the results + * + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. + */ +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat_sample *sum) +{ + struct blkcg_gq *pos_blkg; + struct cgroup_subsys_state *pos_css; + unsigned int i; + + lockdep_assert_held(&blkg->q->queue_lock); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; + + if (!pos_blkg->online) + continue; + + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h new file mode 100644 index 000000000000..ee746919c41f --- /dev/null +++ b/block/blk-cgroup-rwstat.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. + * Do not use in new code. + */ +#ifndef _BLK_CGROUP_RWSTAT_H +#define _BLK_CGROUP_RWSTAT_H + +#include + +enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, + BLKG_RWSTAT_DISCARD, + + BLKG_RWSTAT_NR, + BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, +}; + +/* + * blkg_[rw]stat->aux_cnt is excluded for local stats but included for + * recursive. Used to carry stats of dead children. + */ +struct blkg_rwstat { + struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; + atomic64_t aux_cnt[BLKG_RWSTAT_NR]; +}; + +struct blkg_rwstat_sample { + u64 cnt[BLKG_RWSTAT_NR]; +}; + +static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, + unsigned int idx) +{ + return atomic64_read(&rwstat->aux_cnt[idx]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); +} + +int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp); +void blkg_rwstat_exit(struct blkg_rwstat *rwstat); +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat_sample *rwstat); +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat_sample *sum); + + +/** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @op: REQ_OP and flags + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. + */ +static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + unsigned int op, uint64_t val) +{ + struct percpu_counter *cnt; + + if (op_is_discard(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; + else if (op_is_write(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; + + percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); + + if (op_is_sync(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; + + percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); +} + +/** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it in the aux counts. + */ +static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, + struct blkg_rwstat_sample *result) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + result->cnt[i] = + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); +} + +/** + * blkg_rwstat_total - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. + */ +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) +{ + struct blkg_rwstat_sample tmp = { }; + + blkg_rwstat_read(rwstat, &tmp); + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; +} + +/** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ +static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + percpu_counter_set(&rwstat->cpu_cnt[i], 0); + atomic64_set(&rwstat->aux_cnt[i], 0); + } +} + +/** + * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's count including the aux one to @to's aux count. + */ +static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + u64 sum[BLKG_RWSTAT_NR]; + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), + &to->aux_cnt[i]); +} +#endif /* _BLK_CGROUP_RWSTAT_H */ diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b3429be62057..708dea92dac8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -561,103 +561,6 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -/** - * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @rwstat: rwstat to print - * - * Print @rwstat to @sf for the device assocaited with @pd. - */ -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat_sample *rwstat) -{ - static const char *rwstr[] = { - [BLKG_RWSTAT_READ] = "Read", - [BLKG_RWSTAT_WRITE] = "Write", - [BLKG_RWSTAT_SYNC] = "Sync", - [BLKG_RWSTAT_ASYNC] = "Async", - [BLKG_RWSTAT_DISCARD] = "Discard", - }; - const char *dname = blkg_dev_name(pd->blkg); - u64 v; - int i; - - if (!dname) - return 0; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - rwstat->cnt[i]); - - v = rwstat->cnt[BLKG_RWSTAT_READ] + - rwstat->cnt[BLKG_RWSTAT_WRITE] + - rwstat->cnt[BLKG_RWSTAT_DISCARD]; - seq_printf(sf, "%s Total %llu\n", dname, v); - return v; -} -EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); - -/** - * blkg_prfill_rwstat - prfill callback for blkg_rwstat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @off: offset to the blkg_rwstat in @pd - * - * prfill callback for printing a blkg_rwstat. - */ -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat_sample rwstat = { }; - - blkg_rwstat_read((void *)pd + off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} -EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); - -/** - * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @blkg: blkg of interest - * @pol: blkcg_policy which contains the blkg_rwstat - * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg - * @sum: blkg_rwstat_sample structure containing the results - * - * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its - * online descendants and their aux counts. The caller must be holding the - * queue lock for online tests. - * - * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it - * is at @off bytes into @blkg's blkg_policy_data of the policy. - */ -void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, - int off, struct blkg_rwstat_sample *sum) -{ - struct blkcg_gq *pos_blkg; - struct cgroup_subsys_state *pos_css; - unsigned int i; - - lockdep_assert_held(&blkg->q->queue_lock); - - rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { - struct blkg_rwstat *rwstat; - - if (!pos_blkg->online) - continue; - - if (pol) - rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; - else - rwstat = (void *)pos_blkg + off; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); - /* Performs queue bypass and policy enabled checks then looks up blkg. */ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, const struct blkcg_policy *pol, diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2d0fc73d9781..98233c9c65a8 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -12,6 +12,7 @@ #include #include #include "blk.h" +#include "blk-cgroup-rwstat.h" /* Max dispatch from a group in 1 round */ static int throtl_grp_quantum = 8; diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 867ab391e409..48a66738143d 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -41,17 +41,6 @@ enum blkg_iostat_type { BLKG_IOSTAT_NR, }; -enum blkg_rwstat_type { - BLKG_RWSTAT_READ, - BLKG_RWSTAT_WRITE, - BLKG_RWSTAT_SYNC, - BLKG_RWSTAT_ASYNC, - BLKG_RWSTAT_DISCARD, - - BLKG_RWSTAT_NR, - BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, -}; - struct blkcg_gq; struct blkcg { @@ -82,19 +71,6 @@ struct blkg_iostat_set { struct blkg_iostat last; }; -/* - * blkg_[rw]stat->aux_cnt is excluded for local stats but included for - * recursive. Used to carry stats of dead children. - */ -struct blkg_rwstat { - struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; - atomic64_t aux_cnt[BLKG_RWSTAT_NR]; -}; - -struct blkg_rwstat_sample { - u64 cnt[BLKG_RWSTAT_NR]; -}; - /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track @@ -223,13 +199,6 @@ int blkcg_activate_policy(struct request_queue *q, void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol); -static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, - unsigned int idx) -{ - return atomic64_read(&rwstat->aux_cnt[idx]) + - percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); -} - const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, @@ -237,12 +206,6 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat_sample *rwstat); -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off); -void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, - int off, struct blkg_rwstat_sample *sum); struct blkg_conf_ctx { struct gendisk *disk; @@ -594,128 +557,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) -static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) -{ - int i, ret; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); - if (ret) { - while (--i >= 0) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); - return ret; - } - atomic64_set(&rwstat->aux_cnt[i], 0); - } - return 0; -} - -static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); -} - -/** - * blkg_rwstat_add - add a value to a blkg_rwstat - * @rwstat: target blkg_rwstat - * @op: REQ_OP and flags - * @val: value to add - * - * Add @val to @rwstat. The counters are chosen according to @rw. The - * caller is responsible for synchronizing calls to this function. - */ -static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - unsigned int op, uint64_t val) -{ - struct percpu_counter *cnt; - - if (op_is_discard(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; - else if (op_is_write(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; - else - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; - - percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); - - if (op_is_sync(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; - else - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; - - percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); -} - -/** - * blkg_rwstat_read - read the current values of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Read the current snapshot of @rwstat and return it in the aux counts. - */ -static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, - struct blkg_rwstat_sample *result) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - result->cnt[i] = - percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); -} - -/** - * blkg_rwstat_total - read the total count of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Return the total count of @rwstat regardless of the IO direction. This - * function can be called without synchronization and takes care of u64 - * atomicity. - */ -static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) -{ - struct blkg_rwstat_sample tmp = { }; - - blkg_rwstat_read(rwstat, &tmp); - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; -} - -/** - * blkg_rwstat_reset - reset a blkg_rwstat - * @rwstat: blkg_rwstat to reset - */ -static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - percpu_counter_set(&rwstat->cpu_cnt[i], 0); - atomic64_set(&rwstat->aux_cnt[i], 0); - } -} - -/** - * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count - * @to: the destination blkg_rwstat - * @from: the source - * - * Add @from's count including the aux one to @to's aux count. - */ -static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, - struct blkg_rwstat *from) -{ - u64 sum[BLKG_RWSTAT_NR]; - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), - &to->aux_cnt[i]); -} - #ifdef CONFIG_BLK_DEV_THROTTLING extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio); -- cgit v1.2.3-59-g8ed1b From 59db8ba2f6528a9c39668f9ed8c81eac1dff3b38 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 8 Nov 2019 18:15:27 +0800 Subject: block: still try to split bio if the bvec crosses pages Some device may set segment boundary as PAGE_SIZE - 1. If the bvec crosses pages, and meantime its length is <= PAGE_SIZE, we still need to split the bvec into 2 segments. Fixes this issue by still splitting bio if the single bvec crosses pages. Reported-by: kernel test robot Fixes: fa5322872187 (block: avoid blk_bio_segment_split for small I/O operations) Cc: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index f22cb6251d06..d783bdc4559b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -319,7 +319,8 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, */ if (!q->limits.chunk_sectors && (*bio)->bi_vcnt == 1 && - (*bio)->bi_io_vec[0].bv_len <= PAGE_SIZE) { + ((*bio)->bi_io_vec[0].bv_len + + (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) { *nr_segs = 1; break; } -- cgit v1.2.3-59-g8ed1b From 6952a7f8446ee85ea9d10ab87b64797a031eaae3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 8 Nov 2019 18:15:28 +0800 Subject: block: split bio if the only bvec's length is > SZ_4K 64K PAGE_SIZE is popular on ARM64 or other ARCHs, and 64K has been big enough to break some devices probably, so change the logic to split bio if the only bvec's length is > SZ_4K instead of PAGE_SIZE. Fixes: fa5322872187 (block: avoid blk_bio_segment_split for small I/O operations) Cc: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index d783bdc4559b..f35327f63ef4 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -320,7 +320,7 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, if (!q->limits.chunk_sectors && (*bio)->bi_vcnt == 1 && ((*bio)->bi_io_vec[0].bv_len + - (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) { + (*bio)->bi_io_vec[0].bv_offset) <= SZ_4K) { *nr_segs = 1; break; } -- cgit v1.2.3-59-g8ed1b From cb711b91a3c685192f2cabd3735ca3de04694ed8 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 14 Nov 2019 01:27:21 +0800 Subject: blk-mq: Delete blk_mq_has_free_tags() and blk_mq_can_queue() These functions are not referenced, so delete them. Signed-off-by: John Garry Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 8 -------- block/blk-mq-tag.h | 1 - block/blk-mq.c | 6 ------ include/linux/blk-mq.h | 1 - 4 files changed, 16 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 008388e82b5c..fbacde454718 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -15,14 +15,6 @@ #include "blk-mq.h" #include "blk-mq-tag.h" -bool blk_mq_has_free_tags(struct blk_mq_tags *tags) -{ - if (!tags) - return true; - - return sbitmap_any_bit_clear(&tags->bitmap_tags.sb); -} - /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 61deab0b5a5a..15bc74acb57e 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -28,7 +28,6 @@ extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); -extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); diff --git a/block/blk-mq.c b/block/blk-mq.c index 5c9adcaa27ac..323c9cb28066 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -260,12 +260,6 @@ void blk_mq_wake_waiters(struct request_queue *q) blk_mq_tag_wakeup_all(hctx->tags, true); } -bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) -{ - return blk_mq_has_free_tags(hctx->tags); -} -EXPORT_SYMBOL(blk_mq_can_queue); - /* * Only need start/end time stamping if we have iostat or * blk stats enabled, or using an IO scheduler. diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index dc03e059fdff..11cfd6470b1a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -424,7 +424,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_free_request(struct request *rq); -bool blk_mq_can_queue(struct blk_mq_hw_ctx *); bool blk_mq_queue_inflight(struct request_queue *q); -- cgit v1.2.3-59-g8ed1b From de678bc63cc659d056a5ff3a3b11866d3eb4c1a9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 18 Nov 2019 12:01:22 +0100 Subject: block: Don't disable interrupts in trigger_softirq() trigger_softirq() is always invoked as a SMP-function call which is always invoked with disables interrupts. Don't disable interrupt in trigger_softirq() because interrupts are already disabled. Reviewed-by: Ming Lei Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jens Axboe --- block/blk-softirq.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'block') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 457d9ba3eb20..6e7ec87d49fa 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -42,17 +42,13 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h) static void trigger_softirq(void *data) { struct request *rq = data; - unsigned long flags; struct list_head *list; - local_irq_save(flags); list = this_cpu_ptr(&blk_cpu_done); list_add_tail(&rq->ipi_list, list); if (list->next == &rq->ipi_list) raise_softirq_irqoff(BLOCK_SOFTIRQ); - - local_irq_restore(flags); } /* -- cgit v1.2.3-59-g8ed1b From c6da429ea988de8f9330405fc405ee32479b5bd5 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Fri, 8 Nov 2019 16:09:04 -0700 Subject: block: sed-opal: Introduce SUM_SET_LIST parameter and append it using 'add_token_u64' In function 'activate_lsp', rather than hard-coding the short atom header(0x83), we need to let the function 'add_short_atom_header' append the header based on the parameter being appended. The parameter has been defined in Section 3.1.2.1 of https://trustedcomputinggroup.org/wp-content/uploads/TCG_Storage-Opal_Feature_Set_Single_User_Mode_v1-00_r1-00-Final.pdf Reviewed-by: Jon Derrick Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/opal_proto.h | 4 ++++ block/sed-opal.c | 6 +----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/opal_proto.h b/block/opal_proto.h index 736e67c3e7c5..325cbba2465f 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -205,6 +205,10 @@ enum opal_lockingstate { OPAL_LOCKING_LOCKED = 0x03, }; +enum opal_parameter { + OPAL_SUM_SET_LIST = 0x060000, +}; + /* Packets derived from: * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Secion: 3.2.3 ComPackets, Packets & Subpackets diff --git a/block/sed-opal.c b/block/sed-opal.c index b2cacc9ddd11..880cc57a5f6b 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1886,7 +1886,6 @@ static int activate_lsp(struct opal_dev *dev, void *data) { struct opal_lr_act *opal_act = data; u8 user_lr[OPAL_UID_LENGTH]; - u8 uint_3 = 0x83; int err, i; err = cmd_start(dev, opaluid[OPAL_LOCKINGSP_UID], @@ -1899,10 +1898,7 @@ static int activate_lsp(struct opal_dev *dev, void *data) return err; add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, uint_3); - add_token_u8(&err, dev, 6); - add_token_u8(&err, dev, 0); - add_token_u8(&err, dev, 0); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); add_token_u8(&err, dev, OPAL_STARTLIST); add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); -- cgit v1.2.3-59-g8ed1b From 40d47c155e8ae9bcb3f2d0d01cf14d903c664726 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 1 Nov 2019 13:11:10 +0000 Subject: block,bfq: Skip tracing hooks if possible In most cases blk_tracing is not active, but bfq_log_bfqq macro generate pid_str unconditionally, which result in significant overhead. ## Test modprobe null_blk echo bfq > /sys/block/nullb0/queue/scheduler fio --name=t --ioengine=libaio --direct=1 --filename=/dev/nullb0 \ --runtime=30 --time_based=1 --rw=write --iodepth=128 --bs=4k # Results | | baseline | w/ patch | gain | | iops | 113.19K | 126.42K | +11% | Acked-by: Paolo Valente Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block') diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 9c82c1f35716..8526f20c53bc 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1068,6 +1068,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ char pid_str[MAX_PID_STR_LENGTH]; \ + if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ + break; \ bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ @@ -1084,6 +1086,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ char pid_str[MAX_PID_STR_LENGTH]; \ + if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ + break; \ bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- cgit v1.2.3-59-g8ed1b From b6866318657717c8914673a6394894d12bc9ff5e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 21 Nov 2019 13:40:26 +0300 Subject: block: add iostat counters for flush requests Requests that triggers flushing volatile writeback cache to disk (barriers) have significant effect to overall performance. Block layer has sophisticated engine for combining several flush requests into one. But there is no statistics for actual flushes executed by disk. Requests which trigger flushes usually are barriers - zero-size writes. This patch adds two iostat counters into /sys/class/block/$dev/stat and /proc/diskstats - count of completed flush requests and their total time. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- Documentation/ABI/testing/procfs-diskstats | 5 +++++ Documentation/ABI/testing/sysfs-block | 6 ++++++ Documentation/admin-guide/iostats.rst | 9 +++++++++ Documentation/block/stat.rst | 14 ++++++++++++-- block/blk-flush.c | 15 ++++++++++++++- block/genhd.c | 8 ++++++-- block/partition-generic.c | 7 +++++-- include/linux/blk_types.h | 1 + 8 files changed, 58 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats index 2c44b4f1b060..70dcaf2481f4 100644 --- a/Documentation/ABI/testing/procfs-diskstats +++ b/Documentation/ABI/testing/procfs-diskstats @@ -29,4 +29,9 @@ Description: 17 - sectors discarded 18 - time spent discarding + Kernel 5.5+ appends two more fields for flush requests: + + 19 - flush requests completed successfully + 20 - time spent flushing + For more details refer to Documentation/admin-guide/iostats.rst diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index f8c7c7126bb1..ed8c14f161ee 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -15,6 +15,12 @@ Description: 9 - I/Os currently in progress 10 - time spent doing I/Os (ms) 11 - weighted time spent doing I/Os (ms) + 12 - discards completed + 13 - discards merged + 14 - sectors discarded + 15 - time spent discarding (ms) + 16 - flush requests completed + 17 - time spent flushing (ms) For more details refer Documentation/admin-guide/iostats.rst diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst index 5d63b18bd6d1..4f0462af3ca7 100644 --- a/Documentation/admin-guide/iostats.rst +++ b/Documentation/admin-guide/iostats.rst @@ -121,6 +121,15 @@ Field 15 -- # of milliseconds spent discarding This is the total number of milliseconds spent by all discards (as measured from __make_request() to end_that_request_last()). +Field 16 -- # of flush requests completed + This is the total number of flush requests completed successfully. + + Block layer combines flush requests and executes at most one at a time. + This counts flush requests executed by disk. Not tracked for partitions. + +Field 17 -- # of milliseconds spent flushing + This is the total number of milliseconds spent by all flush requests. + To avoid introducing performance bottlenecks, no locks are held while modifying these counters. This implies that minor inaccuracies may be introduced when changes collide, so (for instance) adding up all the diff --git a/Documentation/block/stat.rst b/Documentation/block/stat.rst index 9c07bc22b0bc..77311335c08b 100644 --- a/Documentation/block/stat.rst +++ b/Documentation/block/stat.rst @@ -41,6 +41,8 @@ discard I/Os requests number of discard I/Os processed discard merges requests number of discard I/Os merged with in-queue I/O discard sectors sectors number of sectors discarded discard ticks milliseconds total wait time for discard requests +flush I/Os requests number of flush I/Os processed +flush ticks milliseconds total wait time for flush requests =============== ============= ================================================= read I/Os, write I/Os, discard I/0s @@ -48,6 +50,14 @@ read I/Os, write I/Os, discard I/0s These values increment when an I/O request completes. +flush I/Os +========== + +These values increment when an flush I/O request completes. + +Block layer combines flush requests and executes at most one at a time. +This counts flush requests executed by disk. Not tracked for partitions. + read merges, write merges, discard merges ========================================= @@ -62,8 +72,8 @@ discarded from this block device. The "sectors" in question are the standard UNIX 512-byte sectors, not any device- or filesystem-specific block size. The counters are incremented when the I/O completes. -read ticks, write ticks, discard ticks -====================================== +read ticks, write ticks, discard ticks, flush ticks +=================================================== These values count the number of milliseconds that I/O requests have waited on this block device. If there are multiple I/O requests waiting, diff --git a/block/blk-flush.c b/block/blk-flush.c index 1eec9cbe5a0a..1777346baf06 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -136,6 +136,17 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) blk_mq_add_to_requeue_list(rq, add_front, true); } +static void blk_account_io_flush(struct request *rq) +{ + struct hd_struct *part = &rq->rq_disk->part0; + + part_stat_lock(); + part_stat_inc(part, ios[STAT_FLUSH]); + part_stat_add(part, nsecs[STAT_FLUSH], + ktime_get_ns() - rq->start_time_ns); + part_stat_unlock(); +} + /** * blk_flush_complete_seq - complete flush sequence * @rq: PREFLUSH/FUA request being sequenced @@ -185,7 +196,7 @@ static void blk_flush_complete_seq(struct request *rq, case REQ_FSEQ_DONE: /* - * @rq was previously adjusted by blk_flush_issue() for + * @rq was previously adjusted by blk_insert_flush() for * flush sequencing and may already have gone through the * flush data request completion path. Restore @rq for * normal completion and end it. @@ -212,6 +223,8 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); struct blk_mq_hw_ctx *hctx; + blk_account_io_flush(flush_rq); + /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); diff --git a/block/genhd.c b/block/genhd.c index 26b31fcae217..ff6268970ddc 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1385,7 +1385,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%lu %lu %lu %u " "%lu %lu %lu %u " "%u %u %u " - "%lu %lu %lu %u\n", + "%lu %lu %lu %u " + "%lu %u" + "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), part_stat_read(hd, ios[STAT_READ]), @@ -1402,7 +1404,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, ios[STAT_DISCARD]), part_stat_read(hd, merges[STAT_DISCARD]), part_stat_read(hd, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD) + (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD), + part_stat_read(hd, ios[STAT_FLUSH]), + (unsigned int)part_stat_read_msecs(hd, STAT_FLUSH) ); } disk_part_iter_exit(&piter); diff --git a/block/partition-generic.c b/block/partition-generic.c index aee643ce13d1..3db8b73a96b1 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -127,7 +127,8 @@ ssize_t part_stat_show(struct device *dev, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " "%8u %8u %8u " - "%8lu %8lu %8llu %8u" + "%8lu %8lu %8llu %8u " + "%8lu %8u" "\n", part_stat_read(p, ios[STAT_READ]), part_stat_read(p, merges[STAT_READ]), @@ -143,7 +144,9 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, ios[STAT_DISCARD]), part_stat_read(p, merges[STAT_DISCARD]), (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(p, STAT_DISCARD)); + (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), + part_stat_read(p, ios[STAT_FLUSH]), + (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 23a2fd534817..70254ae11769 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -377,6 +377,7 @@ enum stat_group { STAT_READ, STAT_WRITE, STAT_DISCARD, + STAT_FLUSH, NR_STAT_GROUPS }; -- cgit v1.2.3-59-g8ed1b From 1e279153dfd53e76006720df804d5935a6cbc6d5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Nov 2019 10:16:12 -0700 Subject: Revert "block: split bio if the only bvec's length is > SZ_4K" We really don't need this, as the slow path will do the right thing anyway. This reverts commit 6952a7f8446ee85ea9d10ab87b64797a031eaae3. Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index f35327f63ef4..d783bdc4559b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -320,7 +320,7 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, if (!q->limits.chunk_sectors && (*bio)->bi_vcnt == 1 && ((*bio)->bi_io_vec[0].bv_len + - (*bio)->bi_io_vec[0].bv_offset) <= SZ_4K) { + (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) { *nr_segs = 1; break; } -- cgit v1.2.3-59-g8ed1b