From 3280d66a6363af0df0441709bc0bc302bd9a2510 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 14 Aug 2017 16:40:11 -0400 Subject: blk-mq: Fix queue usage on failed request allocation blk_mq_get_request() does not release the callers queue usage counter when allocation fails. The caller still needs to account for its own queue usage when it is unable to allocate a request. Fixes: 1ad43c0078b7 ("blk-mq: don't leak preempt counter/q_usage_counter when allocating rq failed") Reported-by: Max Gurtovoy Reviewed-by: Ming Lei Reviewed-by: Sagi Grimberg Tested-by: Max Gurtovoy Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 535cbdf32aab..4603b115e234 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -360,12 +360,12 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, return ERR_PTR(ret); rq = blk_mq_get_request(q, NULL, op, &alloc_data); + blk_queue_exit(q); if (!rq) return ERR_PTR(-EWOULDBLOCK); blk_mq_put_ctx(alloc_data.ctx); - blk_queue_exit(q); rq->__data_len = 0; rq->__sector = (sector_t) -1; @@ -411,12 +411,11 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, alloc_data.ctx = __blk_mq_get_ctx(q, cpu); rq = blk_mq_get_request(q, NULL, op, &alloc_data); + blk_queue_exit(q); if (!rq) return ERR_PTR(-EWOULDBLOCK); - blk_queue_exit(q); - return rq; } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -- cgit v1.2.3-59-g8ed1b From c005390374957baacbc38eef96ea360559510aa7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Aug 2017 12:24:47 +0200 Subject: blk-mq-pci: add a fallback when pci_irq_get_affinity returns NULL While pci_irq_get_affinity should never fail for SMP kernel that implement the affinity mapping, it will always return NULL in the UP case, so provide a fallback mapping of all queues to CPU 0 in that case. Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-pci.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index 0c3354cf3552..76944e3271bf 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -36,12 +36,18 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev) for (queue = 0; queue < set->nr_hw_queues; queue++) { mask = pci_irq_get_affinity(pdev, queue); if (!mask) - return -EINVAL; + goto fallback; for_each_cpu(cpu, mask) set->mq_map[cpu] = queue; } return 0; + +fallback: + WARN_ON_ONCE(set->nr_hw_queues > 1); + for_each_possible_cpu(cpu) + set->mq_map[cpu] = 0; + return 0; } EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); -- cgit v1.2.3-59-g8ed1b From ea0ea2bc6dd8923d86a0fa98743dbeed98645486 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 18 Aug 2017 16:08:13 -0700 Subject: blk-throttle: cap discard request size discard request usually is very big and easily use all bandwidth budget of a cgroup. discard request size doesn't really mean the size of data written, so it doesn't make sense to account it into bandwidth budget. Jens pointed out treating the size 0 doesn't make sense too, because discard request does have cost. But it's not easy to find the actual cost. This patch simply makes the size one sector. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-throttle.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a7285bf2831c..80f5481fe9f6 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -382,6 +382,14 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) } \ } while (0) +static inline unsigned int throtl_bio_data_size(struct bio *bio) +{ + /* assume it's one sector */ + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) + return 512; + return bio->bi_iter.bi_size; +} + static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); @@ -934,6 +942,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, bool rw = bio_data_dir(bio); u64 bytes_allowed, extra_bytes, tmp; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + unsigned int bio_size = throtl_bio_data_size(bio); jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -947,14 +956,14 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, do_div(tmp, HZ); bytes_allowed = tmp; - if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { + if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { if (wait) *wait = 0; return true; } /* Calc approx time to dispatch */ - extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed; + extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw)); if (!jiffy_wait) @@ -1034,11 +1043,12 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); + unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ - tg->bytes_disp[rw] += bio->bi_iter.bi_size; + tg->bytes_disp[rw] += bio_size; tg->io_disp[rw]++; - tg->last_bytes_disp[rw] += bio->bi_iter.bi_size; + tg->last_bytes_disp[rw] += bio_size; tg->last_io_disp[rw]++; /* -- cgit v1.2.3-59-g8ed1b From 50b4d485528d1dbe0bd249f2073140e3444f4a7b Mon Sep 17 00:00:00 2001 From: Benjamin Block Date: Thu, 24 Aug 2017 01:57:56 +0200 Subject: bsg-lib: fix kernel panic resulting from missing allocation of reply-buffer Since we split the scsi_request out of struct request bsg fails to provide a reply-buffer for the drivers. This was done via the pointer for sense-data, that is not preallocated anymore. Failing to allocate/assign it results in illegal dereferences because LLDs use this pointer unquestioned. An example panic on s390x, using the zFCP driver, looks like this (I had debugging on, otherwise NULL-pointer dereferences wouldn't even panic on s390x): Unable to handle kernel pointer dereference in virtual kernel address space Failing address: 6b6b6b6b6b6b6000 TEID: 6b6b6b6b6b6b6403 Fault in home space mode while using kernel ASCE. AS:0000000001590007 R3:0000000000000024 Oops: 0038 ilc:2 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.12.0-bsg-regression+ #3 Hardware name: IBM 2964 N96 702 (z/VM 6.4.0) task: 0000000065cb0100 task.stack: 0000000065cb4000 Krnl PSW : 0704e00180000000 000003ff801e4156 (zfcp_fc_ct_els_job_handler+0x16/0x58 [zfcp]) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3 Krnl GPRS: 0000000000000001 000000005fa9d0d0 000000005fa9d078 0000000000e16866 000003ff00000290 6b6b6b6b6b6b6b6b 0000000059f78f00 000000000000000f 00000000593a0958 00000000593a0958 0000000060d88800 000000005ddd4c38 0000000058b50100 07000000659cba08 000003ff801e8556 00000000659cb9a8 Krnl Code: 000003ff801e4146: e31020500004 lg %r1,80(%r2) 000003ff801e414c: 58402040 l %r4,64(%r2) #000003ff801e4150: e35020200004 lg %r5,32(%r2) >000003ff801e4156: 50405004 st %r4,4(%r5) 000003ff801e415a: e54c50080000 mvhi 8(%r5),0 000003ff801e4160: e33010280012 lt %r3,40(%r1) 000003ff801e4166: a718fffb lhi %r1,-5 000003ff801e416a: 1803 lr %r0,%r3 Call Trace: ([<000003ff801e8556>] zfcp_fsf_req_complete+0x726/0x768 [zfcp]) [<000003ff801ea82a>] zfcp_fsf_reqid_check+0x102/0x180 [zfcp] [<000003ff801eb980>] zfcp_qdio_int_resp+0x230/0x278 [zfcp] [<00000000009b91b6>] qdio_kick_handler+0x2ae/0x2c8 [<00000000009b9e3e>] __tiqdio_inbound_processing+0x406/0xc10 [<00000000001684c2>] tasklet_action+0x15a/0x1d8 [<0000000000bd28ec>] __do_softirq+0x3ec/0x848 [<00000000001675a4>] irq_exit+0x74/0xf8 [<000000000010dd6a>] do_IRQ+0xba/0xf0 [<0000000000bd19e8>] io_int_handler+0x104/0x2d4 [<00000000001033b6>] enabled_wait+0xb6/0x188 ([<000000000010339e>] enabled_wait+0x9e/0x188) [<000000000010396a>] arch_cpu_idle+0x32/0x50 [<0000000000bd0112>] default_idle_call+0x52/0x68 [<00000000001cd0fa>] do_idle+0x102/0x188 [<00000000001cd41e>] cpu_startup_entry+0x3e/0x48 [<0000000000118c64>] smp_start_secondary+0x11c/0x130 [<0000000000bd2016>] restart_int_handler+0x62/0x78 [<0000000000000000>] (null) INFO: lockdep is turned off. Last Breaking-Event-Address: [<000003ff801e41d6>] zfcp_fc_ct_job_handler+0x3e/0x48 [zfcp] Kernel panic - not syncing: Fatal exception in interrupt This patch moves bsg-lib to allocate and setup struct bsg_job ahead of time, including the allocation of a buffer for the reply-data. This means, struct bsg_job is not allocated separately anymore, but as part of struct request allocation - similar to struct scsi_cmd. Reflect this in the function names that used to handle creation/destruction of struct bsg_job. Reported-by: Steffen Maier Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Benjamin Block Fixes: 82ed4db499b8 ("block: split scsi_request out of struct request") Cc: #4.11+ Signed-off-by: Jens Axboe --- block/bsg-lib.c | 74 +++++++++++++++++++++++++++++-------------------- include/linux/blkdev.h | 1 - include/linux/bsg-lib.h | 2 ++ 3 files changed, 46 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/bsg-lib.c b/block/bsg-lib.c index c4513b23f57a..dd56d7460cb9 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -29,26 +29,25 @@ #include /** - * bsg_destroy_job - routine to teardown/delete a bsg job + * bsg_teardown_job - routine to teardown a bsg job * @job: bsg_job that is to be torn down */ -static void bsg_destroy_job(struct kref *kref) +static void bsg_teardown_job(struct kref *kref) { struct bsg_job *job = container_of(kref, struct bsg_job, kref); struct request *rq = job->req; - blk_end_request_all(rq, BLK_STS_OK); - put_device(job->dev); /* release reference for the request */ kfree(job->request_payload.sg_list); kfree(job->reply_payload.sg_list); - kfree(job); + + blk_end_request_all(rq, BLK_STS_OK); } void bsg_job_put(struct bsg_job *job) { - kref_put(&job->kref, bsg_destroy_job); + kref_put(&job->kref, bsg_teardown_job); } EXPORT_SYMBOL_GPL(bsg_job_put); @@ -100,7 +99,7 @@ EXPORT_SYMBOL_GPL(bsg_job_done); */ static void bsg_softirq_done(struct request *rq) { - struct bsg_job *job = rq->special; + struct bsg_job *job = blk_mq_rq_to_pdu(rq); bsg_job_put(job); } @@ -122,33 +121,20 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) } /** - * bsg_create_job - create the bsg_job structure for the bsg request + * bsg_prepare_job - create the bsg_job structure for the bsg request * @dev: device that is being sent the bsg request * @req: BSG request that needs a job structure */ -static int bsg_create_job(struct device *dev, struct request *req) +static int bsg_prepare_job(struct device *dev, struct request *req) { struct request *rsp = req->next_rq; - struct request_queue *q = req->q; struct scsi_request *rq = scsi_req(req); - struct bsg_job *job; + struct bsg_job *job = blk_mq_rq_to_pdu(req); int ret; - BUG_ON(req->special); - - job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL); - if (!job) - return -ENOMEM; - - req->special = job; - job->req = req; - if (q->bsg_job_size) - job->dd_data = (void *)&job[1]; job->request = rq->cmd; job->request_len = rq->cmd_len; - job->reply = rq->sense; - job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer - * allocated */ + if (req->bio) { ret = bsg_map_buffer(&job->request_payload, req); if (ret) @@ -187,7 +173,6 @@ static void bsg_request_fn(struct request_queue *q) { struct device *dev = q->queuedata; struct request *req; - struct bsg_job *job; int ret; if (!get_device(dev)) @@ -199,7 +184,7 @@ static void bsg_request_fn(struct request_queue *q) break; spin_unlock_irq(q->queue_lock); - ret = bsg_create_job(dev, req); + ret = bsg_prepare_job(dev, req); if (ret) { scsi_req(req)->result = ret; blk_end_request_all(req, BLK_STS_OK); @@ -207,8 +192,7 @@ static void bsg_request_fn(struct request_queue *q) continue; } - job = req->special; - ret = q->bsg_job_fn(job); + ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req)); spin_lock_irq(q->queue_lock); if (ret) break; @@ -219,6 +203,35 @@ static void bsg_request_fn(struct request_queue *q) spin_lock_irq(q->queue_lock); } +static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(req); + struct scsi_request *sreq = &job->sreq; + + memset(job, 0, sizeof(*job)); + + scsi_req_init(sreq); + sreq->sense_len = SCSI_SENSE_BUFFERSIZE; + sreq->sense = kzalloc(sreq->sense_len, gfp); + if (!sreq->sense) + return -ENOMEM; + + job->req = req; + job->reply = sreq->sense; + job->reply_len = sreq->sense_len; + job->dd_data = job + 1; + + return 0; +} + +static void bsg_exit_rq(struct request_queue *q, struct request *req) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(req); + struct scsi_request *sreq = &job->sreq; + + kfree(sreq->sense); +} + /** * bsg_setup_queue - Create and add the bsg hooks so we can receive requests * @dev: device to attach bsg device to @@ -235,7 +248,9 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name, q = blk_alloc_queue(GFP_KERNEL); if (!q) return ERR_PTR(-ENOMEM); - q->cmd_size = sizeof(struct scsi_request); + q->cmd_size = sizeof(struct bsg_job) + dd_job_size; + q->init_rq_fn = bsg_init_rq; + q->exit_rq_fn = bsg_exit_rq; q->request_fn = bsg_request_fn; ret = blk_init_allocated_queue(q); @@ -243,7 +258,6 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name, goto out_cleanup_queue; q->queuedata = dev; - q->bsg_job_size = dd_job_size; q->bsg_job_fn = job_fn; queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25f6a0cb27d3..2a5d52fa90f5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -568,7 +568,6 @@ struct request_queue { #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; - int bsg_job_size; struct bsg_class_device bsg_dev; #endif diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h index e34dde2da0ef..637a20cfb237 100644 --- a/include/linux/bsg-lib.h +++ b/include/linux/bsg-lib.h @@ -24,6 +24,7 @@ #define _BLK_BSG_ #include +#include struct request; struct device; @@ -37,6 +38,7 @@ struct bsg_buffer { }; struct bsg_job { + struct scsi_request sreq; struct device *dev; struct request *req; -- cgit v1.2.3-59-g8ed1b From 22d538213ec4fa65b08b1edbf610066d8aab7bbb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 18 Aug 2017 15:52:54 -0700 Subject: blk-mq-debugfs: Add names for recently added flags The symbolic constants QUEUE_FLAG_SCSI_PASSTHROUGH, QUEUE_FLAG_QUIESCED and REQ_NOWAIT are missing from blk-mq-debugfs.c. Add these to blk-mq-debugfs.c such that these appear as names in debugfs instead of as numbers. Reviewed-by: Omar Sandoval Signed-off-by: Bart Van Assche Cc: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 9ebc2945f991..4f927a58dff8 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -75,6 +75,8 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(POLL_STATS), QUEUE_FLAG_NAME(REGISTERED), + QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), + QUEUE_FLAG_NAME(QUIESCED), }; #undef QUEUE_FLAG_NAME @@ -265,6 +267,7 @@ static const char *const cmd_flag_name[] = { CMD_FLAG_NAME(RAHEAD), CMD_FLAG_NAME(BACKGROUND), CMD_FLAG_NAME(NOUNMAP), + CMD_FLAG_NAME(NOWAIT), }; #undef CMD_FLAG_NAME -- cgit v1.2.3-59-g8ed1b