From 22cb4e681523b35f1a0eba40c25ddc1b16898801 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Jan 2019 21:23:30 +0100 Subject: mtip32xx: stop abusing the managed resource APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mtip32xx driver uses managed resources for DMA coherent memory and irqs, but then always pairs them with free calls anyway, making the resource tracking rather pointless. Given that some DMA allocations are transient anyway, and that the irq freeing seems to require ordering against other hardware accesses, the best solution seems to be to stop using the managed resource APIs entirely. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 88e8440e75c3..9a6f40cd8df6 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1415,7 +1415,7 @@ static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba, WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE); /* Allocate a DMA buffer for the trim structure */ - buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr, + buf = dma_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr, GFP_KERNEL); if (!buf) return BLK_STS_RESOURCE; @@ -1452,7 +1452,7 @@ static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba, MTIP_TRIM_TIMEOUT_MS) < 0) ret = BLK_STS_IOERR; - dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr); + dma_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr); return ret; } @@ -1655,7 +1655,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, if (!user_buffer) return -EFAULT; - buf = dmam_alloc_coherent(&port->dd->pdev->dev, + buf = dma_alloc_coherent(&port->dd->pdev->dev, ATA_SECT_SIZE * xfer_sz, &dma_addr, GFP_KERNEL); @@ -1733,7 +1733,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, } exit_drive_command: if (buf) - dmam_free_coherent(&port->dd->pdev->dev, + dma_free_coherent(&port->dd->pdev->dev, ATA_SECT_SIZE * xfer_sz, buf, dma_addr); return rv; } @@ -2837,11 +2837,11 @@ static void mtip_dma_free(struct driver_data *dd) struct mtip_port *port = dd->port; if (port->block1) - dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, + dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, port->block1, port->block1_dma); if (port->command_list) { - dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, + dma_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, port->command_list, port->command_list_dma); } } @@ -2860,7 +2860,7 @@ static int mtip_dma_alloc(struct driver_data *dd) /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ port->block1 = - dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, + dma_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, &port->block1_dma, GFP_KERNEL); if (!port->block1) return -ENOMEM; @@ -2868,10 +2868,10 @@ static int mtip_dma_alloc(struct driver_data *dd) /* Allocate dma memory for command list */ port->command_list = - dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, + dma_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, &port->command_list_dma, GFP_KERNEL); if (!port->command_list) { - dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, + dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, port->block1, port->block1_dma); port->block1 = NULL; port->block1_dma = 0; @@
-3056,13 +3056,8 @@ static int mtip_hw_init(struct driver_data *dd) mtip_start_port(dd->port); /* Setup the ISR and enable interrupts. */ - rv = devm_request_irq(&dd->pdev->dev, - dd->pdev->irq, - mtip_irq_handler, - IRQF_SHARED, - dev_driver_string(&dd->pdev->dev), - dd); - + rv = request_irq(dd->pdev->irq, mtip_irq_handler, IRQF_SHARED, + dev_driver_string(&dd->pdev->dev), dd); if (rv) { dev_err(&dd->pdev->dev, "Unable to allocate IRQ %d\n", dd->pdev->irq); @@ -3090,7 +3085,7 @@ out3: /* Release the IRQ. */ irq_set_affinity_hint(dd->pdev->irq, NULL); - devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); + free_irq(dd->pdev->irq, dd); out2: mtip_deinit_port(dd->port); @@ -3145,7 +3140,7 @@ static int mtip_hw_exit(struct driver_data *dd) /* Release the IRQ. */ irq_set_affinity_hint(dd->pdev->irq, NULL); - devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); + free_irq(dd->pdev->irq, dd); msleep(1000); /* Free dma regions */ @@ -3609,8 +3604,8 @@ static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq, if (!cmd->command) return; - dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, - cmd->command, cmd->command_dma); + dma_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, cmd->command, + cmd->command_dma); } static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq, @@ -3619,7 +3614,7 @@ static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq, struct driver_data *dd = set->driver_data; struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, + cmd->command = dma_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, &cmd->command_dma, GFP_KERNEL); if (!cmd->command) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From 20cd32450bcbec37c6d881b84bdddd8ba047ab01 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:25 +0100 Subject: block, bfq: do not consider interactive queues in srt filtering The speed at which a bfq_queue receives I/O is one of the parameters by which bfq decides whether the queue is soft real-time (i.e., whether the queue contains the I/O of a soft real-time application). In particular, when a bfq_queue remains without outstanding I/O requests, bfq computes the minimum time instant, named soft_rt_next_start, at which the next request of the queue may arrive for the queue to be deemed as soft real time. Unfortunately this filtering may cause problems with a queue in interactive weight raising. In fact, such a queue may be conveying the I/O needed to load a soft real-time application. The latter will actually exhibit a soft real-time I/O pattern after it finally starts doing its job. But, if soft_rt_next_start is updated for an interactive bfq_queue, and the queue has received a lot of service before remaining with no outstanding request (likely to happen on a fast device), then soft_rt_next_start is assigned such a high value that, for a very long time, the queue is prevented from being possibly considered as soft real time. This commit removes the updating of soft_rt_next_start for bfq_queues in interactive weight raising. 
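To see why a fast device makes the lock-out so long, consider the following back-of-the-envelope, self-contained C sketch; the function name, the linear formula, and the numbers are illustrative assumptions, not bfq's actual computation in bfq_bfqq_softrt_next_start():

#include <stdint.h>
#include <stdio.h>

/* Toy model: the more service a queue received before going idle, the
 * later the instant from which its I/O may again be classified as soft
 * real-time. */
static uint64_t next_start_sketch(uint64_t now_ns, uint64_t serviced_sectors,
                                  uint64_t softrt_rate_sectors_per_ms)
{
        return now_ns +
               (serviced_sectors / softrt_rate_sectors_per_ms) * 1000000ull;
}

int main(void)
{
        /* 2 GiB of service (4194304 sectors) against a 1000 sectors/ms
         * soft-rt ceiling: the queue cannot qualify as soft real-time
         * for roughly 4.2 seconds, exactly the lock-out this patch
         * avoids for interactive queues. */
        printf("%llu ns\n",
               (unsigned long long)next_start_sketch(0, 4194304, 1000));
        return 0;
}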
Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cd307767a134..c7a4a15c7c19 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3274,16 +3274,32 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, * requests, then the request pattern is isochronous * (see the comments on the function * bfq_bfqq_softrt_next_start()). Thus we can compute - * soft_rt_next_start. If, instead, the queue still - * has outstanding requests, then we have to wait for - * the completion of all the outstanding requests to - * discover whether the request pattern is actually - * isochronous. + * soft_rt_next_start. And we do it, unless bfqq is in + * interactive weight raising. We do not do it in the + * latter subcase, for the following reason. bfqq may + * be conveying the I/O needed to load a soft + * real-time application. Such an application will + * actually exhibit a soft real-time I/O pattern after + * it finally starts doing its job. But, if + * soft_rt_next_start is computed here for an + * interactive bfqq, and bfqq had received a lot of + * service before remaining with no outstanding + * request (likely to happen on a fast device), then + * soft_rt_next_start would be assigned such a high + * value that, for a very long time, bfqq would be + * prevented from being possibly considered as soft + * real time. + * + * If, instead, the queue still has outstanding + * requests, then we have to wait for the completion + * of all the outstanding requests to discover whether + * the request pattern is actually isochronous. */ - if (bfqq->dispatched == 0) + if (bfqq->dispatched == 0 && + bfqq->wr_coeff != bfqd->bfq_wr_coeff) bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); - else { + else if (bfqq->dispatched > 0) { /* * Schedule an update of soft_rt_next_start to when * the task may be discovered to be isochronous. @@ -4834,11 +4850,14 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) * isochronous, and both requisites for this condition to hold * are now satisfied, then compute soft_rt_next_start (see the * comments on the function bfq_bfqq_softrt_next_start()). We - * schedule this delayed check when bfqq expires, if it still - * has in-flight requests. + * do not compute soft_rt_next_start if bfqq is in interactive + * weight raising (see the comments in bfq_bfqq_expire() for + * an explanation). We schedule this delayed update when bfqq + * expires, if it still has in-flight requests. */ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && - RB_EMPTY_ROOT(&bfqq->sort_list)) + RB_EMPTY_ROOT(&bfqq->sort_list) && + bfqq->wr_coeff != bfqd->bfq_wr_coeff) bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); -- cgit v1.2.3-59-g8ed1b From 218cb897be6940b8d18eec9bcb32eaa28f1ae8ee Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:26 +0100 Subject: block, bfq: avoid selecting a queue w/o budget To boost throughput on devices with internal queueing and in scenarios where device idling is not strictly needed, bfq immediately starts serving a new bfq_queue if the in-service bfq_queue remains without pending I/O, even if new I/O may arrive soon for the latter queue. 
Then, if such I/O actually arrives soon, bfq preempts the new in-service bfq_queue so as to give the previous queue a chance to go on being served (in case the previous queue should actually be the one to be served, according to its timestamps). However, the in-service bfq_queue, say Q, may also be without further budget when it remains without pending I/O. Since bfq changes budgets dynamically to fit the needs of bfq_queues, this happens more often than one may expect. If this happens, then there is no point in trying to go on serving Q when new I/O arrives for it soon: Q would be expired immediately after being selected for service. This would only cause useless overhead. This commit avoids such a useless selection. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c7a4a15c7c19..9ea2c4f42501 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1380,7 +1380,15 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, { struct bfq_entity *entity = &bfqq->entity; - if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { + /* + * In the next compound condition, we check also whether there + * is some budget left, because otherwise there is no point in + * trying to go on serving bfqq with this same budget: bfqq + * would be expired immediately after being selected for + * service. This would only cause useless overhead. + */ + if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time && + bfq_bfqq_budget_left(bfqq) > 0) { /* * We do not clear the flag non_blocking_wait_rq here, as * the latter is used in bfq_activate_bfqq to signal -- cgit v1.2.3-59-g8ed1b From f3218ad8c6acc9b874060814d6d3ddf154463ac9 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:27 +0100 Subject: block, bfq: make sure queue budgets are not below service received With some unlucky sequences of events, the function bfq_updated_next_req updates the current budget of a bfq_queue to a lower value than the service received by the queue using such a budget. Unfortunately, if this happens, then the return value of the function bfq_bfqq_budget_left becomes inconsistent. This commit solves this problem by lower-bounding the budget computed in bfq_updated_next_req to the service currently charged to the queue.
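In other words, the patched computation reduces to a nested max. A minimal, self-contained C sketch of it follows (max_ul() stands in for the kernel's max_t, and entity->service is passed as a plain parameter):

#include <stdio.h>

static unsigned long max_ul(unsigned long a, unsigned long b)
{
        return a > b ? a : b;
}

/* The new budget is the larger of max_budget and the service to charge
 * for the next request, but never below the service already received
 * with the current budget. */
static unsigned long new_budget(unsigned long max_budget,
                                unsigned long serv_to_charge,
                                unsigned long service_received)
{
        return max_ul(max_ul(max_budget, serv_to_charge), service_received);
}

int main(void)
{
        /* Without the lower bound, budget (8192) < service (12288)
         * would make budget_left = budget - service underflow. */
        printf("%lu\n", new_budget(8192, 4096, 12288)); /* prints 12288 */
        return 0;
}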
Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9ea2c4f42501..b0e8006475be 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -907,8 +907,10 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, */ return; - new_budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); + new_budget = max_t(unsigned long, + max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)), + entity->service); if (entity->budget != new_budget) { entity->budget = new_budget; bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -- cgit v1.2.3-59-g8ed1b From 03e565e4204c6cf8687d995de5cafd0341503b4e Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:28 +0100 Subject: block, bfq: remove case of redirected bic from insert_request Before commit 18e5a57d7987 ("block, bfq: postpone rq preparation to insert or merge"), the destination queue for a request was chosen by a different hook than the one that then inserted the request. So, between the execution of the two hooks, the bic of the process generating the request could happen to be redirected to a different bfq_queue. As a consequence, the destination bfq_queue stored in the request could be wrong. Such an event does not need to be handled any longer. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b0e8006475be..a9275ed57726 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4633,8 +4633,6 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bool waiting, idle_timer_disabled = false; if (new_bfqq) { - if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) - new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); /* * Release the request's reference to the old bfqq * and make sure one is taken to the shared queue. -- cgit v1.2.3-59-g8ed1b From 73d58118498b14e4d2f2391105459b997b586ddc Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:29 +0100 Subject: block, bfq: consider also ioprio classes in symmetry detection In asymmetric scenarios, i.e., when some bfq_queue or bfq_group needs to be guaranteed a different bandwidth than other bfq_queues or bfq_groups, these service guarantees can be provided only by plugging I/O dispatch, completely or partially, when the queue in service remains temporarily empty. A case where asymmetry is particularly strong is when some active bfq_queues belong to a higher-priority class than some other active bfq_queues. Unfortunately, this important case is not considered at all in the code for detecting asymmetric scenarios. This commit adds the missing logic. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 86 ++++++++++++++++++++++++++++------------------------- block/bfq-iosched.h | 8 +++-- block/bfq-wf2q.c | 12 ++++++-- 3 files changed, 59 insertions(+), 47 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a9275ed57726..6bfbfa65610b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -623,26 +623,6 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->pos_root = NULL; } -/* - * Tell whether there are active queues with different weights or - * active groups.
- */ -static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd) -{ - /* - * For queue weights to differ, queue_weights_tree must contain - * at least two nodes. - */ - return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && - (bfqd->queue_weights_tree.rb_node->rb_left || - bfqd->queue_weights_tree.rb_node->rb_right) -#ifdef CONFIG_BFQ_GROUP_IOSCHED - ) || - (bfqd->num_groups_with_pending_reqs > 0 -#endif - ); -} - /* * The following function returns true if every queue must receive the * same share of the throughput (this condition is used when deciding @@ -651,25 +631,48 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd) * * Such a scenario occurs when: * 1) all active queues have the same weight, - * 2) all active groups at the same level in the groups tree have the same - * weight, + * 2) all active queues belong to the same I/O-priority class, * 3) all active groups at the same level in the groups tree have the same + * weight, + * 4) all active groups at the same level in the groups tree have the same * number of children. * * Unfortunately, keeping the necessary state for evaluating exactly * the last two symmetry sub-conditions above would be quite complex - * and time consuming. Therefore this function evaluates, instead, - * only the following stronger two sub-conditions, for which it is + * and time consuming. Therefore this function evaluates, instead, + * only the following stronger three sub-conditions, for which it is * much easier to maintain the needed state: * 1) all active queues have the same weight, - * 2) there are no active groups. + * 2) all active queues belong to the same I/O-priority class, + * 3) there are no active groups. * In particular, the last condition is always true if hierarchical * support or the cgroups interface are not enabled, thus no state * needs to be maintained in this case. */ static bool bfq_symmetric_scenario(struct bfq_data *bfqd) { - return !bfq_varied_queue_weights_or_active_groups(bfqd); + /* + * For queue weights to differ, queue_weights_tree must contain + * at least two nodes. + */ + bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && + (bfqd->queue_weights_tree.rb_node->rb_left || + bfqd->queue_weights_tree.rb_node->rb_right); + + bool multiple_classes_busy = + (bfqd->busy_queues[0] && bfqd->busy_queues[1]) || + (bfqd->busy_queues[0] && bfqd->busy_queues[2]) || + (bfqd->busy_queues[1] && bfqd->busy_queues[2]); + + /* + * For queue weights to differ, queue_weights_tree must contain + * at least two nodes. + */ + return !(varied_queue_weights || multiple_classes_busy +#ifdef BFQ_GROUP_IOSCHED_ENABLED + || bfqd->num_groups_with_pending_reqs > 0 +#endif + ); } /* @@ -728,15 +731,14 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* * In the unlucky event of an allocation failure, we just * exit. This will cause the weight of queue to not be - * considered in bfq_varied_queue_weights_or_active_groups, - * which, in its turn, causes the scenario to be deemed - * wrongly symmetric in case bfqq's weight would have been - * the only weight making the scenario asymmetric. On the - * bright side, no unbalance will however occur when bfqq - * becomes inactive again (the invocation of this function - * is triggered by an activation of queue). In fact, - * bfq_weights_tree_remove does nothing if - * !bfqq->weight_counter. 
+ * considered in bfq_symmetric_scenario, which, in its turn, + * causes the scenario to be deemed wrongly symmetric in case + * bfqq's weight would have been the only weight making the + * scenario asymmetric. On the bright side, no unbalance will + * however occur when bfqq becomes inactive again (the + * invocation of this function is triggered by an activation + * of queue). In fact, bfq_weights_tree_remove does nothing + * if !bfqq->weight_counter. */ if (unlikely(!bfqq->weight_counter)) return; @@ -2227,7 +2229,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, return NULL; /* If there is only one backlogged queue, don't search. */ - if (bfqd->busy_queues == 1) + if (bfq_tot_busy_queues(bfqd) == 1) return NULL; in_service_bfqq = bfqd->in_service_queue; @@ -3681,7 +3683,8 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * the requests already queued in the device have been served. */ asymmetric_scenario = (bfqq->wr_coeff > 1 && - bfqd->wr_busy_queues < bfqd->busy_queues) || + bfqd->wr_busy_queues < + bfq_tot_busy_queues(bfqd)) || !bfq_symmetric_scenario(bfqd); /* @@ -3960,7 +3963,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, * belongs to CLASS_IDLE and other queues are waiting for * service. */ - if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq))) + if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) goto return_rq; bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); @@ -3978,7 +3981,7 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) * most a call to dispatch for nothing */ return !list_empty_careful(&bfqd->dispatch) || - bfqd->busy_queues > 0; + bfq_tot_busy_queues(bfqd) > 0; } static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) @@ -4032,9 +4035,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) goto start_rq; } - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + bfq_log(bfqd, "dispatch requests: %d busy queues", + bfq_tot_busy_queues(bfqd)); - if (bfqd->busy_queues == 0) + if (bfq_tot_busy_queues(bfqd) == 0) goto exit; /* diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 0b02bf302de0..30be669be465 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -501,10 +501,11 @@ struct bfq_data { unsigned int num_groups_with_pending_reqs; /* - * Number of bfq_queues containing requests (including the - * queue in service, even if it is idling). + * Per-class (RT, BE, IDLE) number of bfq_queues containing + * requests (including the queue in service, even if it is + * idling). 
*/ - int busy_queues; + unsigned int busy_queues[3]; /* number of weight-raised busy @bfq_queues */ int wr_busy_queues; /* number of queued requests */ @@ -974,6 +975,7 @@ extern struct blkcg_policy blkcg_policy_bfq; struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq); struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); +unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd); struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity); struct bfq_entity *bfq_entity_of(struct rb_node *node); unsigned short bfq_ioprio_to_weight(int ioprio); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 72adbbe975d5..ce37d709a34f 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -44,6 +44,12 @@ static unsigned int bfq_class_idx(struct bfq_entity *entity) BFQ_DEFAULT_GRP_CLASS - 1; } +unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd) +{ + return bfqd->busy_queues[0] + bfqd->busy_queues[1] + + bfqd->busy_queues[2]; +} + static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, bool expiration); @@ -1513,7 +1519,7 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) struct bfq_sched_data *sd; struct bfq_queue *bfqq; - if (bfqd->busy_queues == 0) + if (bfq_tot_busy_queues(bfqd) == 0) return NULL; /* @@ -1665,7 +1671,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_clear_bfqq_busy(bfqq); - bfqd->busy_queues--; + bfqd->busy_queues[bfqq->ioprio_class - 1]--; if (!bfqq->dispatched) bfq_weights_tree_remove(bfqd, bfqq); @@ -1688,7 +1694,7 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_activate_bfqq(bfqd, bfqq); bfq_mark_bfqq_busy(bfqq); - bfqd->busy_queues++; + bfqd->busy_queues[bfqq->ioprio_class - 1]++; if (!bfqq->dispatched) if (bfqq->wr_coeff == 1) -- cgit v1.2.3-59-g8ed1b From 05c2f5c30b3ca2346a5bb7c74b0c9515d8f4fbd2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:30 +0100 Subject: block, bfq: split function bfq_better_to_idle This is a preparatory commit for commits that need to check only one of the two main reasons for idling. This change should also improve the quality of the code a little bit, by splitting a function that contains very long, non-trivial, and only loosely related comments. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 155 +++++++++++++++++++++++++++------------------------- 1 file changed, 82 insertions(+), 73 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6bfbfa65610b..2756f4b1432b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3404,53 +3404,13 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) bfq_bfqq_budget_timeout(bfqq); } -/* - * For a queue that becomes empty, device idling is allowed only if - * this function returns true for the queue. As a consequence, since - * device idling plays a critical role in both throughput boosting and - * service guarantees, the return value of this function plays a - * critical role in both these aspects as well. - * - * In a nutshell, this function returns true only if idling is - * beneficial for throughput or, even if detrimental for throughput, - * idling is however necessary to preserve service guarantees (low - * latency, desired throughput distribution, ...).
In particular, on - * NCQ-capable devices, this function tries to return false, so as to - * help keep the drives' internal queues full, whenever this helps the - * device boost the throughput without causing any service-guarantee - * issue. - * - * In more detail, the return value of this function is obtained by, - * first, computing a number of boolean variables that take into - * account throughput and service-guarantee issues, and, then, - * combining these variables in a logical expression. Most of the - * issues taken into account are not trivial. We discuss these issues - * individually while introducing the variables. - */ -static bool bfq_better_to_idle(struct bfq_queue *bfqq) +static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, + struct bfq_queue *bfqq) { - struct bfq_data *bfqd = bfqq->bfqd; bool rot_without_queueing = !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, bfqq_sequential_and_IO_bound, - idling_boosts_thr, idling_boosts_thr_without_issues, - idling_needed_for_service_guarantees, - asymmetric_scenario; - - if (bfqd->strict_guarantees) - return true; - - /* - * Idling is performed only if slice_idle > 0. In addition, we - * do not idle if - * (a) bfqq is async - * (b) bfqq is in the idle io prio class: in this case we do - * not idle because we want to minimize the bandwidth that - * queues in this class can steal to higher-priority queues - */ - if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || - bfq_class_idle(bfqq)) - return false; + idling_boosts_thr; bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); @@ -3482,8 +3442,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) bfqq_sequential_and_IO_bound); /* - * The value of the next variable, - * idling_boosts_thr_without_issues, is equal to that of + * The return value of this function is equal to that of * idling_boosts_thr, unless a special case holds. In this * special case, described below, idling may cause problems to * weight-raised queues. @@ -3500,32 +3459,35 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * which enqueue several requests in advance, and further * reorder internally-queued requests. * - * For this reason, we force to false the value of - * idling_boosts_thr_without_issues if there are weight-raised - * busy queues. In this case, and if bfqq is not weight-raised, - * this guarantees that the device is not idled for bfqq (if, - * instead, bfqq is weight-raised, then idling will be - * guaranteed by another variable, see below). Combined with - * the timestamping rules of BFQ (see [1] for details), this - * behavior causes bfqq, and hence any sync non-weight-raised - * queue, to get a lower number of requests served, and thus - * to ask for a lower number of requests from the request - * pool, before the busy weight-raised queues get served - * again. This often mitigates starvation problems in the - * presence of heavy write workloads and NCQ, thereby - * guaranteeing a higher application and system responsiveness - * in these hostile scenarios. + * For this reason, we force to false the return value if + * there are weight-raised busy queues. In this case, and if + * bfqq is not weight-raised, this guarantees that the device + * is not idled for bfqq (if, instead, bfqq is weight-raised, + * then idling will be guaranteed by another variable, see + * below). 
Combined with the timestamping rules of BFQ (see + * [1] for details), this behavior causes bfqq, and hence any + * sync non-weight-raised queue, to get a lower number of + * requests served, and thus to ask for a lower number of + * requests from the request pool, before the busy + * weight-raised queues get served again. This often mitigates + * starvation problems in the presence of heavy write + * workloads and NCQ, thereby guaranteeing a higher + * application and system responsiveness in these hostile + * scenarios. */ - idling_boosts_thr_without_issues = idling_boosts_thr && + return idling_boosts_thr && bfqd->wr_busy_queues == 0; +} +static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ /* - * There is then a case where idling must be performed not - * for throughput concerns, but to preserve service - * guarantees. + * There is a case where idling must be performed not for + * throughput concerns, but to preserve service guarantees. * * To introduce this case, we can note that allowing the drive - * to enqueue more than one request at a time, and hence + * to enqueue more than one request at a time, and thereby * delegating de facto final scheduling decisions to the * drive's internal scheduler, entails loss of control on the * actual request service order. In particular, the critical @@ -3682,9 +3644,9 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * to let requests be served in the desired order until all * the requests already queued in the device have been served. */ - asymmetric_scenario = (bfqq->wr_coeff > 1 && - bfqd->wr_busy_queues < - bfq_tot_busy_queues(bfqd)) || + bool asymmetric_scenario = (bfqq->wr_coeff > 1 && + bfqd->wr_busy_queues < + bfq_tot_busy_queues(bfqd)) || !bfq_symmetric_scenario(bfqd); /* @@ -3701,17 +3663,64 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * now establish when idling is actually needed to preserve * service guarantees. */ - idling_needed_for_service_guarantees = - asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); + return asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); +} + +/* + * For a queue that becomes empty, device idling is allowed only if + * this function returns true for that queue. As a consequence, since + * device idling plays a critical role for both throughput boosting + * and service guarantees, the return value of this function plays a + * critical role as well. + * + * In a nutshell, this function returns true only if idling is + * beneficial for throughput or, even if detrimental for throughput, + * idling is however necessary to preserve service guarantees (low + * latency, desired throughput distribution, ...). In particular, on + * NCQ-capable devices, this function tries to return false, so as to + * help keep the drives' internal queues full, whenever this helps the + * device boost the throughput without causing any service-guarantee + * issue. + * + * Most of the issues taken into account to get the return value of + * this function are not trivial. We discuss these issues in the two + * functions providing the main pieces of information needed by this + * function. + */ +static bool bfq_better_to_idle(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar; + + if (unlikely(bfqd->strict_guarantees)) + return true; + + /* + * Idling is performed only if slice_idle > 0. 
In addition, we + * do not idle if + * (a) bfqq is async + * (b) bfqq is in the idle io prio class: in this case we do + * not idle because we want to minimize the bandwidth that + * queues in this class can steal to higher-priority queues + */ + if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || + bfq_class_idle(bfqq)) + return false; + + idling_boosts_thr_with_no_issue = + idling_boosts_thr_without_issues(bfqd, bfqq); + + idling_needed_for_service_guar = + idling_needed_for_service_guarantees(bfqd, bfqq); /* - * We have now all the components we need to compute the + * We have now the two components we need to compute the * return value of the function, which is true only if idling * either boosts the throughput (without issues), or is * necessary to preserve service guarantees. */ - return idling_boosts_thr_without_issues || - idling_needed_for_service_guarantees; + return idling_boosts_thr_with_no_issue || + idling_needed_for_service_guar; } /* -- cgit v1.2.3-59-g8ed1b From ac8b0cb415f3aa9162009d39624501d37031533b Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:31 +0100 Subject: block, bfq: do not plug I/O of in-service queue when harmful If the in-service bfq_queue is sync and remains temporarily idle, then I/O dispatching (from other queues) may be plugged. It may be done for two reasons: either to boost throughput, or to preserve the bandwidth share of the in-service queue. In the first case, if the I/O of the in-service queue, when it finally arrives, consists only of one small I/O request, then it makes sense to plug even the I/O of the in-service queue. In fact, serving such a small request immediately is likely to lower throughput instead of boosting it, whereas waiting a little bit is likely to let that request grow, thanks to request merging, and become more profitable in terms of throughput (this is likely to happen exactly because the I/O of the queue has been detected to boost throughput). On the opposite end, if I/O dispatching is being plugged only to preserve the bandwidth of the in-service queue, then it would be better not to plug also the I/O of the in-service queue, because such a plugging is likely to cause only loss of bandwidth for the queue. Unfortunately, no distinction is made between the two cases, and the I/O of the in-service queue is always plugged in case just a small I/O request arrives. This commit draws this missing distinction and does not perform harmful plugging. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2756f4b1432b..a6fe60114ade 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4599,28 +4599,31 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); /* - * There is just this request queued: if the request - * is small and the queue is not to be expired, then - * just exit. + * There is just this request queued: if + * - the request is small, and + * - we are idling to boost throughput, and + * - the queue is not to be expired, + * then just exit. * * In this way, if the device is being idled to wait * for a new request from the in-service queue, we * avoid unplugging the device and committing the - * device to serve just a small request.
On the - * contrary, we wait for the block layer to decide - * when to unplug the device: hopefully, new requests - * will be merged to this one quickly, then the device - * will be unplugged and larger requests will be - * dispatched. + * device to serve just a small request. In contrast + * we wait for the block layer to decide when to + * unplug the device: hopefully, new requests will be + * merged to this one quickly, then the device will be + * unplugged and larger requests will be dispatched. */ - if (small_req && !budget_timeout) + if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) && + !budget_timeout) return; /* - * A large enough request arrived, or the queue is to - * be expired: in both cases disk idling is to be - * stopped, so clear wait_request flag and reset - * timer. + * A large enough request arrived, or idling is being + * performed to preserve service guarantees, or + * finally the queue is to be expired: in all these + * cases disk idling is to be stopped, so clear + * wait_request flag and reset timer. */ bfq_clear_bfqq_wait_request(bfqq); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -- cgit v1.2.3-59-g8ed1b From 530c4cbb3c62f9e42dbf39279fb346f2d2ab4dbb Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:32 +0100 Subject: block, bfq: unconditionally plug I/O in asymmetric scenarios bfq detects the creation of multiple bfq_queues shortly after each other, namely a burst of queue creations in the terminology used in the code. If the burst is large, then no queue in the burst is granted - either I/O-dispatch plugging when the queue remains temporarily idle while in service; - or weight raising, because it causes even longer plugging. In fact, such a plugging tends to lower throughput, while these bursts are typically due to applications or services that spawn multiple processes, to reach a common goal as soon as possible. Examples are a "git grep" or the booting of a system. Unfortunately, disabling plugging may cause a loss of service guarantees in asymmetric scenarios, i.e., if queue weights are differentiated or if more than one group is active. This commit addresses this issue by no longer disabling I/O-dispatch plugging for queues in large bursts. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 346 +++++++++++++++++++++++++--------------------------- 1 file changed, 165 insertions(+), 181 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a6fe60114ade..c1bb5e5fcdc4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3479,191 +3479,175 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, bfqd->wr_busy_queues == 0; } +/* + * There is a case where idling must be performed not for + * throughput concerns, but to preserve service guarantees. + * + * To introduce this case, we can note that allowing the drive + * to enqueue more than one request at a time, and hence + * delegating de facto final scheduling decisions to the + * drive's internal scheduler, entails loss of control on the + * actual request service order. In particular, the critical + * situation is when requests from different processes happen + * to be present, at the same time, in the internal queue(s) + * of the drive. In such a situation, the drive, by deciding + * the service order of the internally-queued requests, does + * determine also the actual throughput distribution among + * these processes. 
But the drive typically has no notion or + * concern about per-process throughput distribution, and + * makes its decisions only on a per-request basis. Therefore, + * the service distribution enforced by the drive's internal + * scheduler is likely to coincide with the desired + * device-throughput distribution only in a completely + * symmetric scenario where: + * (i) each of these processes must get the same throughput as + * the others; + * (ii) the I/O of each process has the same properties, in + * terms of locality (sequential or random), direction + * (reads or writes), request sizes, greediness + * (from I/O-bound to sporadic), and so on. + * In fact, in such a scenario, the drive tends to treat + * the requests of each of these processes in about the same + * way as the requests of the others, and thus to provide + * each of these processes with about the same throughput + * (which is exactly the desired throughput distribution). In + * contrast, in any asymmetric scenario, device idling is + * certainly needed to guarantee that bfqq receives its + * assigned fraction of the device throughput (see [1] for + * details). + * The problem is that idling may significantly reduce + * throughput with certain combinations of types of I/O and + * devices. An important example is sync random I/O, on flash + * storage with command queueing. So, unless bfqq falls in the + * above cases where idling also boosts throughput, it would + * be important to check conditions (i) and (ii) accurately, + * so as to avoid idling when not strictly needed for service + * guarantees. + * + * Unfortunately, it is extremely difficult to thoroughly + * check condition (ii). And, in case there are active groups, + * it becomes very difficult to check condition (i) too. In + * fact, if there are active groups, then, for condition (i) + * to become false, it is enough that an active group contains + * more active processes or sub-groups than some other active + * group. More precisely, for condition (i) to hold because of + * such a group, it is not even necessary that the group is + * (still) active: it is sufficient that, even if the group + * has become inactive, some of its descendant processes still + * have some request already dispatched but still waiting for + * completion. In fact, requests have still to be guaranteed + * their share of the throughput even after being + * dispatched. In this respect, it is easy to show that, if a + * group frequently becomes inactive while still having + * in-flight requests, and if, when this happens, the group is + * not considered in the calculation of whether the scenario + * is asymmetric, then the group may fail to be guaranteed its + * fair share of the throughput (basically because idling may + * not be performed for the descendant processes of the group, + * but it had to be). We address this issue with the + * following bi-modal behavior, implemented in the function + * bfq_symmetric_scenario(). + * + * If there are groups with requests waiting for completion + * (as commented above, some of these groups may even be + * already inactive), then the scenario is tagged as + * asymmetric, conservatively, without checking any of the + * conditions (i) and (ii). So the device is idled for bfqq. + * This behavior matches also the fact that groups are created + * exactly if controlling I/O is a primary concern (to + * preserve bandwidth and latency guarantees). 
+ * + * On the opposite end, if there are no groups with requests + * waiting for completion, then only condition (i) is actually + * controlled, i.e., provided that condition (i) holds, idling + * is not performed, regardless of whether condition (ii) + * holds. In other words, only if condition (i) does not hold, + * then idling is allowed, and the device tends to be + * prevented from queueing many requests, possibly of several + * processes. Since there are no groups with requests waiting + * for completion, then, to control condition (i) it is enough + * to check just whether all the queues with requests waiting + * for completion also have the same weight. + * + * Not checking condition (ii) evidently exposes bfqq to the + * risk of getting less throughput than its fair share. + * However, for queues with the same weight, a further + * mechanism, preemption, mitigates or even eliminates this + * problem. And it does so without consequences on overall + * throughput. This mechanism and its benefits are explained + * in the next three paragraphs. + * + * Even if a queue, say Q, is expired when it remains idle, Q + * can still preempt the new in-service queue if the next + * request of Q arrives soon (see the comments on + * bfq_bfqq_update_budg_for_activation). If all queues and + * groups have the same weight, this form of preemption, + * combined with the hole-recovery heuristic described in the + * comments on function bfq_bfqq_update_budg_for_activation, + * are enough to preserve a correct bandwidth distribution in + * the mid term, even without idling. In fact, even if not + * idling allows the internal queues of the device to contain + * many requests, and thus to reorder requests, we can rather + * safely assume that the internal scheduler still preserves a + * minimum of mid-term fairness. + * + * More precisely, this preemption-based, idleless approach + * provides fairness in terms of IOPS, and not sectors per + * second. This can be seen with a simple example. Suppose + * that there are two queues with the same weight, but that + * the first queue receives requests of 8 sectors, while the + * second queue receives requests of 1024 sectors. In + * addition, suppose that each of the two queues contains at + * most one request at a time, which implies that each queue + * always remains idle after it is served. Finally, after + * remaining idle, each queue receives very quickly a new + * request. It follows that the two queues are served + * alternatively, preempting each other if needed. This + * implies that, although both queues have the same weight, + * the queue with large requests receives a service that is + * 1024/8 times as high as the service received by the other + * queue. + * + * The motivation for using preemption instead of idling (for + * queues with the same weight) is that, by not idling, + * service guarantees are preserved (completely or at least in + * part) without minimally sacrificing throughput. And, if + * there is no active group, then the primary expectation for + * this device is probably a high throughput. + * + * We are now left only with explaining the additional + * compound condition that is checked below for deciding + * whether the scenario is asymmetric. To explain this + * compound condition, we need to add that the function + * bfq_symmetric_scenario checks the weights of only + * non-weight-raised queues, for efficiency reasons (see + * comments on bfq_weights_tree_add()). 
Then the fact that + * bfqq is weight-raised is checked explicitly here. More + * precisely, the compound condition below takes into account + * also the fact that, even if bfqq is being weight-raised, + * the scenario is still symmetric if all queues with requests + * waiting for completion happen to be + * weight-raised. Actually, we should be even more precise + * here, and differentiate between interactive weight raising + * and soft real-time weight raising. + * + * As a side note, it is worth considering that the above + * device-idling countermeasures may however fail in the + * following unlucky scenario: if idling is (correctly) + * disabled in a time period during which all symmetry + * sub-conditions hold, and hence the device is allowed to + * enqueue many requests, but at some later point in time some + * sub-condition stops to hold, then it may become impossible + * to let requests be served in the desired order until all + * the requests already queued in the device have been served. + */ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - /* - * There is a case where idling must be performed not for - * throughput concerns, but to preserve service guarantees. - * - * To introduce this case, we can note that allowing the drive - * to enqueue more than one request at a time, and thereby - * delegating de facto final scheduling decisions to the - * drive's internal scheduler, entails loss of control on the - * actual request service order. In particular, the critical - * situation is when requests from different processes happen - * to be present, at the same time, in the internal queue(s) - * of the drive. In such a situation, the drive, by deciding - * the service order of the internally-queued requests, does - * determine also the actual throughput distribution among - * these processes. But the drive typically has no notion or - * concern about per-process throughput distribution, and - * makes its decisions only on a per-request basis. Therefore, - * the service distribution enforced by the drive's internal - * scheduler is likely to coincide with the desired - * device-throughput distribution only in a completely - * symmetric scenario where: - * (i) each of these processes must get the same throughput as - * the others; - * (ii) the I/O of each process has the same properties, in - * terms of locality (sequential or random), direction - * (reads or writes), request sizes, greediness - * (from I/O-bound to sporadic), and so on. - * In fact, in such a scenario, the drive tends to treat - * the requests of each of these processes in about the same - * way as the requests of the others, and thus to provide - * each of these processes with about the same throughput - * (which is exactly the desired throughput distribution). In - * contrast, in any asymmetric scenario, device idling is - * certainly needed to guarantee that bfqq receives its - * assigned fraction of the device throughput (see [1] for - * details). - * The problem is that idling may significantly reduce - * throughput with certain combinations of types of I/O and - * devices. An important example is sync random I/O, on flash - * storage with command queueing. So, unless bfqq falls in the - * above cases where idling also boosts throughput, it would - * be important to check conditions (i) and (ii) accurately, - * so as to avoid idling when not strictly needed for service - * guarantees. 
- * - * Unfortunately, it is extremely difficult to thoroughly - * check condition (ii). And, in case there are active groups, - * it becomes very difficult to check condition (i) too. In - * fact, if there are active groups, then, for condition (i) - * to become false, it is enough that an active group contains - * more active processes or sub-groups than some other active - * group. More precisely, for condition (i) to hold because of - * such a group, it is not even necessary that the group is - * (still) active: it is sufficient that, even if the group - * has become inactive, some of its descendant processes still - * have some request already dispatched but still waiting for - * completion. In fact, requests have still to be guaranteed - * their share of the throughput even after being - * dispatched. In this respect, it is easy to show that, if a - * group frequently becomes inactive while still having - * in-flight requests, and if, when this happens, the group is - * not considered in the calculation of whether the scenario - * is asymmetric, then the group may fail to be guaranteed its - * fair share of the throughput (basically because idling may - * not be performed for the descendant processes of the group, - * but it had to be). We address this issue with the - * following bi-modal behavior, implemented in the function - * bfq_symmetric_scenario(). - * - * If there are groups with requests waiting for completion - * (as commented above, some of these groups may even be - * already inactive), then the scenario is tagged as - * asymmetric, conservatively, without checking any of the - * conditions (i) and (ii). So the device is idled for bfqq. - * This behavior matches also the fact that groups are created - * exactly if controlling I/O is a primary concern (to - * preserve bandwidth and latency guarantees). - * - * On the opposite end, if there are no groups with requests - * waiting for completion, then only condition (i) is actually - * controlled, i.e., provided that condition (i) holds, idling - * is not performed, regardless of whether condition (ii) - * holds. In other words, only if condition (i) does not hold, - * then idling is allowed, and the device tends to be - * prevented from queueing many requests, possibly of several - * processes. Since there are no groups with requests waiting - * for completion, then, to control condition (i) it is enough - * to check just whether all the queues with requests waiting - * for completion also have the same weight. - * - * Not checking condition (ii) evidently exposes bfqq to the - * risk of getting less throughput than its fair share. - * However, for queues with the same weight, a further - * mechanism, preemption, mitigates or even eliminates this - * problem. And it does so without consequences on overall - * throughput. This mechanism and its benefits are explained - * in the next three paragraphs. - * - * Even if a queue, say Q, is expired when it remains idle, Q - * can still preempt the new in-service queue if the next - * request of Q arrives soon (see the comments on - * bfq_bfqq_update_budg_for_activation). If all queues and - * groups have the same weight, this form of preemption, - * combined with the hole-recovery heuristic described in the - * comments on function bfq_bfqq_update_budg_for_activation, - * are enough to preserve a correct bandwidth distribution in - * the mid term, even without idling. 
In fact, even if not - * idling allows the internal queues of the device to contain - * many requests, and thus to reorder requests, we can rather - * safely assume that the internal scheduler still preserves a - * minimum of mid-term fairness. - * - * More precisely, this preemption-based, idleless approach - * provides fairness in terms of IOPS, and not sectors per - * second. This can be seen with a simple example. Suppose - * that there are two queues with the same weight, but that - * the first queue receives requests of 8 sectors, while the - * second queue receives requests of 1024 sectors. In - * addition, suppose that each of the two queues contains at - * most one request at a time, which implies that each queue - * always remains idle after it is served. Finally, after - * remaining idle, each queue receives very quickly a new - * request. It follows that the two queues are served - * alternatively, preempting each other if needed. This - * implies that, although both queues have the same weight, - * the queue with large requests receives a service that is - * 1024/8 times as high as the service received by the other - * queue. - * - * The motivation for using preemption instead of idling (for - * queues with the same weight) is that, by not idling, - * service guarantees are preserved (completely or at least in - * part) without minimally sacrificing throughput. And, if - * there is no active group, then the primary expectation for - * this device is probably a high throughput. - * - * We are now left only with explaining the additional - * compound condition that is checked below for deciding - * whether the scenario is asymmetric. To explain this - * compound condition, we need to add that the function - * bfq_symmetric_scenario checks the weights of only - * non-weight-raised queues, for efficiency reasons (see - * comments on bfq_weights_tree_add()). Then the fact that - * bfqq is weight-raised is checked explicitly here. More - * precisely, the compound condition below takes into account - * also the fact that, even if bfqq is being weight-raised, - * the scenario is still symmetric if all queues with requests - * waiting for completion happen to be - * weight-raised. Actually, we should be even more precise - * here, and differentiate between interactive weight raising - * and soft real-time weight raising. - * - * As a side note, it is worth considering that the above - * device-idling countermeasures may however fail in the - * following unlucky scenario: if idling is (correctly) - * disabled in a time period during which all symmetry - * sub-conditions hold, and hence the device is allowed to - * enqueue many requests, but at some later point in time some - * sub-condition stops to hold, then it may become impossible - * to let requests be served in the desired order until all - * the requests already queued in the device have been served. - */ - bool asymmetric_scenario = (bfqq->wr_coeff > 1 && - bfqd->wr_busy_queues < - bfq_tot_busy_queues(bfqd)) || + return (bfqq->wr_coeff > 1 && + bfqd->wr_busy_queues < + bfq_tot_busy_queues(bfqd)) || !bfq_symmetric_scenario(bfqd); - - /* - * Finally, there is a case where maximizing throughput is the - * best choice even if it may cause unfairness toward - * bfqq. Such a case is when bfqq became active in a burst of - * queue activations. Queues that became active during a large - * burst benefit only from throughput, as discussed in the - * comments on bfq_handle_burst. 
Thus, if bfqq became active - * in a burst and not idling the device maximizes throughput, - * then the device must no be idled, because not idling the - * device provides bfqq and all other queues in the burst with - * maximum benefit. Combining this and the above case, we can - * now establish when idling is actually needed to preserve - * service guarantees. - */ - return asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); } /* -- cgit v1.2.3-59-g8ed1b From d87447d84fe194b0e4f5413b5344dc82cc100711 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:33 +0100 Subject: block, bfq: fix sequential rq detection in rate estimation In bfq_update_peak_rate, to check whether an I/O request rq is sequential, only the seek distance of rq w.r.t. the last request dispatched is checked. This is not sufficient for non-rotational storage, where the size of rq is at least as relevant. This commit adds the missing check. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c1bb5e5fcdc4..12228af16198 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -235,6 +235,11 @@ static struct kmem_cache *bfq_pool; #define BFQQ_SEEK_THR (sector_t)(8 * 100) #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) +#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ + (get_sdist(last_pos, rq) > \ + BFQQ_SEEK_THR && \ + (!blk_queue_nonrot(bfqd->queue) || \ + blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) @@ -2754,7 +2759,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) if ((bfqd->rq_in_driver > 0 || now_ns - bfqd->last_completion < BFQ_MIN_TT) - && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) + && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) bfqd->sequential_samples++; bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); @@ -4511,10 +4516,7 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *rq) { bfqq->seek_history <<= 1; - bfqq->seek_history |= - get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && - (!blk_queue_nonrot(bfqd->queue) || - blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); + bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); } static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -- cgit v1.2.3-59-g8ed1b From 9dee8b3b057e1da26f85f1842f2aaf3bb200fb94 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:34 +0100 Subject: block, bfq: fix queue removal from weights tree bfq maintains an ordered list, through a red-black tree, of unique weights of active bfq_queues. This list is used to detect whether there are active queues with differentiated weights. The weight of a queue is removed from the list when both the following two conditions become true: (1) the bfq_queue is flagged as inactive, and (2) the queue has no in-flight requests any longer. Unfortunately, in the rare cases where condition (2) becomes true before condition (1), the removal fails, because the function to remove the weight of the queue (bfq_weights_tree_remove) is rightly invoked in the path that deactivates the bfq_queue, but mistakenly invoked *before* the function that actually performs the deactivation (bfq_deactivate_bfqq). This commit moves the invocation of bfq_weights_tree_remove for condition (1) to after bfq_deactivate_bfqq.
As a consequence of this move, it is necessary to add a further reference to the queue when the weight of a queue is added, because the queue might otherwise be freed before bfq_weights_tree_remove is invoked. This commit adds this reference and makes all related modifications. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 17 +++++++++++++---- block/bfq-wf2q.c | 6 +++--- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 12228af16198..bf585ad29bb5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -754,6 +754,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, inc_counter: bfqq->weight_counter->num_active++; + bfqq->ref++; } /* @@ -778,6 +779,7 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd, reset_entity_pointer: bfqq->weight_counter = NULL; + bfq_put_queue(bfqq); } /* @@ -789,9 +791,6 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, { struct bfq_entity *entity = bfqq->entity.parent; - __bfq_weights_tree_remove(bfqd, bfqq, - &bfqd->queue_weights_tree); - for_each_entity(entity) { struct bfq_sched_data *sd = entity->my_sched_data; @@ -825,6 +824,15 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, bfqd->num_groups_with_pending_reqs--; } } + + /* + * Next function is invoked last, because it causes bfqq to be + * freed if the following holds: bfqq is not in service and + * has no dispatched request. DO NOT use bfqq after the next + * function invocation. + */ + __bfq_weights_tree_remove(bfqd, bfqq, + &bfqd->queue_weights_tree); } /* @@ -1020,7 +1028,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, static int bfqq_process_refs(struct bfq_queue *bfqq) { - return bfqq->ref - bfqq->allocated - bfqq->entity.on_st; + return bfqq->ref - bfqq->allocated - bfqq->entity.on_st - + (bfqq->weight_counter != NULL); } /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index ce37d709a34f..63311d1ff1ed 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1673,15 +1673,15 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->busy_queues[bfqq->ioprio_class - 1]--; - if (!bfqq->dispatched) - bfq_weights_tree_remove(bfqd, bfqq); - if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; bfqg_stats_update_dequeue(bfqq_group(bfqq)); bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); + + if (!bfqq->dispatched) + bfq_weights_tree_remove(bfqd, bfqq); } /* -- cgit v1.2.3-59-g8ed1b From a3c92560324bd616deaecb6842b2a0337a80ad8b Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:35 +0100 Subject: block, bfq: reduce threshold for detecting command queueing bfq borrowed from cfq a simple heuristic for detecting whether the drive performs command queueing: check whether the average number of in-flight requests is above a given threshold. Unfortunately, this heuristic fails to detect queueing (on drives with queueing) if the processes doing I/O are few and issue I/O with a low depth. To reduce false negatives, this commit lowers the threshold.
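For illustration, a compact userspace model of the heuristic (the names and structure are ours, not the bfq code): samples are collected only when enough requests are outstanding, and queueing is declared supported if the observed peak depth exceeds the threshold.

#include <stdbool.h>

#define HW_QUEUE_THRESHOLD 3	/* lowered from 4 by this patch */
#define HW_QUEUE_SAMPLES   32

struct hw_tag_state {
	int samples;
	int max_in_driver;
	bool hw_tag;
};

static void update_hw_tag(struct hw_tag_state *s, int in_driver, int queued)
{
	if (in_driver > s->max_in_driver)
		s->max_in_driver = in_driver;

	/* not enough outstanding I/O to judge the drive */
	if (in_driver + queued <= HW_QUEUE_THRESHOLD)
		return;

	if (++s->samples < HW_QUEUE_SAMPLES)
		return;

	/* enough evidence: decide, then restart the sampling window */
	s->hw_tag = s->max_in_driver > HW_QUEUE_THRESHOLD;
	s->samples = 0;
	s->max_in_driver = 0;
}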
Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index bf585ad29bb5..48b579032d14 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -230,7 +230,7 @@ static struct kmem_cache *bfq_pool; #define BFQ_MIN_TT (2 * NSEC_PER_MSEC) /* hw_tag detection: parallel requests threshold and min samples needed. */ -#define BFQ_HW_QUEUE_THRESHOLD 4 +#define BFQ_HW_QUEUE_THRESHOLD 3 #define BFQ_HW_QUEUE_SAMPLES 32 #define BFQQ_SEEK_THR (sector_t)(8 * 100) @@ -4798,7 +4798,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) * sum is not exact, as it's not taking into account deactivated * requests. */ - if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) + if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) return; if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -- cgit v1.2.3-59-g8ed1b From b3c3498112ffafa5f613bb482f9723996bfd5e4f Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:36 +0100 Subject: block, bfq: port commit "cfq-iosched: improve hw_tag detection" The original commit is commit 1a1238a7dd48 ("cfq-iosched: improve hw_tag detection") and has the following commit message: If active queue hasn't enough requests and idle window opens, cfq will not dispatch sufficient requests to hardware. In such situation, current code will zero hw_tag. But this is because cfq doesn't dispatch enough requests instead of hardware queue doesn't work. Don't zero hw_tag in such case. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 48b579032d14..2ab53d93ba12 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4786,6 +4786,8 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, static void bfq_update_hw_tag(struct bfq_data *bfqd) { + struct bfq_queue *bfqq = bfqd->in_service_queue; + bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, bfqd->rq_in_driver); @@ -4801,6 +4803,17 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) return; + /* + * If active queue hasn't enough requests and can idle, bfq might not + * dispatch sufficient requests to hardware. Don't zero hw_tag in this + * case + */ + if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && + bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < + BFQ_HW_QUEUE_THRESHOLD && + bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) + return; + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) return; -- cgit v1.2.3-59-g8ed1b From 02a6d787f4afc8be2d3d52ab0a1df0c6a2d99e7b Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:37 +0100 Subject: block, bfq: do not overcharge writes in asymmetric scenarios Writes tend to starve reads. bfq counters this problem by overcharging writes with an inflated service w.r.t. the actual service (number of sectors written) they receive. Yet this overcharging is useless, and actually causes unfairness in the opposite direction, when bfq happens to be enforcing strong I/O control. bfq enforces such control when the scenario is asymmetric, i.e., when some bfq_queue or group of bfq_queues is to be granted a different bandwidth than some other bfq_queue or group of bfq_queues. So, in such a scenario, this commit disables write overcharging.
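A sketch of the charging rule after this patch (simplified, names and the factor value are illustrative, not the exact bfq code): only async writes in symmetric scenarios keep the inflated charge; everything else is charged its real size.

static unsigned long serv_to_charge(unsigned long sectors, int sync,
				    int weight_raised, int symmetric)
{
	const unsigned long async_charge_factor = 10; /* illustrative value */

	/* reads, weight-raised queues, and asymmetric scenarios:
	 * charge the real service, so fairness stays sector-accurate */
	if (sync || weight_raised || !symmetric)
		return sectors;

	/* symmetric scenario: overcharge async writes to protect reads */
	return sectors * async_charge_factor;
}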
Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2ab53d93ba12..06268449d2ca 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -888,7 +888,8 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, static unsigned long bfq_serv_to_charge(struct request *rq, struct bfq_queue *bfqq) { - if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) + if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || + !bfq_symmetric_scenario(bfqq->bfqd)) return blk_rq_sectors(rq); return blk_rq_sectors(rq) * bfq_async_charge_factor; -- cgit v1.2.3-59-g8ed1b From 058fdecc6de7cdecbf4c59b851e80eb2d6c5295f Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 29 Jan 2019 12:06:38 +0100 Subject: block, bfq: fix in-service-queue check for queue merging When a new I/O request arrives for a bfq_queue, say Q, bfq checks whether that request is close to (a) the head request of some other queue waiting to be served, or (b) the last request dispatched for the in-service queue (in case Q itself is not the in-service queue) If a queue, say Q2, is found for which the above condition holds, then bfq merges Q and Q2, to hopefully get a more sequential I/O in the resulting merged queue, and thus a possibly higher throughput. Case (b) is checked by comparing the new request for Q with the last request dispatched, assuming that the latter necessarily belonged to the in-service queue. Unfortunately, this assumption is no longer always correct, since commit d0edc2473be9 ("block, bfq: inject other-queue I/O into seeky idle queues on NCQ flash"). When the assumption does not hold, queues that must not be merged may be merged, causing unexpected loss of control on per-queue service guarantees. This commit solves this problem by adding an extra field, which stores the actual last request dispatched for the in-service queue, and by using this new field to correctly check case (b). 
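As an illustration of the resulting logic, a compact userspace model (field and function names are ours): the last dispatched position of the in-service queue is tracked separately and used, rather than the global last position, for the closeness check.

struct sched_pos {
	unsigned long long last_pos;         /* last dispatch, any queue */
	unsigned long long in_serv_last_pos; /* last dispatch, in-service queue */
};

static void on_dispatch(struct sched_pos *d, int from_in_service,
			unsigned long long pos, unsigned int sectors)
{
	d->last_pos = pos + sectors;
	if (from_in_service)
		d->in_serv_last_pos = d->last_pos;
}

static int close_to_in_service(const struct sched_pos *d,
			       unsigned long long pos)
{
	unsigned long long dist = pos > d->in_serv_last_pos ?
		pos - d->in_serv_last_pos : d->in_serv_last_pos - pos;

	return dist <= 8 * 1024;	/* cf. BFQQ_CLOSE_THR, in sectors */
}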
Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 5 ++++- block/bfq-iosched.h | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 06268449d2ca..4c592496a16a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2251,7 +2251,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (in_service_bfqq && in_service_bfqq != bfqq && likely(in_service_bfqq != &bfqd->oom_bfqq) && - bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + bfq_rq_close_to_sector(io_struct, request, + bfqd->in_serv_last_pos) && bfqq->entity.parent == in_service_bfqq->entity.parent && bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); @@ -2791,6 +2792,8 @@ update_rate_and_reset: bfq_update_rate_reset(bfqd, rq); update_last_values: bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + if (RQ_BFQQ(rq) == bfqd->in_service_queue) + bfqd->in_serv_last_pos = bfqd->last_position; bfqd->last_dispatch = now_ns; } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 30be669be465..062e1c4787f4 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -538,6 +538,9 @@ struct bfq_data { /* on-disk position of the last served request */ sector_t last_position; + /* position of the last served request for the in-service queue */ + sector_t in_serv_last_pos; + /* time of last request completion (ns) */ u64 last_completion; -- cgit v1.2.3-59-g8ed1b From 8ccdf4a3775229314c8bd365ac88c2cbdf36be13 Mon Sep 17 00:00:00 2001 From: Jianchao Wang Date: Thu, 24 Jan 2019 18:25:32 +0800 Subject: blk-mq: save queue mapping result into ctx directly Currently, the queue mapping result is saved in a two-dimensional array. In the hot path, to get a hctx, we need do following: q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]] This isn't very efficient. 
We could save the queue mapping result into ctx directly with different hctx type, like, ctx->hctxs[type] Signed-off-by: Jianchao Wang Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 2 +- block/blk-mq-tag.c | 2 +- block/blk-mq.c | 4 ++-- block/blk-mq.h | 7 ++++--- block/blk.h | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 140933e4a7d1..40905539afed 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -321,7 +321,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); bool ret = false; enum hctx_type type; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2089c6c62f44..a4931fc7be8a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -170,7 +170,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) data->ctx = blk_mq_get_ctx(data->q); data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, - data->ctx->cpu); + data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = &tags->breserved_tags; diff --git a/block/blk-mq.c b/block/blk-mq.c index 8f5b533764ca..445d0a2642ae 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -364,7 +364,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, } if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->cmd_flags, - data->ctx->cpu); + data->ctx); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; @@ -2435,7 +2435,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) continue; hctx = blk_mq_map_queue_type(q, j, i); - + ctx->hctxs[j] = hctx; /* * If the CPU is already set in the mask, then we've * mapped this one already. 
This can happen if diff --git a/block/blk-mq.h b/block/blk-mq.h index d943d46b0785..9fb06261518e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -23,6 +23,7 @@ struct blk_mq_ctx { unsigned int cpu; unsigned short index_hw[HCTX_MAX_TYPES]; + struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; /* incremented at dispatch time */ unsigned long rq_dispatched[2]; @@ -97,11 +98,11 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @q: request queue * @flags: request command flags - * @cpu: CPU + * @cpu: cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, unsigned int flags, - unsigned int cpu) + struct blk_mq_ctx *ctx) { enum hctx_type type = HCTX_TYPE_DEFAULT; @@ -116,7 +117,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, q->tag_set->map[HCTX_TYPE_READ].nr_queues) type = HCTX_TYPE_READ; - return blk_mq_map_queue_type(q, type, cpu); + return ctx->hctxs[type]; } /* diff --git a/block/blk.h b/block/blk.h index 848278c52030..5d636ee41663 100644 --- a/block/blk.h +++ b/block/blk.h @@ -38,7 +38,7 @@ extern struct ida blk_queue_ida; static inline struct blk_flush_queue * blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) { - return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq; + return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; } static inline void __blk_get_queue(struct request_queue *q) -- cgit v1.2.3-59-g8ed1b From bb94aea1444b9859faa8d72aff3713fcd11c6696 Mon Sep 17 00:00:00 2001 From: Jianchao Wang Date: Thu, 24 Jan 2019 18:25:33 +0800 Subject: blk-mq: save default hctx into ctx->hctxs for not-supported type Currently, we check whether the hctx type is supported every time in hot path. Actually, this is not necessary, we could save the default hctx into ctx->hctxs if the type is not supported when map swqueues and use it directly with ctx->hctxs[type]. We also needn't check whether the poll is enabled or not, because the caller would clear the REQ_HIPRI in that case. Signed-off-by: Jianchao Wang Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 ++++++++- block/blk-mq.h | 13 +++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 445d0a2642ae..8a825aebc6b5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2431,8 +2431,11 @@ static void blk_mq_map_swqueue(struct request_queue *q) ctx = per_cpu_ptr(q->queue_ctx, i); for (j = 0; j < set->nr_maps; j++) { - if (!set->map[j].nr_queues) + if (!set->map[j].nr_queues) { + ctx->hctxs[j] = blk_mq_map_queue_type(q, + HCTX_TYPE_DEFAULT, i); continue; + } hctx = blk_mq_map_queue_type(q, j, i); ctx->hctxs[j] = hctx; @@ -2455,6 +2458,10 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ BUG_ON(!hctx->nr_ctx); } + + for (; j < HCTX_MAX_TYPES; j++) + ctx->hctxs[j] = blk_mq_map_queue_type(q, + HCTX_TYPE_DEFAULT, i); } mutex_unlock(&q->sysfs_lock); diff --git a/block/blk-mq.h b/block/blk-mq.h index 9fb06261518e..14b7efb94f75 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -106,15 +106,12 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, { enum hctx_type type = HCTX_TYPE_DEFAULT; - if ((flags & REQ_HIPRI) && - q->tag_set->nr_maps > HCTX_TYPE_POLL && - q->tag_set->map[HCTX_TYPE_POLL].nr_queues && - test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + /* + * The caller ensure that if REQ_HIPRI, poll must be enabled. 
+ */ + if (flags & REQ_HIPRI) type = HCTX_TYPE_POLL; - - else if (((flags & REQ_OP_MASK) == REQ_OP_READ) && - q->tag_set->nr_maps > HCTX_TYPE_READ && - q->tag_set->map[HCTX_TYPE_READ].nr_queues) + else if ((flags & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; return ctx->hctxs[type]; -- cgit v1.2.3-59-g8ed1b From 6e02318eaea53eaafe628c4ffc254f57b2704561 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 17 Dec 2018 22:42:03 -0500 Subject: nvme: add support for the Write Zeroes command Allow write zeroes operations (REQ_OP_WRITE_ZEROES) on the block device, if the device supports an optional command bit set for write zeroes. Add support to setup write zeroes command. Set maximum possible write zeroes sectors in one write zeroes command according to nvme write zeroes command definition. This patch was posted as a part of block-write-zeroes support implementation (https://patchwork.kernel.org/patch/9454859/), but did not make into mainline kernel as it got reverted due to failure on the Linus's machine. In this patch in order to be more cautious, we use NVMe controller's maximum hardware sector size which is calculated based on the controller's MDTS (Maximum Data Transfer Size) field to calculate the maximum sectors for the write zeroes request. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen [folded a fix from Keith Busch to properly respect NVME_QUIRK_DEALLOCATE_ZEROES] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 150e49723c15..5c2d2f1e7261 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -611,6 +611,22 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, return BLK_STS_OK; } +static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd) +{ + if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) + return nvme_setup_discard(ns, req, cmnd); + + cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes; + cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id); + cmnd->write_zeroes.slba = + cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->write_zeroes.length = + cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + cmnd->write_zeroes.control = 0; + return BLK_STS_OK; +} + static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd) { @@ -705,7 +721,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, nvme_setup_flush(ns, cmd); break; case REQ_OP_WRITE_ZEROES: - /* currently only aliased to deallocate for a few ctrls: */ + ret = nvme_setup_write_zeroes(ns, req, cmd); + break; case REQ_OP_DISCARD: ret = nvme_setup_discard(ns, req, cmd); break; @@ -1509,6 +1526,37 @@ static void nvme_config_discard(struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } +static inline void nvme_config_write_zeroes(struct nvme_ns *ns) +{ + u32 max_sectors; + unsigned short bs = 1 << ns->lba_shift; + + if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES)) + return; + /* + * Even though NVMe spec explicitly states that MDTS is not + * applicable to the write-zeroes:- "The restriction does not apply to + * commands that do not transfer data between the host and the + * controller (e.g., Write Uncorrectable ro Write Zeroes command).". 
+ * In order to be more cautious use controller's max_hw_sectors value + * to configure the maximum sectors for the write-zeroes which is + * configured based on the controller's MDTS field in the + * nvme_init_identify() if available. + */ + if (ns->ctrl->max_hw_sectors == UINT_MAX) + max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9; + else + max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9; + + blk_queue_max_write_zeroes_sectors(ns->queue, max_sectors); +} + +static inline void nvme_ns_config_oncs(struct nvme_ns *ns) +{ + nvme_config_discard(ns); + nvme_config_write_zeroes(ns); +} + static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, struct nvme_id_ns *id, struct nvme_ns_ids *ids) { @@ -1562,7 +1610,7 @@ static void nvme_update_disk_info(struct gendisk *disk, capacity = 0; set_capacity(disk, capacity); - nvme_config_discard(ns); + nvme_ns_config_oncs(ns); if (id->nsattr & (1 << 0)) set_disk_ro(disk, true); -- cgit v1.2.3-59-g8ed1b From 794a4cb3d2f7c105112f9b59525533f00ac06006 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 1 Jan 2019 00:19:30 -0800 Subject: nvme: remove the .stop_ctrl callout It is used now just to flush error recovery and reconnect work items in the RDMA and TCP transports, which can simply be moved to the corresponding teardown routines. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 -- drivers/nvme/host/nvme.h | 1 - drivers/nvme/host/rdma.c | 12 +++--------- drivers/nvme/host/tcp.c | 10 +++------- 4 files changed, 6 insertions(+), 19 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5c2d2f1e7261..c365f0aa9433 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3639,8 +3639,6 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fw_act_work); - if (ctrl->ops->stop_ctrl) - ctrl->ops->stop_ctrl(ctrl); } EXPORT_SYMBOL_GPL(nvme_stop_ctrl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ab961bdeea89..a0cc733c753e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -363,7 +363,6 @@ struct nvme_ctrl_ops { void (*submit_async_event)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); - void (*stop_ctrl)(struct nvme_ctrl *ctrl); }; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 52abc3a6de12..ac365366c2ec 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -942,14 +942,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, } } -static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - - cancel_work_sync(&ctrl->err_work); - cancel_delayed_work_sync(&ctrl->reconnect_work); -} - static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); @@ -1854,6 +1846,9 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { + cancel_work_sync(&ctrl->err_work); + cancel_delayed_work_sync(&ctrl->reconnect_work); + nvme_rdma_teardown_io_queues(ctrl, shutdown); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); @@ -1902,7 +1897,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .submit_async_event = nvme_rdma_submit_async_event, .delete_ctrl = 
nvme_rdma_delete_ctrl, .get_address = nvmf_get_address, - .stop_ctrl = nvme_rdma_stop_ctrl, }; /* diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 5f0a00425242..208ee518af65 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1822,6 +1822,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) { + cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work); + cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); + nvme_tcp_teardown_io_queues(ctrl, shutdown); if (shutdown) nvme_shutdown_ctrl(ctrl); @@ -1859,12 +1862,6 @@ out_fail: nvme_tcp_reconnect_or_remove(ctrl); } -static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl) -{ - cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work); - cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); -} - static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); @@ -2115,7 +2112,6 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { .submit_async_event = nvme_tcp_submit_async_event, .delete_ctrl = nvme_tcp_delete_ctrl, .get_address = nvmf_get_address, - .stop_ctrl = nvme_tcp_stop_ctrl, }; static bool -- cgit v1.2.3-59-g8ed1b From f1e5b6239bdd46aa3f4e631611800ea7d10826c4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 7 Jan 2019 11:45:38 -0600 Subject: md-linear: use struct_size() in kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kzalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Song Liu --- drivers/md/md-linear.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index d45c697c0ebe..5998d78aa189 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -96,8 +96,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) int i, cnt; bool discard_supported = false; - conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), - GFP_KERNEL); + conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL); if (!conf) return NULL; -- cgit v1.2.3-59-g8ed1b From ebda52fa1be73952ec603b1fad685ce86ccb5ee6 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Fri, 1 Feb 2019 10:45:01 +0800 Subject: raid1: simplify raid1_error function Remove the redundant set_bit() call and simplify the code.
Signed-off-by: Yufen Yu Signed-off-by: Song Liu --- drivers/md/raid1.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1d54109071cc..7e63ccc4ae7b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1603,11 +1603,9 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) return; } set_bit(Blocked, &rdev->flags); - if (test_and_clear_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) mddev->degraded++; - set_bit(Faulty, &rdev->flags); - } else - set_bit(Faulty, &rdev->flags); + set_bit(Faulty, &rdev->flags); spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. -- cgit v1.2.3-59-g8ed1b From f25191bb322dec8fa2979ecb8235643aa42470e1 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 6 Feb 2019 21:13:49 -0800 Subject: cdrom: Fix race condition in cdrom_sysctl_register The following traceback is sometimes seen when booting an image in qemu: [ 54.608293] cdrom: Uniform CD-ROM driver Revision: 3.20 [ 54.611085] Fusion MPT base driver 3.04.20 [ 54.611877] Copyright (c) 1999-2008 LSI Corporation [ 54.616234] Fusion MPT SAS Host driver 3.04.20 [ 54.635139] sysctl duplicate entry: /dev/cdrom//info [ 54.639578] CPU: 0 PID: 266 Comm: kworker/u4:5 Not tainted 5.0.0-rc5 #1 [ 54.639578] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 [ 54.641273] Workqueue: events_unbound async_run_entry_fn [ 54.641273] Call Trace: [ 54.641273] dump_stack+0x67/0x90 [ 54.641273] __register_sysctl_table+0x50b/0x570 [ 54.641273] ? rcu_read_lock_sched_held+0x6f/0x80 [ 54.641273] ? kmem_cache_alloc_trace+0x1c7/0x1f0 [ 54.646814] __register_sysctl_paths+0x1c8/0x1f0 [ 54.646814] cdrom_sysctl_register.part.7+0xc/0x5f [ 54.646814] register_cdrom.cold.24+0x2a/0x33 [ 54.646814] sr_probe+0x4bd/0x580 [ 54.646814] ? __driver_attach+0xd0/0xd0 [ 54.646814] really_probe+0xd6/0x260 [ 54.646814] ? __driver_attach+0xd0/0xd0 [ 54.646814] driver_probe_device+0x4a/0xb0 [ 54.646814] ? __driver_attach+0xd0/0xd0 [ 54.646814] bus_for_each_drv+0x73/0xc0 [ 54.646814] __device_attach+0xd6/0x130 [ 54.646814] bus_probe_device+0x9a/0xb0 [ 54.646814] device_add+0x40c/0x670 [ 54.646814] ? __pm_runtime_resume+0x4f/0x80 [ 54.646814] scsi_sysfs_add_sdev+0x81/0x290 [ 54.646814] scsi_probe_and_add_lun+0x888/0xc00 [ 54.646814] ? scsi_autopm_get_host+0x21/0x40 [ 54.646814] __scsi_add_device+0x116/0x130 [ 54.646814] ata_scsi_scan_host+0x93/0x1c0 [ 54.646814] async_run_entry_fn+0x34/0x100 [ 54.646814] process_one_work+0x237/0x5e0 [ 54.646814] worker_thread+0x37/0x380 [ 54.646814] ? rescuer_thread+0x360/0x360 [ 54.646814] kthread+0x118/0x130 [ 54.646814] ? kthread_create_on_node+0x60/0x60 [ 54.646814] ret_from_fork+0x3a/0x50 The only sensible explanation is that cdrom_sysctl_register() is called twice, once from the module init function and once from register_cdrom(). cdrom_sysctl_register() is not mutex protected and may happily execute twice if the second call is made before the first call is complete. Use a static atomic to ensure that the function is executed exactly once. 
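A userspace model of the once-only pattern used in the fix (names are ours; the kernel uses atomic_add_unless(), modeled here with a C11 compare-and-exchange): the counter moves 0 to 1 exactly once, so a second concurrent caller bails out instead of registering twice.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int initialized;

static bool register_once(void)
{
	int expected = 0;

	/* analogous to atomic_add_unless(&initialized, 1, 1) */
	if (!atomic_compare_exchange_strong(&initialized, &expected, 1))
		return false;	/* someone else already did (or is doing) it */

	/* ... perform the one-time registration here ... */
	return true;
}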
Signed-off-by: Guenter Roeck Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 614ecdbb4ab7..933268b8d6a5 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -265,6 +265,7 @@ /* #define ERRLOGMASK (CD_WARNING|CD_OPEN|CD_COUNT_TRACKS|CD_CLOSE) */ /* #define ERRLOGMASK (CD_WARNING|CD_REG_UNREG|CD_DO_IOCTL|CD_OPEN|CD_CLOSE|CD_COUNT_TRACKS) */ +#include #include #include #include @@ -3692,9 +3693,9 @@ static struct ctl_table_header *cdrom_sysctl_header; static void cdrom_sysctl_register(void) { - static int initialized; + static atomic_t initialized = ATOMIC_INIT(0); - if (initialized == 1) + if (!atomic_add_unless(&initialized, 1, 1)) return; cdrom_sysctl_header = register_sysctl_table(cdrom_root_table); @@ -3705,8 +3706,6 @@ static void cdrom_sysctl_register(void) cdrom_sysctl_settings.debug = debug; cdrom_sysctl_settings.lock = lockdoor; cdrom_sysctl_settings.check = check_media_type; - - initialized = 1; } static void cdrom_sysctl_unregister(void) -- cgit v1.2.3-59-g8ed1b From e5fa81408fb43ebabde65938ef8b20ae879017e7 Mon Sep 17 00:00:00 2001 From: Aleksei Zakharov Date: Fri, 8 Feb 2019 19:14:05 +0300 Subject: block: avoid setting nr_requests to current value There's no reason to freeze queue and set nr_requests value if current value is the same. Signed-off-by: Aleksei Zakharov Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8a825aebc6b5..44d471ff8754 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3089,6 +3089,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!set) return -EINVAL; + if (q->nr_requests == nr) + return 0; + blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); -- cgit v1.2.3-59-g8ed1b From 9951379b0ca88c95876ad9778b9099e19a95d566 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 9 Feb 2019 12:52:53 +0800 Subject: bcache: never writeback a discard operation Some users see panics like the following when performing fstrim on a bcached volume: [ 529.803060] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 [ 530.183928] #PF error: [normal kernel read fault] [ 530.412392] PGD 8000001f42163067 P4D 8000001f42163067 PUD 1f42168067 PMD 0 [ 530.750887] Oops: 0000 [#1] SMP PTI [ 530.920869] CPU: 10 PID: 4167 Comm: fstrim Kdump: loaded Not tainted 5.0.0-rc1+ #3 [ 531.290204] Hardware name: HP ProLiant DL360 Gen9/ProLiant DL360 Gen9, BIOS P89 12/27/2015 [ 531.693137] RIP: 0010:blk_queue_split+0x148/0x620 [ 531.922205] Code: 60 38 89 55 a0 45 31 db 45 31 f6 45 31 c9 31 ff 89 4d 98 85 db 0f 84 7f 04 00 00 44 8b 6d 98 4c 89 ee 48 c1 e6 04 49 03 70 78 <8b> 46 08 44 8b 56 0c 48 8b 16 44 29 e0 39 d8 48 89 55 a8 0f 47 c3 [ 532.838634] RSP: 0018:ffffb9b708df39b0 EFLAGS: 00010246 [ 533.093571] RAX: 00000000ffffffff RBX: 0000000000046000 RCX: 0000000000000000 [ 533.441865] RDX: 0000000000000200 RSI: 0000000000000000 RDI: 0000000000000000 [ 533.789922] RBP: ffffb9b708df3a48 R08: ffff940d3b3fdd20 R09: 0000000000000000 [ 534.137512] R10: ffffb9b708df3958 R11: 0000000000000000 R12: 0000000000000000 [ 534.485329] R13: 0000000000000000 R14: 0000000000000000 R15: ffff940d39212020 [ 534.833319] FS: 00007efec26e3840(0000) GS:ffff940d1f480000(0000) knlGS:0000000000000000 [ 535.224098] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 535.504318] CR2: 0000000000000008 CR3: 0000001f4e256004 CR4: 00000000001606e0 [ 
535.851759] Call Trace: [ 535.970308] ? mempool_alloc_slab+0x15/0x20 [ 536.174152] ? bch_data_insert+0x42/0xd0 [bcache] [ 536.403399] blk_mq_make_request+0x97/0x4f0 [ 536.607036] generic_make_request+0x1e2/0x410 [ 536.819164] submit_bio+0x73/0x150 [ 536.980168] ? submit_bio+0x73/0x150 [ 537.149731] ? bio_associate_blkg_from_css+0x3b/0x60 [ 537.391595] ? _cond_resched+0x1a/0x50 [ 537.573774] submit_bio_wait+0x59/0x90 [ 537.756105] blkdev_issue_discard+0x80/0xd0 [ 537.959590] ext4_trim_fs+0x4a9/0x9e0 [ 538.137636] ? ext4_trim_fs+0x4a9/0x9e0 [ 538.324087] ext4_ioctl+0xea4/0x1530 [ 538.497712] ? _copy_to_user+0x2a/0x40 [ 538.679632] do_vfs_ioctl+0xa6/0x600 [ 538.853127] ? __do_sys_newfstat+0x44/0x70 [ 539.051951] ksys_ioctl+0x6d/0x80 [ 539.212785] __x64_sys_ioctl+0x1a/0x20 [ 539.394918] do_syscall_64+0x5a/0x110 [ 539.568674] entry_SYSCALL_64_after_hwframe+0x44/0xa9 We have observed it where both: 1) LVM/devmapper is involved (bcache backing device is LVM volume) and 2) writeback cache is involved (bcache cache_mode is writeback) On one machine, we can reliably reproduce it with: # echo writeback > /sys/block/bcache0/bcache/cache_mode (not sure whether above line is required) # mount /dev/bcache0 /test # for i in {0..10}; do file="$(mktemp /test/zero.XXX)" dd if=/dev/zero of="$file" bs=1M count=256 sync rm $file done # fstrim -v /test Observing this with tracepoints on, we see the following writes: fstrim-18019 [022] .... 91107.302026: bcache_write: 73f95583-561c-408f-a93a-4cbd2498f5c8 inode 0 DS 4260112 + 196352 hit 0 bypass 1 fstrim-18019 [022] .... 91107.302050: bcache_write: 73f95583-561c-408f-a93a-4cbd2498f5c8 inode 0 DS 4456464 + 262144 hit 0 bypass 1 fstrim-18019 [022] .... 91107.302075: bcache_write: 73f95583-561c-408f-a93a-4cbd2498f5c8 inode 0 DS 4718608 + 81920 hit 0 bypass 1 fstrim-18019 [022] .... 91107.302094: bcache_write: 73f95583-561c-408f-a93a-4cbd2498f5c8 inode 0 DS 5324816 + 180224 hit 0 bypass 1 fstrim-18019 [022] .... 91107.302121: bcache_write: 73f95583-561c-408f-a93a-4cbd2498f5c8 inode 0 DS 5505040 + 262144 hit 0 bypass 1 fstrim-18019 [022] .... 91107.302145: bcache_write: 73f95583-561c-408f-a93a-4cbd2498f5c8 inode 0 DS 5767184 + 81920 hit 0 bypass 1 fstrim-18019 [022] .... 91107.308777: bcache_write: 73f95583-561c-408f-a93a-4cbd2498f5c8 inode 0 DS 6373392 + 180224 hit 1 bypass 0 Note the final one has different hit/bypass flags. This is because in should_writeback(), we were hitting a case where the partial stripe condition was returning true and so should_writeback() was returning true early. If that hadn't been the case, it would have hit the would_skip test, and as would_skip == s->iop.bypass == true, should_writeback() would have returned false. Looking at the git history from 'commit 72c270612bd3 ("bcache: Write out full stripes")', it looks like the idea was to optimise for raid5/6: * If a stripe is already dirty, force writes to that stripe to writeback mode - to help build up full stripes of dirty data To fix this issue, make sure that should_writeback() on a discard op never returns true. 
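A sketch of the check order after the fix (simplified; parameter names are ours): a discard bio can never be written back, so it is rejected before the partial-stripe shortcut gets a chance to return true for it.

#include <stdbool.h>

static inline bool should_writeback_sketch(bool is_discard,
					   bool partial_stripe_dirty,
					   bool would_skip)
{
	if (is_discard)
		return false;	/* the new early return */
	if (partial_stripe_dirty)
		return true;	/* full-stripe optimization shortcut */
	return !would_skip;
}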
More details of debugging: https://www.spinics.net/lists/linux-bcache/msg06996.html Previous reports: - https://bugzilla.kernel.org/show_bug.cgi?id=201051 - https://bugzilla.kernel.org/show_bug.cgi?id=196103 - https://www.spinics.net/lists/linux-bcache/msg06885.html (Coly Li: minor modification to follow maximum 75 chars per line rule) Cc: Kent Overstreet Cc: stable@vger.kernel.org Fixes: 72c270612bd3 ("bcache: Write out full stripes") Signed-off-by: Daniel Axtens Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 6a743d3bb338..4e4c6810dc3c 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -71,6 +71,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, in_use > bch_cutoff_writeback_sync) return false; + if (bio_op(bio) == REQ_OP_DISCARD) + return false; + if (dc->partial_stripes_expensive && bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector, bio_sectors(bio))) -- cgit v1.2.3-59-g8ed1b From 83ff9318c44babe32da0947d81443bad82da2d44 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:52:54 +0800 Subject: bcache: not use hard coded memset size in bch_cache_accounting_clear() In stats.c:bch_cache_accounting_clear(), a hard coded number '7' is used in memset(). It is because in struct cache_stats there are 7 atomic_t type members. This is not good: when new members are added into struct cache_stats, the hard coded number will only clear part of the memory. This patch replaces 'sizeof(unsigned long) * 7' by the more generic 'sizeof(struct cache_stats)', to avoid potential errors if new members are added into struct cache_stats. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 894410f3f829..ba1c93791d8d 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -111,7 +111,7 @@ void bch_cache_accounting_clear(struct cache_accounting *acc) { memset(&acc->total.cache_hits, 0, - sizeof(unsigned long) * 7); + sizeof(struct cache_stats)); } void bch_cache_accounting_destroy(struct cache_accounting *acc) -- cgit v1.2.3-59-g8ed1b From 926d19465b66cb6bde4ca28fde16de775af4e357 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:52:55 +0800 Subject: bcache: export backing_dev_name via sysfs This patch exports dc->backing_dev_name to the sysfs file /sys/block/bcache/bcache/backing_dev_name, so that people or user-space tools may know the backing device name of this bcache device. Of course this can be done by parsing sysfs links, but this method is much simpler for finding the link between a bcache device and its backing device.
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 557a8a3270a1..b9166ee027fa 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -67,6 +67,7 @@ read_attribute(written); read_attribute(btree_written); read_attribute(metadata_written); read_attribute(active_journal_entries); +read_attribute(backing_dev_name); sysfs_time_stats_attribute(btree_gc, sec, ms); sysfs_time_stats_attribute(btree_split, sec, us); @@ -243,6 +244,12 @@ SHOW(__bch_cached_dev) return strlen(buf); } + if (attr == &sysfs_backing_dev_name) { + snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name); + strcat(buf, "\n"); + return strlen(buf); + } + #undef var return 0; } @@ -452,6 +459,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_verify, &sysfs_bypass_torture_test, #endif + &sysfs_backing_dev_name, NULL }; KTYPE(bch_cached_dev); -- cgit v1.2.3-59-g8ed1b From d4610456cfa412811b749f6215b9adae976ab4c3 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:52:56 +0800 Subject: bcache: export backing_dev_uuid via sysfs When there are multiple bcache devices, after a reboot the name of bcache devices may change (e.g. current /dev/bcache1 was /dev/bcache0 before reboot). Therefore we need the backing device UUID (sb.uuid) to identify each bcache device. Backing device uuid can be found by program bcache-super-show, but directly exporting backing_dev_uuid by sysfs file /sys/block/bcache/bcache/backing_dev_uuid is a much simpler method. With backing_dev_uuid, and partition uuids from /dev/disk/by-partuuid/, now we can identify each bcache device and its partitions conveniently. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b9166ee027fa..9be27b26d300 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -68,6 +68,7 @@ read_attribute(btree_written); read_attribute(metadata_written); read_attribute(active_journal_entries); read_attribute(backing_dev_name); +read_attribute(backing_dev_uuid); sysfs_time_stats_attribute(btree_gc, sec, ms); sysfs_time_stats_attribute(btree_split, sec, us); @@ -250,6 +251,13 @@ SHOW(__bch_cached_dev) return strlen(buf); } + if (attr == &sysfs_backing_dev_uuid) { + /* convert binary uuid into 36-byte string plus '\0' */ + snprintf(buf, 36+1, "%pU", dc->sb.uuid); + strcat(buf, "\n"); + return strlen(buf); + } + #undef var return 0; } @@ -460,6 +468,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_bypass_torture_test, #endif &sysfs_backing_dev_name, + &sysfs_backing_dev_uuid, NULL }; KTYPE(bch_cached_dev); -- cgit v1.2.3-59-g8ed1b From e8cf978dffb2c603340d4615eec2e5358c9df06d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 9 Feb 2019 12:52:57 +0800 Subject: bcache: fix indentation issue, remove tabs on a hunk of code There is a hunk of code that is indented one level too deep, fix this by removing the extra tabs. 
Signed-off-by: Colin Ian King Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 4dee119c3664..a697a3a923cd 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1615,21 +1615,21 @@ static void conditional_stop_bcache_device(struct cache_set *c, */ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.", d->disk->disk_name); - /* - * There might be a small time gap that cache set is - * released but bcache device is not. Inside this time - * gap, regular I/O requests will directly go into - * backing device as no cache set attached to. This - * behavior may also introduce potential inconsistence - * data in writeback mode while cache is dirty. - * Therefore before calling bcache_device_stop() due - * to a broken cache device, dc->io_disable should be - * explicitly set to true. - */ - dc->io_disable = true; - /* make others know io_disable is true earlier */ - smp_mb(); - bcache_device_stop(d); + /* + * There might be a small time gap that cache set is + * released but bcache device is not. Inside this time + * gap, regular I/O requests will directly go into + * backing device as no cache set attached to. This + * behavior may also introduce potential inconsistence + * data in writeback mode while cache is dirty. + * Therefore before calling bcache_device_stop() due + * to a broken cache device, dc->io_disable should be + * explicitly set to true. + */ + dc->io_disable = true; + /* make others know io_disable is true earlier */ + smp_mb(); + bcache_device_stop(d); } else { /* * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO -- cgit v1.2.3-59-g8ed1b From 58ac323084ebf44f8470eeb8b82660f9d0ee3689 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Sat, 9 Feb 2019 12:52:58 +0800 Subject: bcache: treat stale && dirty keys as bad keys Stale && dirty keys can be produced in the following way: After writeback in write_dirty_finish(), dirty key k1 will be replaced by clean key k2 ==>ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); ==>btree_insert_fn(struct btree_op *b_op, struct btree *b) ==>static int bch_btree_insert_node(struct btree *b, struct btree_op *op, struct keylist *insert_keys, atomic_t *journal_ref, Then two steps follow: A) update k1 to k2 in btree node memory: bch_btree_insert_keys(b, op, insert_keys, replace_key) B) write the bset (containing k2) to the cache disk via a 30s delayed work: bch_btree_leaf_dirty(b, journal_ref). But before the 30s delayed work writes the bset to the cache device, these things may happen: A) GC runs and reclaims the bucket k2 points to; B) the allocator runs, invalidates the bucket k2 points to, increases the gen of the bucket, and places it into the free_inc fifo; C) up to now the 30s delayed work has still not finished, so on disk the key is still k1; it is dirty and stale (its gen is smaller than the gen of the bucket), and then a sudden machine power-off happens; D) when the machine powers on again, after the btree reconstruction, the stale dirty key appears. In bch_extent_bad(), when expensive_debug_checks is off, it would treat the dirty key as good even though it is stale, which causes the problems below: A) in read_dirty() it would cause a machine crash: BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); B) it could be worse: when a read hits a stale dirty key, it would return old, incorrect data.
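For background, a minimal sketch of how staleness is detected (simplified; struct and function names are ours): each bucket carries a generation, each key pointer records the generation it was created against, and a wraparound-safe signed comparison decides whether the bucket has since moved on.

#include <stdint.h>

struct bucket   { uint8_t gen; };
struct bkey_ptr { uint8_t gen; int dirty; };

/* gen_after()-style comparison: safe across uint8_t wraparound */
static int ptr_is_stale(const struct bucket *b, const struct bkey_ptr *p)
{
	return (int8_t)(b->gen - p->gen) > 0;
}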
This patch tolerates the existence of these stale && dirty keys, and treats them as bad keys in bch_extent_bad(). (Coly Li: fix indent which was modified by sender's email client) Signed-off-by: Tang Junhui Cc: stable@vger.kernel.org Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/extents.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 956004366699..886710043025 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -538,6 +538,7 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); unsigned int i, stale; + char buf[80]; if (!KEY_PTRS(k) || bch_extent_invalid(bk, k)) @@ -547,19 +548,19 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) if (!ptr_available(b->c, k, i)) return true; - if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) - return false; - for (i = 0; i < KEY_PTRS(k); i++) { stale = ptr_stale(b->c, k, i); + if (stale && KEY_DIRTY(k)) { + bch_extent_to_text(buf, sizeof(buf), k); + pr_info("stale dirty pointer, stale %u, key: %s", + stale, buf); + } + btree_bug_on(stale > BUCKET_GC_GEN_MAX, b, "key too stale: %i, need_gc %u", stale, b->c->need_gc); - btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), - b, "stale dirty pointer"); - if (stale) return true; -- cgit v1.2.3-59-g8ed1b From 596b5a5dd1bc2fa019fdaaae522ef331deef927f Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:52:59 +0800 Subject: bcache: improve sysfs_strtoul_clamp() Currently sysfs_strtoul_clamp() is defined as, 82 #define sysfs_strtoul_clamp(file, var, min, max) \ 83 do { \ 84 if (attr == &sysfs_ ## file) \ 85 return strtoul_safe_clamp(buf, var, min, max) \ 86 ?: (ssize_t) size; \ 87 } while (0) The problem is, if the bit width of var is less than that of unsigned long, min and max may not protect var from integer overflow, because the overflow happens in strtoul_safe_clamp() before min and max are checked. To fix such overflow in sysfs_strtoul_clamp(), and to make min and max take effect, this patch adds an unsigned long variable and passes it to the macro strtoul_safe_clamp(), which converts the input into an unsigned long value clamped to the range [min, max]. This value is then assigned to var. By this method, if the bit width of var is less than that of unsigned long, the integer overflow won't happen before min and max are checked. Now sysfs_strtoul_clamp() can properly handle smaller data types like unsigned int; of course min and max should then be defined within the range of unsigned int too.
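For illustration, a minimal userspace program (not part of the patch; names and values are ours) that shows the truncation this commit guards against on an LP64 system, where unsigned long is wider than unsigned int:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *input = "4294967296";	/* UINT_MAX + 1 */

	/* broken pattern: parse straight into the narrow type */
	unsigned int truncated = (unsigned int)strtoul(input, NULL, 10);

	/* fixed pattern: parse into unsigned long, clamp, then assign */
	unsigned long v = strtoul(input, NULL, 10);
	unsigned int clamped = v > UINT_MAX ? UINT_MAX : (unsigned int)v;

	printf("truncated: %u, clamped: %u\n", truncated, clamped);
	/* prints "truncated: 0, clamped: 4294967295" on LP64 */
	return 0;
}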
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index 3fe82425859c..0ad2715a884e 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -81,9 +81,16 @@ do { \ #define sysfs_strtoul_clamp(file, var, min, max) \ do { \ - if (attr == &sysfs_ ## file) \ - return strtoul_safe_clamp(buf, var, min, max) \ - ?: (ssize_t) size; \ + if (attr == &sysfs_ ## file) { \ + unsigned long v = 0; \ + ssize_t ret; \ + ret = strtoul_safe_clamp(buf, v, min, max); \ + if (!ret) { \ + var = v; \ + return size; \ + } \ + return ret; \ + } \ } while (0) #define strtoul_or_return(cp) \ -- cgit v1.2.3-59-g8ed1b From f54478c6e226bb1540a3e58366601039dfd778e2 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:00 +0800 Subject: bcache: fix input integer overflow of congested threshold Cache set congested threshold values congested_read_threshold_us and congested_write_threshold_us can be set via the sysfs interface. These two values are of 'unsigned int' type, but the sysfs interface uses strtoul to convert the input string. So if people input a large number like 9999999999, the value actually set is 1410065407, which is not the expected behavior. This patch replaces sysfs_strtoul() by sysfs_strtoul_clamp() when converting the input string to an unsigned int value, and sets the value range to [0, UINT_MAX], to avoid the above integer overflow errors. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 9be27b26d300..bedd3e68fd29 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -778,10 +778,12 @@ STORE(__bch_cache_set) c->shrink.scan_objects(&c->shrink, &sc); } - sysfs_strtoul(congested_read_threshold_us, - c->congested_read_threshold_us); - sysfs_strtoul(congested_write_threshold_us, - c->congested_write_threshold_us); + sysfs_strtoul_clamp(congested_read_threshold_us, + c->congested_read_threshold_us, + 0, UINT_MAX); + sysfs_strtoul_clamp(congested_write_threshold_us, + c->congested_write_threshold_us, + 0, UINT_MAX); if (attr == &sysfs_errors) { v = __sysfs_match_string(error_actions, -1, buf); -- cgit v1.2.3-59-g8ed1b From 8c27a3953e92eb0b22dbb03d599f543a05f9574e Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:01 +0800 Subject: bcache: fix input overflow to sequential_cutoff People may set sequential_cutoff of a cached device via a sysfs file, but the current code does not check the input value for overflow. E.g. if the value 4294967295 (UINT_MAX) is written to the file sequential_cutoff, its value is 4GB, but if 4294967296 (UINT_MAX + 1) is written into it, its value will be 0. This is an unexpected behavior. This patch replaces d_strtoi_h() by sysfs_strtoul_clamp() to convert the input string to an unsigned integer value, and limits its range to [0, UINT_MAX]. Then the input overflow is fixed. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index bedd3e68fd29..96b64893f2cb 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -314,7 +314,9 @@ STORE(__cached_dev) dc->io_disable = v ?
1 : 0; } - d_strtoi_h(sequential_cutoff); + sysfs_strtoul_clamp(sequential_cutoff, + dc->sequential_cutoff, + 0, UINT_MAX); d_strtoi_h(readahead); if (attr == &sysfs_clear_stats) -- cgit v1.2.3-59-g8ed1b From e4db37fb69d56d9523bb540bd8e07bf221aa9f6d Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:02 +0800 Subject: bcache: add sysfs_strtoul_bool() for setting bit-field variables When setting bool values via the sysfs interface, e.g. writeback_metadata, writing 1 into the writeback_metadata file sets dc->writeback_metadata to 1, but writing 2 into the file sets dc->writeback_metadata to 0. This is misleading; a better result would be 1 for any non-zero input value. This happens because dc->writeback_metadata is a bit-field variable, and the current code simply uses d_strtoul() to convert the string into an integer and takes the lowest bit of the value. To fix this error, we need a routine that converts the input string into an unsigned integer and sets the target variable to 1 if the converted integer is non-zero. This patch introduces a new macro called sysfs_strtoul_bool(), which converts the input string into a bool value; we can use it to set bool values for bit-field variables. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index 0ad2715a884e..215df32f567b 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -79,6 +79,16 @@ do { \ return strtoul_safe(buf, var) ?: (ssize_t) size; \ } while (0) +#define sysfs_strtoul_bool(file, var) \ +do { \ + if (attr == &sysfs_ ## file) { \ + unsigned long v = strtoul_or_return(buf); \ + \ + var = v ? 1 : 0; \ + return size; \ + } \ +} while (0) + #define sysfs_strtoul_clamp(file, var, min, max) \ do { \ if (attr == &sysfs_ ## file) { \ -- cgit v1.2.3-59-g8ed1b From f5c0b95d2eeb17cf8a81fde0461938d2a79303ab Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:03 +0800 Subject: bcache: use sysfs_strtoul_bool() to set bit-field variables When setting bcache parameters via sysfs, some variables are defined as bit-field values. The current bcache code in sysfs.c uses either d_strtoul() or sysfs_strtoul() to convert the input string to an unsigned integer value and assign it to the corresponding bit-field variable. The problem is, the bit-field only takes the lowest bit of the converted value. If the input is 2, the expected value of the bit-field (as a bool value) should be 1, but indeed it is 0. The following sysfs files for bit-field variables have this problem: bypass_torture_test, for dc->bypass_torture_test writeback_metadata, for dc->writeback_metadata writeback_running, for dc->writeback_running verify, for c->verify key_merging_disabled, for c->key_merging_disabled gc_always_rewrite, for c->gc_always_rewrite btree_shrinker_disabled, for c->shrinker_disabled copy_gc_enabled, for c->copy_gc_enabled This patch uses sysfs_strtoul_bool() to set such bit-field variables, so that if the converted value is non-zero, the bit-field variable is set to 1, just like setting a bool value such as expensive_debug_checks.
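A userspace demonstration of why plain assignment misbehaves for a one-bit bit-field (struct and field names are ours): assigning 2 keeps only the lowest bit, which is 0, while the sysfs_strtoul_bool() approach normalizes any non-zero input to 1.

#include <stdio.h>

struct flags { unsigned int writeback_metadata:1; };

int main(void)
{
	struct flags f = { 0 };
	unsigned long v = 2;	/* e.g. the user wrote "2" into the file */

	f.writeback_metadata = v & 1;		/* old behavior: stores 0 */
	printf("plain assignment: %u\n", f.writeback_metadata);

	f.writeback_metadata = v ? 1 : 0;	/* fixed behavior: stores 1 */
	printf("normalized:       %u\n", f.writeback_metadata);
	return 0;
}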
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 96b64893f2cb..57395e23747a 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -277,9 +277,9 @@ STORE(__cached_dev) sysfs_strtoul(data_csum, dc->disk.data_csum); d_strtoul(verify); - d_strtoul(bypass_torture_test); - d_strtoul(writeback_metadata); - d_strtoul(writeback_running); + sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test); + sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata); + sysfs_strtoul_bool(writeback_running, dc->writeback_running); d_strtoul(writeback_delay); sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, @@ -816,12 +816,12 @@ STORE(__bch_cache_set) } sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); - sysfs_strtoul(verify, c->verify); - sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); + sysfs_strtoul_bool(verify, c->verify); + sysfs_strtoul_bool(key_merging_disabled, c->key_merging_disabled); sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); - sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); - sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); - sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); + sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite); + sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled); + sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled); /* * write gc_after_writeback here may overwrite an already set * BCH_DO_AUTO_GC, it doesn't matter because this flag will be -- cgit v1.2.3-59-g8ed1b From 369d21a73a241682de019ac5c5209ce3ec627743 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:04 +0800 Subject: bcache: fix input overflow to writeback_delay The sysfs file writeback_delay is used to configure dc->writeback_delay, which is of type unsigned int. But the bcache code uses sysfs_strtoul() to convert the input string, so it might overflow if the input value is too large. E.g. if the input value is 4294967296, 0 is actually set to dc->writeback_delay. This patch uses sysfs_strtoul_clamp() to convert the input string and sets the result value range to [0, UINT_MAX], to avoid such unsigned integer overflow. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 57395e23747a..e4519326594f 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -280,7 +280,7 @@ STORE(__cached_dev) sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test); sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata); sysfs_strtoul_bool(writeback_running, dc->writeback_running); - d_strtoul(writeback_delay); + sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX); sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, bch_cutoff_writeback); -- cgit v1.2.3-59-g8ed1b From c3b75a2199cdbfc1c335155fe143d842604b1baa Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:05 +0800 Subject: bcache: fix potential div-zero error of writeback_rate_i_term_inverse dc->writeback_rate_i_term_inverse can be set via the sysfs interface. It is of type unsigned int, and is converted from the input string by d_strtoul().
The problem is that d_strtoul() does not check the valid range of the input: if 4294967296 is written into the sysfs file writeback_rate_i_term_inverse, an unsigned integer overflow happens and dc->writeback_rate_i_term_inverse is set to 0. In writeback.c:__update_writeback_rate(), there are the following lines of code, integral_scaled = div_s64(dc->writeback_rate_integral, dc->writeback_rate_i_term_inverse); If dc->writeback_rate_i_term_inverse is set to 0 via the sysfs interface, a div-zero error might be triggered in the above code. Therefore we need to add a range limitation in the sysfs interface, which is what this patch does: use sysfs_strtoul_clamp() to replace d_strtoul() and restrict the input range to [1, UINT_MAX]. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index e4519326594f..0fad46d3a8bd 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -302,7 +302,9 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_rate_update_seconds, dc->writeback_rate_update_seconds, 1, WRITEBACK_RATE_UPDATE_SECS_MAX); - d_strtoul(writeback_rate_i_term_inverse); + sysfs_strtoul_clamp(writeback_rate_i_term_inverse, + dc->writeback_rate_i_term_inverse, + 1, UINT_MAX); d_strtoul_nonzero(writeback_rate_p_term_inverse); d_strtoul_nonzero(writeback_rate_minimum); -- cgit v1.2.3-59-g8ed1b From 5b5fd3c94eef69dcfaa8648198e54c92e5687d6d Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:06 +0800 Subject: bcache: fix potential div-zero error of writeback_rate_p_term_inverse Current code already uses d_strtoul_nonzero() to convert the input string to an unsigned integer, to make sure writeback_rate_p_term_inverse won't be zero. But overflow may happen when converting the input string to an unsigned integer value by d_strtoul_nonzero(), and then dc->writeback_rate_p_term_inverse can still be set to 0 even if the sysfs file input value is not zero, e.g. 4294967296 (a.k.a UINT_MAX+1). If dc->writeback_rate_p_term_inverse is set to 0, it might cause a div-zero error in the following code from __update_writeback_rate(), int64_t proportional_scaled = div_s64(error, dc->writeback_rate_p_term_inverse); This patch replaces d_strtoul_nonzero() with sysfs_strtoul_clamp() and limits the value range to [1, UINT_MAX]. Then the unsigned integer overflow and div-zero error can be avoided. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 0fad46d3a8bd..c6677c93e368 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -305,7 +305,9 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_rate_i_term_inverse, dc->writeback_rate_i_term_inverse, 1, UINT_MAX); - d_strtoul_nonzero(writeback_rate_p_term_inverse); + sysfs_strtoul_clamp(writeback_rate_p_term_inverse, + dc->writeback_rate_p_term_inverse, + 1, UINT_MAX); d_strtoul_nonzero(writeback_rate_minimum); sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); -- cgit v1.2.3-59-g8ed1b From dab71b2db98dcdd4657d151b01a7be88ce10f9d1 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:07 +0800 Subject: bcache: fix input overflow to writeback_rate_minimum dc->writeback_rate_minimum is an unsigned integer variable; it is set via the sysfs interface, and converted from the input string to an unsigned integer by d_strtoul_nonzero().
When the converted input value is larger than UINT_MAX, an unsigned integer overflow happens. This patch fixes the overflow by using sysfs_strtoul_clamp() to convert the input string and limit the value to the range [1, UINT_MAX], so the overflow can be avoided. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c6677c93e368..d292eb757ac4 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -308,7 +308,9 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_rate_p_term_inverse, dc->writeback_rate_p_term_inverse, 1, UINT_MAX); - d_strtoul_nonzero(writeback_rate_minimum); + sysfs_strtoul_clamp(writeback_rate_minimum, + dc->writeback_rate_minimum, + 1, UINT_MAX); sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); -- cgit v1.2.3-59-g8ed1b From 453745fbbebecf7e459785db7e29e11563908525 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:08 +0800 Subject: bcache: fix input overflow to journal_delay_ms c->journal_delay_ms is of type unsigned short; it is set via the sysfs interface and converted by sysfs_strtoul() from the input string to an unsigned short value. Therefore an unsigned short overflow might happen when the converted value exceeds USHRT_MAX. E.g. writing 65536 into the sysfs file journal_delay_ms sets c->journal_delay_ms to 0. This patch uses sysfs_strtoul_clamp() to convert the input string and limit the value range to [0, USHRT_MAX], to avoid the input overflow. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d292eb757ac4..201e85bbe3eb 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -821,7 +821,9 @@ STORE(__bch_cache_set) } } - sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); + sysfs_strtoul_clamp(journal_delay_ms, + c->journal_delay_ms, + 0, USHRT_MAX); sysfs_strtoul_bool(verify, c->verify); sysfs_strtoul_bool(key_merging_disabled, c->key_merging_disabled); sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); -- cgit v1.2.3-59-g8ed1b From b15008403b59955c9fa0c8b55cadd6dae991a4e9 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:09 +0800 Subject: bcache: fix input overflow to cache set io_error_limit c->error_limit is of type unsigned int; it is set via the cache set sysfs file io_error_limit. Inside the bcache code, the input string is converted by strtoul_or_return() and the converted value is assigned to c->error_limit. Because the converted value is unsigned long and c->error_limit is unsigned int, if the input is large enough, c->error_limit will overflow. This patch uses sysfs_strtoul_clamp() to convert the input string, and clamps the range to [0, UINT_MAX] to avoid the potential overflow.
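The overflow pattern shared by these fixes can be reproduced in a few lines of standalone C (a sketch assuming an LP64 system, where unsigned long is 64-bit and unsigned int is 32-bit):

#include <stdio.h>

int main(void)
{
	unsigned long v = 4294967296UL;	/* UINT_MAX + 1, as parsed from sysfs input */
	unsigned int error_limit = v;	/* silently truncates to 0 */

	printf("%u\n", error_limit);	/* prints 0, not the requested limit */
	return 0;
}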
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 201e85bbe3eb..467105614324 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -801,8 +801,7 @@ STORE(__bch_cache_set) c->on_error = v; } - if (attr == &sysfs_io_error_limit) - c->error_limit = strtoul_or_return(buf); + sysfs_strtoul_clamp(io_error_limit, c->error_limit, 0, UINT_MAX); /* See count_io_errors() for why 88 */ if (attr == &sysfs_io_error_halflife) -- cgit v1.2.3-59-g8ed1b From a91fbda49f746119828f7e8ad0f0aa2ab0578f65 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:10 +0800 Subject: bcache: fix input overflow to cache set sysfs file io_error_halflife Cache set sysfs entry io_error_halflife is used to set c->error_decay. c->error_decay is of type unsigned int, and it is converted by strtoul_or_return(); therefore an overflow of c->error_decay is possible for a large input value. This patch fixes the overflow by using strtoul_safe_clamp() to convert the input string to an unsigned long value in the range [0, UINT_MAX], then divides it by 88 and sets the result in c->error_decay. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 467105614324..17bae9c14ca0 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -804,8 +804,17 @@ STORE(__bch_cache_set) sysfs_strtoul_clamp(io_error_limit, c->error_limit, 0, UINT_MAX); /* See count_io_errors() for why 88 */ - if (attr == &sysfs_io_error_halflife) - c->error_decay = strtoul_or_return(buf) / 88; + if (attr == &sysfs_io_error_halflife) { + unsigned long v = 0; + ssize_t ret; + + ret = strtoul_safe_clamp(buf, v, 0, UINT_MAX); + if (!ret) { + c->error_decay = v / 88; + return size; + } + return ret; + } if (attr == &sysfs_io_disable) { v = strtoul_or_return(buf); -- cgit v1.2.3-59-g8ed1b From dc7292a5bcb4c878b076fca2ac3fc22f81b8f8df Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 9 Feb 2019 12:53:11 +0800 Subject: bcache: use (REQ_META|REQ_PRIO) to indicate bio for metadata In commit 752f66a75aba ("bcache: use REQ_PRIO to indicate bio for metadata"), REQ_META was replaced by REQ_PRIO to indicate a metadata bio. This assumption is not always correct; e.g. XFS uses REQ_META, not REQ_PRIO, to mark metadata bios. This is why Nix noticed that bcache does not cache metadata for XFS after the above commit. Thanks to Dave Chinner, who explains the difference between REQ_META and REQ_PRIO from the view of a file system developer. Here I quote part of his explanation from the mailing list: REQ_META is used for metadata. REQ_PRIO is used to communicate to the lower layers that the submitter considers this IO to be more important than non-REQ_PRIO IO and so dispatch should be expedited. IOWs, if the filesystem considers metadata IO to be more important than user data IO, then it will use REQ_PRIO | REQ_META rather than just REQ_META. Then it seems bios with REQ_META or REQ_PRIO should both be cached for performance optimization, because both probably demand low I/O latency from the upper layer (e.g. the file system). So in this patch, when we want to decide whether to bypass the cache, REQ_META and REQ_PRIO are both checked. Then both metadata and high priority I/O requests will be handled properly.
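To make the flag semantics concrete, here is a standalone model of the fixed check (a sketch only; the flag values below are stand-ins, not the kernel's actual bit positions):

#include <stdbool.h>
#include <stdio.h>

#define REQ_META (1U << 0)	/* stand-in: the IO is metadata */
#define REQ_PRIO (1U << 1)	/* stand-in: dispatch should be expedited */

/* After this patch, either flag marks the bio as cache-worthy metadata */
static bool want_cached(unsigned int bi_opf)
{
	return bi_opf & (REQ_META | REQ_PRIO);
}

int main(void)
{
	printf("%d\n", want_cached(REQ_META));			/* 1: XFS-style tagging */
	printf("%d\n", want_cached(REQ_PRIO));			/* 1: gfs2-style tagging */
	printf("%d\n", want_cached(REQ_META | REQ_PRIO));	/* 1: both set */
	printf("%d\n", want_cached(0));				/* 0: plain user data */
	return 0;
}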
Reported-by: Nix Signed-off-by: Coly Li Reviewed-by: Andre Noll Tested-by: Nix Cc: stable@vger.kernel.org Cc: Dave Chinner Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 15070412a32e..f101bfe8657a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -392,10 +392,11 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) /* * Flag for bypass if the IO is for read-ahead or background, - * unless the read-ahead request is for metadata (eg, for gfs2). + * unless the read-ahead request is for metadata + * (eg, for gfs2 or xfs). */ if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && - !(bio->bi_opf & REQ_PRIO)) + !(bio->bi_opf & (REQ_META|REQ_PRIO))) goto skip; if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || @@ -877,7 +878,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, } if (!(bio->bi_opf & REQ_RAHEAD) && - !(bio->bi_opf & REQ_PRIO) && + !(bio->bi_opf & (REQ_META|REQ_PRIO)) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, get_capacity(bio->bi_disk) - bio_end_sector(bio)); -- cgit v1.2.3-59-g8ed1b From d11a3998985b351aaab6bbdc23bc884bd5e815c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 9 Feb 2019 15:40:24 -0700 Subject: block: kill QUEUE_FLAG_FLUSH_NQ We have various helpers for setting/clearing this flag, and also a helper to check if the queue supports queueable flushes or not. But nobody uses them anymore, kill it with fire. Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-settings.c | 9 --------- drivers/ata/libata-scsi.c | 2 -- drivers/block/null_blk_main.c | 1 - include/linux/blkdev.h | 7 ------- 5 files changed, 20 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f8120832ca7b..c782e81db627 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -132,7 +132,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(POLL), QUEUE_FLAG_NAME(WC), QUEUE_FLAG_NAME(FUA), - QUEUE_FLAG_NAME(FLUSH_NQ), QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(POLL_STATS), diff --git a/block/blk-settings.c b/block/blk-settings.c index 3e7038e475ee..6375afaedcec 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -799,15 +799,6 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) } EXPORT_SYMBOL(blk_queue_update_dma_alignment); -void blk_queue_flush_queueable(struct request_queue *q, bool queueable) -{ - if (queueable) - blk_queue_flag_clear(QUEUE_FLAG_FLUSH_NQ, q); - else - blk_queue_flag_set(QUEUE_FLAG_FLUSH_NQ, q); -} -EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); - /** * blk_set_queue_depth - tell the block layer about the device queue depth * @q: the request queue for the device diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 3d4887d0e84a..dfe66d00dd5b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1318,8 +1318,6 @@ static int ata_scsi_dev_config(struct scsi_device *sdev, scsi_change_queue_depth(sdev, depth); } - blk_queue_flush_queueable(q, false); - if (dev->flags & ATA_DFLAG_TRUSTED) sdev->security_supported = 1; diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 62c9654b9ce8..83c38a6217d7 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1678,7 +1678,6 @@ static int 
null_add_dev(struct nullb_device *dev) if (dev->cache_size > 0) { set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); blk_queue_write_cache(nullb->q, true, true); - blk_queue_flush_queueable(nullb->q, true); } if (dev->zoned) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 338604dff7d0..24ccab51085f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -592,7 +592,6 @@ struct request_queue { #define QUEUE_FLAG_POLL 19 /* IO polling enabled if set */ #define QUEUE_FLAG_WC 20 /* Write back caching */ #define QUEUE_FLAG_FUA 21 /* device supports FUA writes */ -#define QUEUE_FLAG_FLUSH_NQ 22 /* flush not queueuable */ #define QUEUE_FLAG_DAX 23 /* device supports DAX */ #define QUEUE_FLAG_STATS 24 /* track IO start and completion times */ #define QUEUE_FLAG_POLL_STATS 25 /* collecting stats for hybrid polling */ @@ -1069,7 +1068,6 @@ extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); -extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); /* @@ -1446,11 +1444,6 @@ static inline unsigned int block_size(struct block_device *bdev) return bdev->bd_block_size; } -static inline bool queue_flush_queueable(struct request_queue *q) -{ - return !test_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); -} - typedef struct {struct page *v;} Sector; unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *); -- cgit v1.2.3-59-g8ed1b From eca7abf31abba2acac445ec6a1d3f94cf0cab918 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 9 Feb 2019 15:42:07 -0700 Subject: block: queue flag cleanup We have QUEUE_FLAG_DEFAULT defined, but it's not used anymore since the legacy IO stack is gone. Kill it. Sanitize the queue flags in general: the numbering has gaps (for some reason), and the space is pretty sparse. With the flags renumbered, we can more clearly see how many we have available.
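Since the flags are bit numbers rather than masks, composite defines shift them into a mask; a small standalone check (a sketch using the new numbering from this patch) shows how the composition works:

#include <stdio.h>

#define QUEUE_FLAG_SAME_COMP	4	/* new numbering from this patch */
#define QUEUE_FLAG_IO_STAT	7

#define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) | \
				 (1 << QUEUE_FLAG_SAME_COMP))

int main(void)
{
	printf("0x%x\n", QUEUE_FLAG_MQ_DEFAULT);	/* 0x90: bits 7 and 4 set */
	return 0;
}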
Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 58 +++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 24ccab51085f..3603270cb82d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -572,37 +572,33 @@ struct request_queue { u64 write_hints[BLK_MAX_WRITE_HINTS]; }; -#define QUEUE_FLAG_STOPPED 1 /* queue is stopped */ -#define QUEUE_FLAG_DYING 2 /* queue being torn down */ -#define QUEUE_FLAG_BIDI 4 /* queue supports bidi requests */ -#define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ -#define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ -#define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ -#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ -#define QUEUE_FLAG_IO_STAT 10 /* do disk/partitions IO accounting */ -#define QUEUE_FLAG_DISCARD 11 /* supports DISCARD */ -#define QUEUE_FLAG_NOXMERGES 12 /* No extended merges */ -#define QUEUE_FLAG_ADD_RANDOM 13 /* Contributes to random pool */ -#define QUEUE_FLAG_SECERASE 14 /* supports secure erase */ -#define QUEUE_FLAG_SAME_FORCE 15 /* force complete on same CPU */ -#define QUEUE_FLAG_DEAD 16 /* queue tear-down finished */ -#define QUEUE_FLAG_INIT_DONE 17 /* queue is initialized */ -#define QUEUE_FLAG_NO_SG_MERGE 18 /* don't attempt to merge SG segments*/ -#define QUEUE_FLAG_POLL 19 /* IO polling enabled if set */ -#define QUEUE_FLAG_WC 20 /* Write back caching */ -#define QUEUE_FLAG_FUA 21 /* device supports FUA writes */ -#define QUEUE_FLAG_DAX 23 /* device supports DAX */ -#define QUEUE_FLAG_STATS 24 /* track IO start and completion times */ -#define QUEUE_FLAG_POLL_STATS 25 /* collecting stats for hybrid polling */ -#define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ -#define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ -#define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ -#define QUEUE_FLAG_PCI_P2PDMA 29 /* device supports PCI p2p requests */ - -#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_SAME_COMP) | \ - (1 << QUEUE_FLAG_ADD_RANDOM)) +#define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ +#define QUEUE_FLAG_DYING 1 /* queue being torn down */ +#define QUEUE_FLAG_BIDI 2 /* queue supports bidi requests */ +#define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ +#define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ +#define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ +#define QUEUE_FLAG_NONROT 6 /* non-rotational device (SSD) */ +#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ +#define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ +#define QUEUE_FLAG_DISCARD 8 /* supports DISCARD */ +#define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ +#define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ +#define QUEUE_FLAG_SECERASE 11 /* supports secure erase */ +#define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ +#define QUEUE_FLAG_DEAD 13 /* queue tear-down finished */ +#define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ +#define QUEUE_FLAG_NO_SG_MERGE 15 /* don't attempt to merge SG segments*/ +#define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ +#define QUEUE_FLAG_WC 17 /* Write back caching */ +#define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ +#define QUEUE_FLAG_DAX 19 /* device supports DAX */ +#define QUEUE_FLAG_STATS 20 /* track IO start and 
completion times */ +#define QUEUE_FLAG_POLL_STATS 21 /* collecting stats for hybrid polling */ +#define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ +#define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */ +#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ +#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP)) -- cgit v1.2.3-59-g8ed1b From 7585d5082e17a6988a784010c1fbbc4055edcdb7 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 25 Jan 2019 00:01:42 -0200 Subject: blk-cgroup: Fix doc related to blkcg_exit_queue Since 4cf6324b17e9, a portion of function blk_cleanup_queue was moved to a newly created function called blk_exit_queue, including the call of blkcg_exit_queue. So, adjust the documentation accordingly. Reviewed-by: Bart Van Assche Signed-off-by: Marcos Paulo de Souza Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2bed5725aa03..77f37ef8ef06 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1269,7 +1269,7 @@ void blkcg_drain_queue(struct request_queue *q) * blkcg_exit_queue - exit and release blkcg part of request_queue * @q: request_queue being released * - * Called from blk_release_queue(). Responsible for exiting blkcg part. + * Called from blk_exit_queue(). Responsible for exiting blkcg part. */ void blkcg_exit_queue(struct request_queue *q) { -- cgit v1.2.3-59-g8ed1b From 1e9364283764ac93b012739890a30d73e76396db Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Sun, 10 Feb 2019 15:22:51 -0200 Subject: blk-sysfs: Rework documentation of __blk_release_queue The Notes section of the comment was removed, because now blk_release_queue can only be executed from blk_cleanup_queue (being called when the q->kobj reference count reaches zero), and also blk_init_queue was removed in a1ce35fa4985. Signed-off-by: Marcos Paulo de Souza Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 590d1ef2f961..94e1b052abbc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -817,21 +817,16 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) } /** - * __blk_release_queue - release a request queue when it is no longer needed + * __blk_release_queue - release a request queue * @work: pointer to the release_work member of the request queue to be released * * Description: - * blk_release_queue is the counterpart of blk_init_queue(). It should be - * called when a request queue is being released; typically when a block - * device is being de-registered. Its primary task it to free the queue - * itself. - * - * Notes: - * The low level driver must have finished any outstanding requests first - * via blk_cleanup_queue(). - * - * Although blk_release_queue() may be called with preemption disabled, - * __blk_release_queue() may sleep. + * This function is called when a block device is being unregistered. The + * process of releasing a request queue starts with blk_cleanup_queue, which + * set the appropriate flags and then calls blk_put_queue, that decrements + * the reference counter of the request queue. Once the reference counter + * of the request queue reaches zero, blk_release_queue is called to release + * all allocated resources of the request queue.
*/ static void __blk_release_queue(struct work_struct *work) { -- cgit v1.2.3-59-g8ed1b From f9324980d7300f961e9895ad94d5ea71c0fe187e Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 11 Feb 2019 13:25:02 +0100 Subject: lightnvm: pblk: stop taking the free lock in pblk_lines_free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk_line_meta_free might sleep (it can end up calling vfree, depending on how we allocate lba lists), and this can lead to a BUG() if we wake up on a different cpu and release the lock. As there is no point in grabbing the free lock when pblk has shut down, remove the lock. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index f9a3e47b6a93..eb0135c77805 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -584,14 +584,12 @@ static void pblk_lines_free(struct pblk *pblk) struct pblk_line *line; int i; - spin_lock(&l_mg->free_lock); for (i = 0; i < l_mg->nr_lines; i++) { line = &pblk->lines[i]; pblk_line_free(line); pblk_line_meta_free(l_mg, line); } - spin_unlock(&l_mg->free_lock); pblk_line_mg_free(pblk); -- cgit v1.2.3-59-g8ed1b From 6916cf5426d08ea8be50ab4ba7ff86ea022cdff3 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 11 Feb 2019 13:25:03 +0100 Subject: lightnvm: pblk: use vfree to free metadata on error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As chunk metadata is allocated using vmalloc, we need to free it using vfree. Fixes: 090ee26fd512 ("lightnvm: use internal allocation for chunk log page") Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 1ff165351180..1b5ff51faa63 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -141,7 +141,7 @@ struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk) ret = nvm_get_chunk_meta(dev, ppa, geo->all_chunks, meta); if (ret) { - kfree(meta); + vfree(meta); return ERR_PTR(-EIO); } -- cgit v1.2.3-59-g8ed1b From e74ecf63ef9c3ff92bda531ed41a43ad4e75682e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 11 Feb 2019 13:25:04 +0100 Subject: lightnvm: Use u64 instead of __le64 for CPU visible side MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sparse complains about using strict data types: drivers/lightnvm/pblk-read.c:254:43: warning: incorrect type in assignment (different base types) drivers/lightnvm/pblk-read.c:254:43: expected restricted __le64 drivers/lightnvm/pblk-read.c:254:43: got unsigned long long [unsigned] [usertype] drivers/lightnvm/pblk-read.c:255:29: warning: cast from restricted __le64 drivers/lightnvm/pblk-read.c:268:29: warning: cast from restricted __le64 drivers/lightnvm/pblk-read.c:328:41: warning: incorrect type in assignment (different base types) drivers/lightnvm/pblk-read.c:328:41: expected restricted __le64 drivers/lightnvm/pblk-read.c:328:41: got unsigned long long [unsigned] [usertype] In the code it seems explicit that the lba_list_mem and lba_list_media members of struct pblk_pr_ctx are used on the CPU side, which means they should not be of strict types.
Change types of lba_list_mem and lba_list_media members to be u64. Signed-off-by: Andy Shevchenko Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 85e38ed62f85..0dd697ea201e 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -131,8 +131,8 @@ struct pblk_pr_ctx { unsigned int bio_init_idx; void *ppa_ptr; dma_addr_t dma_ppa_list; - __le64 lba_list_mem[NVM_MAX_VLBA]; - __le64 lba_list_media[NVM_MAX_VLBA]; + u64 lba_list_mem[NVM_MAX_VLBA]; + u64 lba_list_media[NVM_MAX_VLBA]; }; /* Pad context */ -- cgit v1.2.3-59-g8ed1b From 7e0a0847ed7ea02f03dd1442136a0cd684d91218 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 11 Feb 2019 13:25:05 +0100 Subject: lightnvm: pblk: Switch to use new generic UUID API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are new types and helpers that are supposed to be used in new code. As a preparation to get rid of legacy types and API functions do the conversion here. Signed-off-by: Andy Shevchenko Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 5 +++-- drivers/lightnvm/pblk-init.c | 2 +- drivers/lightnvm/pblk-recovery.c | 8 +++++--- drivers/lightnvm/pblk.h | 10 +--------- 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 1b5ff51faa63..2a9e9facf44f 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1065,7 +1065,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len); smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); - memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16); + guid_copy((guid_t *)&smeta_buf->header.uuid, &pblk->instance_uuid); smeta_buf->header.id = cpu_to_le32(line->id); smeta_buf->header.type = cpu_to_le16(line->type); smeta_buf->header.version_major = SMETA_VERSION_MAJOR; @@ -1874,7 +1874,8 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) { emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); - memcpy(emeta_buf->header.uuid, pblk->instance_uuid, 16); + guid_copy((guid_t *)&emeta_buf->header.uuid, + &pblk->instance_uuid); emeta_buf->header.id = cpu_to_le32(line->id); emeta_buf->header.type = cpu_to_le16(line->type); emeta_buf->header.version_major = EMETA_VERSION_MAJOR; diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index eb0135c77805..8b643d0bffae 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -130,7 +130,7 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) struct pblk_line *line = NULL; if (factory_init) { - pblk_setup_uuid(pblk); + guid_gen(&pblk->instance_uuid); } else { line = pblk_recov_l2p(pblk); if (IS_ERR(line)) { diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 5ee20da7bdb3..6761d2afa4d0 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -703,11 +703,13 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) /* The first valid instance uuid is used for initialization */ if (!valid_uuid) { - memcpy(pblk->instance_uuid, smeta_buf->header.uuid, 16); + guid_copy(&pblk->instance_uuid, 
+ (guid_t *)&smeta_buf->header.uuid); valid_uuid = 1; } - if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) { + if (!guid_equal(&pblk->instance_uuid, + (guid_t *)&smeta_buf->header.uuid)) { pblk_debug(pblk, "ignore line %u due to uuid mismatch\n", i); continue; @@ -737,7 +739,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) } if (!found_lines) { - pblk_setup_uuid(pblk); + guid_gen(&pblk->instance_uuid); spin_lock(&l_mg->free_lock); WARN_ON_ONCE(!test_and_clear_bit(meta_line, diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 0dd697ea201e..72ae8755764e 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -646,7 +646,7 @@ struct pblk { int sec_per_write; - unsigned char instance_uuid[16]; + guid_t instance_uuid; /* Persistent write amplification counters, 4kb sector I/Os */ atomic64_t user_wa; /* Sectors written by user */ @@ -1360,14 +1360,6 @@ static inline unsigned int pblk_get_secs(struct bio *bio) return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE; } -static inline void pblk_setup_uuid(struct pblk *pblk) -{ - uuid_le uuid; - - uuid_le_gen(&uuid); - memcpy(pblk->instance_uuid, uuid.b, 16); -} - static inline char *pblk_disk_name(struct pblk *pblk) { struct gendisk *disk = pblk->disk; -- cgit v1.2.3-59-g8ed1b From b7fce8f79d947abbe84e577d89ded3c3d5345c23 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 11 Feb 2019 13:25:06 +0100 Subject: lightnvm: pblk: fix TRACE_INCLUDE_PATH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As the comment block in include/trace/define_trace.h says, TRACE_INCLUDE_PATH should be a path relative to define_trace.h; ../../drivers/lightnvm is the correct relative path. ../../../drivers/lightnvm is working by coincidence because the top Makefile adds -I$(srctree)/arch/$(SRCARCH)/include as a header search path, but we should not rely on it. Signed-off-by: Masahiro Yamada Reviewed-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-trace.h b/drivers/lightnvm/pblk-trace.h index 679e5c458ca6..9534503b69d9 100644 --- a/drivers/lightnvm/pblk-trace.h +++ b/drivers/lightnvm/pblk-trace.h @@ -139,7 +139,7 @@ TRACE_EVENT(pblk_state, /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../../drivers/lightnvm +#define TRACE_INCLUDE_PATH ../../drivers/lightnvm #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE pblk-trace #include -- cgit v1.2.3-59-g8ed1b From aa8759d80a755cb32f1707b360a265f3695770ef Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 11 Feb 2019 13:25:07 +0100 Subject: lightnvm: pblk: extend line wp balance check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk stripes writes of minimal write size across all non-offline chunks in a line, which means that the maximum write pointer delta should not exceed the minimal write size. Extend the line write pointer balance check to cover this case, and ignore the offline chunk wps. This will give us a warning during recovery if something unexpected has happened to the chunk write pointers (i.e. power loss, a spurious chunk reset, ...).
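The extended check boils down to requiring every good chunk's write pointer to fall within a window of max_write_pgs below the first good chunk's wp; a standalone sketch with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long max_wp = 1000;	/* wp of the first good chunk */
	unsigned long long max_write_pgs = 8;	/* minimal write (stripe) unit */
	unsigned long long min_wp = max_wp > max_write_pgs ?
					max_wp - max_write_pgs : 0;
	unsigned long long wp = 990;		/* wp of another good chunk */

	if (wp > max_wp || wp < min_wp)
		printf("unbalanced line\n");	/* 990 < 992: warn in recovery */
	else
		printf("balanced\n");
	return 0;
}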
Reported-by: Zhoujie Wu Tested-by: Zhoujie Wu Reviewed-by: Javier González Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-recovery.c | 56 +++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 6761d2afa4d0..d86f580036d3 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -302,35 +302,55 @@ static int pblk_pad_distance(struct pblk *pblk, struct pblk_line *line) return (distance > line->left_msecs) ? line->left_msecs : distance; } -static int pblk_line_wp_is_unbalanced(struct pblk *pblk, - struct pblk_line *line) +/* Return a chunk belonging to a line by stripe(write order) index */ +static struct nvm_chk_meta *pblk_get_stripe_chunk(struct pblk *pblk, + struct pblk_line *line, + int index) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; struct pblk_lun *rlun; - struct nvm_chk_meta *chunk; struct ppa_addr ppa; - u64 line_wp; - int pos, i; + int pos; - rlun = &pblk->luns[0]; + rlun = &pblk->luns[index]; ppa = rlun->bppa; pos = pblk_ppa_to_pos(geo, ppa); - chunk = &line->chks[pos]; - line_wp = chunk->wp; + return &line->chks[pos]; +} - for (i = 1; i < lm->blk_per_line; i++) { - rlun = &pblk->luns[i]; - ppa = rlun->bppa; - pos = pblk_ppa_to_pos(geo, ppa); - chunk = &line->chks[pos]; +static int pblk_line_wps_are_unbalanced(struct pblk *pblk, + struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + int blk_in_line = lm->blk_per_line; + struct nvm_chk_meta *chunk; + u64 max_wp, min_wp; + int i; + + i = find_first_zero_bit(line->blk_bitmap, blk_in_line); - if (chunk->wp > line_wp) + /* If there is one or zero good chunks in the line, + * the write pointers can't be unbalanced. + */ + if (i >= (blk_in_line - 1)) + return 0; + + chunk = pblk_get_stripe_chunk(pblk, line, i); + max_wp = chunk->wp; + if (max_wp > pblk->max_write_pgs) + min_wp = max_wp - pblk->max_write_pgs; + else + min_wp = 0; + + i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1); + while (i < blk_in_line) { + chunk = pblk_get_stripe_chunk(pblk, line, i); + if (chunk->wp > max_wp || chunk->wp < min_wp) return 1; - else if (chunk->wp < line_wp) - line_wp = chunk->wp; + + i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1); } return 0; @@ -356,7 +376,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, int ret; u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec; - if (pblk_line_wp_is_unbalanced(pblk, line)) + if (pblk_line_wps_are_unbalanced(pblk, line)) pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id); ppa_list = p.ppa_list; -- cgit v1.2.3-59-g8ed1b From b4cdc4260edf7e03d8b6fabb88aec2cdda95917e Mon Sep 17 00:00:00 2001 From: Javier González Date: Mon, 11 Feb 2019 13:25:08 +0100 Subject: lightnvm: pblk: prevent stall due to wb threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to respect mw_cuinits, pblk's write buffer maintains a backpointer to protect data not yet persisted; when writing to the write buffer, this backpointer defines a threshold that pblk's rate-limiter enforces. 
On small PU configurations, the following scenarios might take place: (i) the threshold is larger than the write buffer, and (ii) the threshold is smaller than the write buffer but larger than the maximum allowed split bio - 256KB at this moment (note that writes are not always split - we only do this when the I/O size exceeds the maximum allowed split size). In both cases, pblk's rate-limiter prevents the I/O from being written to the buffer, thus stalling. This patch fixes the original backpointer implementation by considering the threshold both on buffer creation and on the rate-limiter's path, when bio_split is triggered (case (ii) above). Fixes: 766c8ceb16fc ("lightnvm: pblk: guarantee that backpointer is respected on writer stall") Signed-off-by: Javier González Reviewed-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-rb.c | 25 +++++++++++++++++++------ drivers/lightnvm/pblk-rl.c | 5 ++--- drivers/lightnvm/pblk.h | 2 +- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index d4ca8c64ee0f..a6133b50ed9c 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -45,10 +45,23 @@ void pblk_rb_free(struct pblk_rb *rb) /* * pblk_rb_calculate_size -- calculate the size of the write buffer */ -static unsigned int pblk_rb_calculate_size(unsigned int nr_entries) +static unsigned int pblk_rb_calculate_size(unsigned int nr_entries, + unsigned int threshold) { - /* Alloc a write buffer that can at least fit 128 entries */ - return (1 << max(get_count_order(nr_entries), 7)); + unsigned int thr_sz = 1 << (get_count_order(threshold + NVM_MAX_VLBA)); + unsigned int max_sz = max(thr_sz, nr_entries); + unsigned int max_io; + + /* Alloc a write buffer that can (i) fit at least two split bios + * (considering max I/O size NVM_MAX_VLBA, and (ii) guarantee that the + * threshold will be respected + */ + max_io = (1 << max((int)(get_count_order(max_sz)), + (int)(get_count_order(NVM_MAX_VLBA << 1)))); + if ((threshold + NVM_MAX_VLBA) >= max_io) + max_io <<= 1; + + return max_io; } /* @@ -67,12 +80,12 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, unsigned int alloc_order, order, iter; unsigned int nr_entries; - nr_entries = pblk_rb_calculate_size(size); + nr_entries = pblk_rb_calculate_size(size, threshold); entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry))); if (!entries) return -ENOMEM; - power_size = get_count_order(size); + power_size = get_count_order(nr_entries); power_seg_sz = get_count_order(seg_size); down_write(&pblk_rb_lock); @@ -149,7 +162,7 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, * Initialize rate-limiter, which controls access to the write buffer * by user and GC I/O */ - pblk_rl_init(&pblk->rl, rb->nr_entries); + pblk_rl_init(&pblk->rl, rb->nr_entries, threshold); return 0; } diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 76116d5f78e4..b014957dde0b 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -207,7 +207,7 @@ void pblk_rl_free(struct pblk_rl *rl) del_timer(&rl->u_timer); } -void pblk_rl_init(struct pblk_rl *rl, int budget) +void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold) { struct pblk *pblk = container_of(rl, struct pblk, rl); struct nvm_tgt_dev *dev = pblk->dev; @@ -217,7 +217,6 @@ void pblk_rl_init(struct pblk_rl *rl, int budget) int sec_meta, blk_meta; unsigned int rb_windows; - /*
Consider sectors used for metadata */ sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); @@ -234,7 +233,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget) /* To start with, all buffer is available to user I/O writers */ rl->rb_budget = budget; rl->rb_user_max = budget; - rl->rb_max_io = budget >> 1; + rl->rb_max_io = threshold ? (budget - threshold) : (budget - 1); rl->rb_gc_max = 0; rl->rb_state = PBLK_RL_HIGH; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 72ae8755764e..a6386d5acd73 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -924,7 +924,7 @@ int pblk_gc_sysfs_force(struct pblk *pblk, int force); /* * pblk rate limiter */ -void pblk_rl_init(struct pblk_rl *rl, int budget); +void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold); void pblk_rl_free(struct pblk_rl *rl); void pblk_rl_update_rates(struct pblk_rl *rl); int pblk_rl_high_thrs(struct pblk_rl *rl); -- cgit v1.2.3-59-g8ed1b From 0586942f03b71bc95b0ee356ff6b09d53acbad06 Mon Sep 17 00:00:00 2001 From: Heiner Litz Date: Mon, 11 Feb 2019 13:25:09 +0100 Subject: lightnvm: pblk: fix race condition on GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes a race condition where a write is mapped to the last sectors of a line. The write is synced to the device but the L2P is not updated yet. When the line is garbage collected before the L2P update is performed, the sectors are ignored by the GC logic and the line is freed before all sectors are moved. When the L2P is finally updated, it contains a mapping to a freed line, and subsequent reads of the corresponding LBAs fail. This patch introduces a per-line counter specifying the number of sectors that are synced to the device but have not been updated in the L2P. Lines with a counter greater than zero will not be selected for GC.
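A simplified standalone model of the counter's lifecycle (a sketch only: names mirror the patch, but the atomic_t is replaced by a plain int and the callers are stand-ins for the functions touched by the diff below):

#include <stdio.h>

struct line {
	int sec_to_update;	/* sectors synced but not yet in the L2P */
};

static void map_sector(struct line *l)  { l->sec_to_update++; }	/* on mapping  */
static void l2p_update(struct line *l)  { l->sec_to_update--; }	/* on L2P sync */
static int gc_may_pick(struct line *l)  { return l->sec_to_update == 0; }

int main(void)
{
	struct line l = { 0 };

	map_sector(&l);				/* write mapped and synced */
	printf("%d\n", gc_may_pick(&l));	/* 0: skip, L2P still stale */
	l2p_update(&l);				/* L2P caught up */
	printf("%d\n", gc_may_pick(&l));	/* 1: safe to garbage collect */
	return 0;
}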
Signed-off-by: Heiner Litz Reviewed-by: Hans Holmberg Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 1 + drivers/lightnvm/pblk-gc.c | 20 +++++++++++++------- drivers/lightnvm/pblk-map.c | 1 + drivers/lightnvm/pblk-rb.c | 1 + drivers/lightnvm/pblk-write.c | 1 + drivers/lightnvm/pblk.h | 1 + 6 files changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 2a9e9facf44f..6ca868868fee 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1278,6 +1278,7 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) spin_unlock(&line->lock); kref_init(&line->ref); + atomic_set(&line->sec_to_update, 0); return 0; } diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 2fa118c8eb71..26a52ea7ec45 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -365,16 +365,22 @@ static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, struct list_head *group_list) { struct pblk_line *line, *victim; - int line_vsc, victim_vsc; + unsigned int line_vsc = ~0x0L, victim_vsc = ~0x0L; victim = list_first_entry(group_list, struct pblk_line, list); + list_for_each_entry(line, group_list, list) { - line_vsc = le32_to_cpu(*line->vsc); - victim_vsc = le32_to_cpu(*victim->vsc); - if (line_vsc < victim_vsc) + if (!atomic_read(&line->sec_to_update)) + line_vsc = le32_to_cpu(*line->vsc); + if (line_vsc < victim_vsc) { victim = line; + victim_vsc = le32_to_cpu(*victim->vsc); + } } + if (victim_vsc == ~0x0) + return NULL; + return victim; } @@ -448,13 +454,13 @@ next_gc_group: do { spin_lock(&l_mg->gc_lock); - if (list_empty(group_list)) { + + line = pblk_gc_get_victim_line(pblk, group_list); + if (!line) { spin_unlock(&l_mg->gc_lock); break; } - line = pblk_gc_get_victim_line(pblk, group_list); - spin_lock(&line->lock); WARN_ON(line->state != PBLK_LINESTATE_CLOSED); line->state = PBLK_LINESTATE_GC; diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 79df583ea709..7fbc99b60cac 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -73,6 +73,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, */ if (i < valid_secs) { kref_get(&line->ref); + atomic_inc(&line->sec_to_update); w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i); w_ctx->ppa = ppa_list[i]; meta->lba = cpu_to_le64(w_ctx->lba); diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index a6133b50ed9c..03c241b340ea 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -260,6 +260,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) entry->cacheline); line = pblk_ppa_to_line(pblk, w_ctx->ppa); + atomic_dec(&line->sec_to_update); kref_put(&line->ref, pblk_line_put); clean_wctx(w_ctx); rb->l2p_update = pblk_rb_ptr_wrap(rb, rb->l2p_update, 1); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 06d56deb645d..6593deab52da 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -177,6 +177,7 @@ static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry, * re-map these entries */ line = pblk_ppa_to_line(pblk, w_ctx->ppa); + atomic_dec(&line->sec_to_update); kref_put(&line->ref, pblk_line_put); } spin_unlock(&pblk->trans_lock); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index a6386d5acd73..ac3ab778e976 100644 --- 
a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -487,6 +487,7 @@ struct pblk_line { __le32 *vsc; /* Valid sector count in line */ struct kref ref; /* Write buffer L2P references */ + atomic_t sec_to_update; /* Outstanding L2P updates to ppa */ struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */ -- cgit v1.2.3-59-g8ed1b From b7143fe67bfc3b83a9e11371da659e1e70a1bbf3 Mon Sep 17 00:00:00 2001 From: Aleksei Zakharov Date: Mon, 11 Feb 2019 13:10:34 +0300 Subject: block: avoid setting wbt_lat_usec to current value There's no reason to set the wbt minimum latency and freeze the request queue if the current value is the same. Reviewed-by: Johannes Thumshirn Signed-off-by: Aleksei Zakharov Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 94e1b052abbc..59685918167e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -468,6 +468,9 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, else if (val >= 0) val *= 1000ULL; + if (wbt_get_min_lat(q) == val) + return count; + /* * Ensure that the queue is idled, in case the latency update * ends up either enabling or disabling wbt completely. We can't -- cgit v1.2.3-59-g8ed1b From fbd72127c975dc8e532ecc73d52f3b1b00935bec Mon Sep 17 00:00:00 2001 From: Aleksei Zakharov Date: Mon, 11 Feb 2019 13:50:37 +0300 Subject: block: avoid setting none scheduler if it's already none There's no reason to freeze the queue and remove the scheduler if there's no scheduler already. Signed-off-by: Aleksei Zakharov Signed-off-by: Jens Axboe --- block/elevator.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/elevator.c b/block/elevator.c index f05e90d4e695..d6d835a08de6 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -667,8 +667,11 @@ static int __elevator_change(struct request_queue *q, const char *name) /* * Special case for mq, turn off scheduling */ - if (!strncmp(name, "none", 4)) + if (!strncmp(name, "none", 4)) { + if (!q->elevator) + return 0; return elevator_switch(q, NULL); + } strlcpy(elevator_name, name, sizeof(elevator_name)); e = elevator_get(q, strstrip(elevator_name), true); -- cgit v1.2.3-59-g8ed1b From 8a2ee44a371c8cbef587ea609908c3cbf1645231 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2019 19:13:07 +0800 Subject: btrfs: look at bi_size for repair decisions bio_readpage_error currently uses bi_vcnt to decide if it is worth retrying an I/O. But the vector count is mostly an implementation artifact - it really should figure out if there is more than a single sector worth retrying. Use bi_size for that and shift by PAGE_SHIFT. This really should be blocks/sectors, but given that btrfs doesn't support a sector size different from the PAGE_SIZE, using the page size keeps the changes to a minimum.
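The arithmetic is simple; assuming 4K pages (PAGE_SHIFT == 12), a standalone check:

#include <stdio.h>

int main(void)
{
	unsigned int bi_size = 16384;			/* a 16 KiB failed bio */
	unsigned int failed_bio_pages = bi_size >> 12;	/* bi_size >> PAGE_SHIFT */

	printf("%u\n", failed_bio_pages);	/* 4: more than one sector, retry */
	return 0;
}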
Reviewed-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/btrfs/extent_io.c | 2 +- include/linux/bio.h | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 52abe4082680..dc8ba3ee515d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2350,7 +2350,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, int read_mode = 0; blk_status_t status; int ret; - unsigned failed_bio_pages = bio_pages_all(failed_bio); + unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); diff --git a/include/linux/bio.h b/include/linux/bio.h index 7380b094dcca..72b4f7be2106 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -263,12 +263,6 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) bv->bv_len = iter.bi_bvec_done; } -static inline unsigned bio_pages_all(struct bio *bio) -{ - WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); - return bio->bi_vcnt; -} - static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); -- cgit v1.2.3-59-g8ed1b From 1a67356e9a4829da2935dd338630a550c59c8489 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:08 +0800 Subject: block: don't use bio->bi_vcnt to figure out segment number It is wrong to use bio->bi_vcnt to figure out how many segments there are in the bio even though the CLONED flag isn't set on this bio, because this bio may have been split or advanced. So always use bio_segments() in blk_recount_segments(), and it shouldn't cause any performance loss now because the physical segment number is figured out in blk_queue_split() and BIO_SEG_VALID is set in the meantime since bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting"). Reviewed-by: Omar Sandoval Reviewed-by: Christoph Hellwig Fixes: 76d8137a3113 ("blk-merge: recaculate segment if it isn't less than max segments") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 71e9ac03f621..f85d878f313d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -367,13 +367,7 @@ void blk_recalc_rq_segments(struct request *rq) void blk_recount_segments(struct request_queue *q, struct bio *bio) { - unsigned short seg_cnt; - - /* estimate segment number by bi_vcnt for non-cloned bio */ - if (bio_flagged(bio, BIO_CLONED)) - seg_cnt = bio_segments(bio); - else - seg_cnt = bio->bi_vcnt; + unsigned short seg_cnt = bio_segments(bio); if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) && (seg_cnt < queue_max_segments(q))) -- cgit v1.2.3-59-g8ed1b From 19d62f6d00972f957c94aba0975c14490cfed385 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:09 +0800 Subject: block: remove bvec_iter_rewind() Commit 7759eb23fd980 ("block: remove bio_rewind_iter()") removed bio_rewind_iter(); since then no one uses bvec_iter_rewind() any more, so remove it.
Reviewed-by: Omar Sandoval Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bvec.h | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 02c73c6aa805..ba0ae40e77c9 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -92,30 +92,6 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, return true; } -static inline bool bvec_iter_rewind(const struct bio_vec *bv, - struct bvec_iter *iter, - unsigned int bytes) -{ - while (bytes) { - unsigned len = min(bytes, iter->bi_bvec_done); - - if (iter->bi_bvec_done == 0) { - if (WARN_ONCE(iter->bi_idx == 0, - "Attempted to rewind iter beyond " - "bvec's boundaries\n")) { - return false; - } - iter->bi_idx--; - iter->bi_bvec_done = __bvec_iter_bvec(bv, *iter)->bv_len; - continue; - } - bytes -= len; - iter->bi_size += len; - iter->bi_bvec_done -= len; - } - return true; -} - #define for_each_bvec(bvl, bio_vec, iter, start) \ for (iter = (start); \ (iter).bi_size && \ -- cgit v1.2.3-59-g8ed1b From 3d75ca0adef4280650c6690a0c4702a74a6f3c95 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:10 +0800 Subject: block: introduce multi-page bvec helpers This patch introduces 'mp_bvec_iter_*' helpers for multi-page bvec support. The introduced helpers treat one bvec as a real multi-page segment, which may include more than one page. The existing bvec_iter_* helpers are interfaces supporting the current bvec iterator, which is treated as single-page by drivers, fs, dm, etc. These introduced helpers will build single-page bvecs in flight, so this way won't break current bio/bvec users, which need no changes. Some multi-page bvec background follows: - bvecs stored in bio->bi_io_vec are always multi-page style - a bvec (struct bio_vec) represents one physically contiguous I/O buffer; now the buffer may include more than one page after multi-page bvec is supported, and all the pages represented by one bvec are physically contiguous. Before multi-page bvec support, at most one page was included in one bvec; we call that a single-page bvec.
- .bv_page of the bvec points to the 1st page in the multi-page bvec - .bv_offset of the bvec is the offset of the buffer in the bvec The effect on the current drivers/filesystem/dm/bcache/...: - almost everyone supposes that one bvec only includes one single page, so we keep the single-page (sp) interface unchanged; for example, bio_for_each_segment() still returns a single-page bvec - bio_for_each_segment_all() will return a single-page bvec too - during iteration, the iterator variable (struct bvec_iter) is always updated in multi-page bvec style, and bvec_iter_advance() is kept unchanged - the returned (copied) single-page bvec is built in flight by bvec helpers from the stored multi-page bvec Reviewed-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bvec.h | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ba0ae40e77c9..0ae729b1c9fe 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -23,6 +23,7 @@ #include #include #include +#include /* * was unsigned short, but we might as well be ready for > 64kB I/O pages @@ -50,16 +51,39 @@ struct bvec_iter { */ #define __bvec_iter_bvec(bvec, iter) (&(bvec)[(iter).bi_idx]) -#define bvec_iter_page(bvec, iter) \ +/* multi-page (mp_bvec) helpers */ +#define mp_bvec_iter_page(bvec, iter) \ (__bvec_iter_bvec((bvec), (iter))->bv_page) -#define bvec_iter_len(bvec, iter) \ +#define mp_bvec_iter_len(bvec, iter) \ min((iter).bi_size, \ __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done) -#define bvec_iter_offset(bvec, iter) \ +#define mp_bvec_iter_offset(bvec, iter) \ (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done) +#define mp_bvec_iter_page_idx(bvec, iter) \ + (mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE) + +#define mp_bvec_iter_bvec(bvec, iter) \ +((struct bio_vec) { \ + .bv_page = mp_bvec_iter_page((bvec), (iter)), \ + .bv_len = mp_bvec_iter_len((bvec), (iter)), \ + .bv_offset = mp_bvec_iter_offset((bvec), (iter)), \ +}) + +/* For building single-page bvec in flight */ + #define bvec_iter_offset(bvec, iter) \ + (mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE) + +#define bvec_iter_len(bvec, iter) \ + min_t(unsigned, mp_bvec_iter_len((bvec), (iter)), \ + PAGE_SIZE - bvec_iter_offset((bvec), (iter))) + +#define bvec_iter_page(bvec, iter) \ + nth_page(mp_bvec_iter_page((bvec), (iter)), \ + mp_bvec_iter_page_idx((bvec), (iter))) + #define bvec_iter_bvec(bvec, iter) \ ((struct bio_vec) { \ .bv_page = bvec_iter_page((bvec), (iter)), \ -- cgit v1.2.3-59-g8ed1b From d18d91740ad22e9d7998884c4d80523d0ba95ddf Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:11 +0800 Subject: block: introduce bio_for_each_bvec() and rq_for_each_bvec() bio_for_each_bvec() is used for iterating over multi-page bvecs in the bio split & merge code. rq_for_each_bvec() can be used for drivers which may handle the multi-page bvec directly; so far the loop driver is one perfect use case.
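A minimal kernel-side sketch of a driver using the new iterator (the helper below is hypothetical and assumes kernel context; each bvec it sees is a full physically contiguous segment that may span several pages):

/* hypothetical helper built on the macros introduced in this patch */
static void count_multipage_segments(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec bvec;
	unsigned int segs = 0, bytes = 0;

	rq_for_each_bvec(bvec, rq, iter) {
		segs++;			/* one multi-page segment */
		bytes += bvec.bv_len;	/* bv_len may exceed PAGE_SIZE */
	}
	pr_debug("%u multi-page segments, %u bytes\n", segs, bytes);
}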
Reviewed-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 10 ++++++++++ include/linux/blkdev.h | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/bio.h b/include/linux/bio.h index 72b4f7be2106..7ef8a7505c0a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -156,6 +156,16 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, #define bio_for_each_segment(bvl, bio, iter) \ __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter) +#define __bio_for_each_bvec(bvl, bio, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \ + bio_advance_iter((bio), &(iter), (bvl).bv_len)) + +/* iterate over multi-page bvec */ +#define bio_for_each_bvec(bvl, bio, iter) \ + __bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter) + #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) static inline unsigned bio_segments(struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3603270cb82d..b6292d469ea4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -792,6 +792,10 @@ struct req_iterator { __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_segment(bvl, _iter.bio, _iter.iter) +#define rq_for_each_bvec(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bio_for_each_bvec(bvl, _iter.bio, _iter.iter) + #define rq_iter_last(bvec, _iter) \ (_iter.bio->bi_next == NULL && \ bio_iter_last(bvec, _iter.iter)) -- cgit v1.2.3-59-g8ed1b From dcebd755926b0f39dd1e3ef75bd3b46943400df0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:12 +0800 Subject: block: use bio_for_each_bvec() to compute multi-page bvec count First, it is more efficient to use bio_for_each_bvec() in both blk_bio_segment_split() and __blk_recalc_rq_segments() to compute how many multi-page bvecs there are in the bio. Secondly, once bio_for_each_bvec() is used, the bvec may need to be split because its length can be much longer than the max segment size, so we have to split the big bvec into several segments. Thirdly, when splitting a multi-page bvec into segments, the max segment limit may be reached, so the bio split needs to be considered in this situation too. Reviewed-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 103 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 20 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index f85d878f313d..4ef56b2d2aa5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -161,6 +161,73 @@ static inline unsigned get_max_io_size(struct request_queue *q, return sectors; } +static unsigned get_max_segment_size(struct request_queue *q, + unsigned offset) +{ + unsigned long mask = queue_segment_boundary(q); + + /* default segment boundary mask means no boundary limit */ + if (mask == BLK_SEG_BOUNDARY_MASK) + return queue_max_segment_size(q); + + return min_t(unsigned long, mask - (mask & offset) + 1, + queue_max_segment_size(q)); +} + +/* + * Split the bvec @bv into segments, and update all kinds of + * variables.
+ */ +static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, + unsigned *nsegs, unsigned *last_seg_size, + unsigned *front_seg_size, unsigned *sectors) +{ + unsigned len = bv->bv_len; + unsigned total_len = 0; + unsigned new_nsegs = 0, seg_size = 0; + + /* + * Multi-page bvec may be too big to hold in one segment, so the + * current bvec has to be split into multiple segments. + */ + while (len && new_nsegs + *nsegs < queue_max_segments(q)) { + seg_size = get_max_segment_size(q, bv->bv_offset + total_len); + seg_size = min(seg_size, len); + + new_nsegs++; + total_len += seg_size; + len -= seg_size; + + if ((bv->bv_offset + total_len) & queue_virt_boundary(q)) + break; + } + + if (!new_nsegs) + return !!len; + + /* update front segment size */ + if (!*nsegs) { + unsigned first_seg_size; + + if (new_nsegs == 1) + first_seg_size = get_max_segment_size(q, bv->bv_offset); + else + first_seg_size = queue_max_segment_size(q); + + if (*front_seg_size < first_seg_size) + *front_seg_size = first_seg_size; + } + + /* update other variables */ + *last_seg_size = seg_size; + *nsegs += new_nsegs; + if (sectors) + *sectors += total_len >> 9; + + /* split in the middle of the bvec if len != 0 */ + return !!len; +} + static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *bio, struct bio_set *bs, @@ -174,7 +241,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *new = NULL; const unsigned max_sectors = get_max_io_size(q, bio); - bio_for_each_segment(bv, bio, iter) { + bio_for_each_bvec(bv, bio, iter) { /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. @@ -189,8 +256,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, */ if (nsegs < queue_max_segments(q) && sectors < max_sectors) { - nsegs++; - sectors = max_sectors; + /* split in the middle of bvec */ + bv.bv_len = (max_sectors - sectors) << 9; + bvec_split_segs(q, &bv, &nsegs, + &seg_size, + &front_seg_size, + &sectors); } goto split; } @@ -212,14 +283,12 @@ new_segment: if (nsegs == queue_max_segments(q)) goto split; - if (nsegs == 1 && seg_size > front_seg_size) - front_seg_size = seg_size; - - nsegs++; bvprv = bv; bvprvp = &bvprv; - seg_size = bv.bv_len; - sectors += bv.bv_len >> 9; + + if (bvec_split_segs(q, &bv, &nsegs, &seg_size, + &front_seg_size, &sectors)) + goto split; } @@ -233,8 +302,6 @@ split: bio = new; } - if (nsegs == 1 && seg_size > front_seg_size) - front_seg_size = seg_size; bio->bi_seg_front_size = front_seg_size; if (seg_size > bio->bi_seg_back_size) bio->bi_seg_back_size = seg_size; @@ -297,6 +364,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio_vec bv, bvprv = { NULL }; int prev = 0; unsigned int seg_size, nr_phys_segs; + unsigned front_seg_size = bio->bi_seg_front_size; struct bio *fbio, *bbio; struct bvec_iter iter; @@ -316,7 +384,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, seg_size = 0; nr_phys_segs = 0; for_each_bio(bio) { - bio_for_each_segment(bv, bio, iter) { + bio_for_each_bvec(bv, bio, iter) { /* * If SG merging is disabled, each bio vector is * a segment @@ -336,20 +404,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, continue; } new_segment: - if (nr_phys_segs == 1 && seg_size > - fbio->bi_seg_front_size) - fbio->bi_seg_front_size = seg_size; - - nr_phys_segs++; bvprv = bv; prev = 1; - seg_size = bv.bv_len; + bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size, + &front_seg_size, NULL); } bbio = bio;
} - if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size) - fbio->bi_seg_front_size = seg_size; + fbio->bi_seg_front_size = front_seg_size; if (seg_size > bbio->bi_seg_back_size) bbio->bi_seg_back_size = seg_size; -- cgit v1.2.3-59-g8ed1b From 862e5a5e6fa2c7fd514c179e9ceea27a49a3327f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:13 +0800 Subject: block: use bio_for_each_bvec() to map sg It is more efficient to use bio_for_each_bvec() to map sg; meanwhile we have to consider splitting the multi-page bvec, as done in blk_bio_segment_split(). Reviewed-by: Omar Sandoval Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 70 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 4ef56b2d2aa5..1912499b08b7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -464,6 +464,54 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, return biovec_phys_mergeable(q, &end_bv, &nxt_bv); } +static struct scatterlist *blk_next_sg(struct scatterlist **sg, + struct scatterlist *sglist) +{ + if (!*sg) + return sglist; + + /* + * If the driver previously mapped a shorter list, we could see a + * termination bit prematurely unless it fully inits the sg table + * on each mapping. We KNOW that there must be more entries here + * or the driver would be buggy, so force clear the termination bit + * to avoid doing a full sg_init_table() in drivers for each command. + */ + sg_unmark_end(*sg); + return sg_next(*sg); +} + +static unsigned blk_bvec_map_sg(struct request_queue *q, + struct bio_vec *bvec, struct scatterlist *sglist, + struct scatterlist **sg) +{ + unsigned nbytes = bvec->bv_len; + unsigned nsegs = 0, total = 0, offset = 0; + + while (nbytes > 0) { + unsigned seg_size; + struct page *pg; + unsigned idx; + + *sg = blk_next_sg(sg, sglist); + + seg_size = get_max_segment_size(q, bvec->bv_offset + total); + seg_size = min(nbytes, seg_size); + + offset = (total + bvec->bv_offset) % PAGE_SIZE; + idx = (total + bvec->bv_offset) / PAGE_SIZE; + pg = nth_page(bvec->bv_page, idx); + + sg_set_page(*sg, pg, seg_size, offset); + + total += seg_size; + nbytes -= seg_size; + nsegs++; + } + + return nsegs; +} + static inline void __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, struct scatterlist *sglist, struct bio_vec *bvprv, @@ -481,25 +529,7 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, (*sg)->length += nbytes; } else { new_segment: - if (!*sg) - *sg = sglist; - else { - /* - * If the driver previously mapped a shorter - * list, we could see a termination bit - * prematurely unless it fully inits the sg - * table on each mapping. We KNOW that there - * must be more entries here or the driver - * would be buggy, so force clear the - * termination bit to avoid doing a full - * sg_init_table() in drivers for each command.
- */ - sg_unmark_end(*sg); - *sg = sg_next(*sg); - } - - sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); - (*nsegs)++; + (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg); } *bvprv = *bvec; } @@ -521,7 +551,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, int nsegs = 0; for_each_bio(bio) - bio_for_each_segment(bvec, bio, iter) + bio_for_each_bvec(bvec, bio, iter) __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg, &nsegs); -- cgit v1.2.3-59-g8ed1b From 45a3fb95298b326ab8175f2bd97bd8666017b692 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:14 +0800 Subject: block: introduce mp_bvec_last_segment() BTRFS and guard_bio_eod() need to get the last single-page segment from one multi-page bvec, so introduce this helper to make them happy. Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bvec.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 0ae729b1c9fe..21f76bad7be2 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -131,4 +131,26 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, .bi_bvec_done = 0, \ } +/* + * Get the last single-page segment from the multi-page bvec and store it + * in @seg + */ +static inline void mp_bvec_last_segment(const struct bio_vec *bvec, + struct bio_vec *seg) +{ + unsigned total = bvec->bv_offset + bvec->bv_len; + unsigned last_page = (total - 1) / PAGE_SIZE; + + seg->bv_page = nth_page(bvec->bv_page, last_page); + + /* the whole segment is inside the last page */ + if (bvec->bv_offset >= last_page * PAGE_SIZE) { + seg->bv_offset = bvec->bv_offset % PAGE_SIZE; + seg->bv_len = bvec->bv_len; + } else { + seg->bv_offset = 0; + seg->bv_len = total - last_page * PAGE_SIZE; + } +} + #endif /* __LINUX_BVEC_ITER_H */ -- cgit v1.2.3-59-g8ed1b From f70f44640759728d6e31326acbee08ca22d1066f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:15 +0800 Subject: fs/buffer.c: use bvec iterator to truncate the bio Once multi-page bvec is enabled, the last bvec may include more than one page, so this patch uses mp_bvec_last_segment() to truncate the bio. Reviewed-by: Omar Sandoval Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- fs/buffer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index 52d024bfdbc1..817871274c77 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3032,7 +3032,10 @@ void guard_bio_eod(int op, struct bio *bio) /* ..and clear the end of the buffer for reads */ if (op == REQ_OP_READ) { - zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len, + struct bio_vec bv; + + mp_bvec_last_segment(bvec, &bv); + zero_user(bv.bv_page, bv.bv_offset + bv.bv_len, truncated_bytes); } } -- cgit v1.2.3-59-g8ed1b From c3a7ce738009912f9d237bdabf4a20038522de10 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:16 +0800 Subject: btrfs: use mp_bvec_last_segment to get bio's last page Preparing for supporting multi-page bvec.
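To sanity-check the mp_bvec_last_segment() arithmetic shown above, here is a self-contained userspace C sketch — an editorial illustration, not part of the series. Page indices replace struct page pointers, and a 4 KiB page size is assumed:

#include <assert.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096u

static void last_segment(unsigned bv_offset, unsigned bv_len,
			 unsigned *page_idx, unsigned *off, unsigned *len)
{
	unsigned total = bv_offset + bv_len;
	unsigned last_page = (total - 1) / MODEL_PAGE_SIZE;

	*page_idx = last_page;	/* models nth_page(bv_page, last_page) */
	if (bv_offset >= last_page * MODEL_PAGE_SIZE) {
		/* the whole bvec already sits inside the last page */
		*off = bv_offset % MODEL_PAGE_SIZE;
		*len = bv_len;
	} else {
		/* only the tail of the bvec is in the last page */
		*off = 0;
		*len = total - last_page * MODEL_PAGE_SIZE;
	}
}

int main(void)
{
	unsigned page, off, len;

	/* bvec spanning three pages: the last segment is just the tail */
	last_segment(512, 10000, &page, &off, &len);
	assert(page == 2 && off == 0 && len == 10512 - 2 * 4096);

	/* single-page bvec: the last segment is the bvec itself */
	last_segment(512, 1024, &page, &off, &len);
	assert(page == 0 && off == 512 && len == 1024);

	printf("mp_bvec_last_segment model OK\n");
	return 0;
}

Both callers above fit this pattern: guard_bio_eod() zeroes the truncated tail within that last page, and btrfs uses it to locate the page backing the last bvec.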
Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- fs/btrfs/extent_io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dc8ba3ee515d..986ef49b0269 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2697,11 +2697,12 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, { blk_status_t ret = 0; struct bio_vec *bvec = bio_last_bvec_all(bio); - struct page *page = bvec->bv_page; + struct bio_vec bv; struct extent_io_tree *tree = bio->bi_private; u64 start; - start = page_offset(page) + bvec->bv_offset; + mp_bvec_last_segment(bvec, &bv); + start = page_offset(bv.bv_page) + bv.bv_offset; bio->bi_private = NULL; -- cgit v1.2.3-59-g8ed1b From 86af5952a8470f96b53830372c64469cb7ce780c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:17 +0800 Subject: block: loop: pass multi-page bvec to iov_iter iov_iter is implemented on top of the bvec iterator helpers, so it is safe to pass a multi-page bvec to it, and this way is much more efficient than passing one page in each bvec. Reviewed-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index cf5538942834..8ef583197414 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -511,21 +511,22 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos, bool rw) { struct iov_iter iter; + struct req_iterator rq_iter; struct bio_vec *bvec; struct request *rq = blk_mq_rq_from_pdu(cmd); struct bio *bio = rq->bio; struct file *file = lo->lo_backing_file; + struct bio_vec tmp; unsigned int offset; - int segments = 0; + int nr_bvec = 0; int ret; + rq_for_each_bvec(tmp, rq, rq_iter) + nr_bvec++; + if (rq->bio != rq->biotail) { - struct req_iterator iter; - struct bio_vec tmp; - __rq_for_each_bio(bio, rq) - segments += bio_segments(bio); - bvec = kmalloc_array(segments, sizeof(struct bio_vec), + bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), GFP_NOIO); if (!bvec) return -EIO; @@ -534,10 +535,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, /* * The bios of the request may be started from the middle of * the 'bvec' because of bio splitting, so we can't directly * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec * API will take care of all details for us. */ - rq_for_each_segment(tmp, rq, iter) { + rq_for_each_bvec(tmp, rq, rq_iter) { *bvec = tmp; bvec++; } @@ -551,11 +552,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, */ offset = bio->bi_iter.bi_bvec_done; bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); - segments = bio_segments(bio); } atomic_set(&cmd->ref, 2); - iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq)); + iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); iter.iov_offset = offset; cmd->iocb.ki_pos = pos; -- cgit v1.2.3-59-g8ed1b From 2e1f4f4d2481d8bf111904c3e45fc0c4c94bf76e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:18 +0800 Subject: bcache: avoid to use bio_for_each_segment_all() in bch_bio_alloc_pages() bch_bio_alloc_pages() is always called on a new bio, so it is safe to access the bvec table directly.
Given it is the only case of this kind, open code the bvec table access, since bio_for_each_segment_all() will be changed to support iterating over multi-page bvecs. Acked-by: Coly Li Reviewed-by: Omar Sandoval Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/md/bcache/util.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 20eddeac1531..62fb917f7a4f 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -270,7 +270,11 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) int i; struct bio_vec *bv; - bio_for_each_segment_all(bv, bio, i) { + /* + * This is called on a freshly created bio, so it is safe to access the + * bvec table directly. + */ + for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) { bv->bv_page = alloc_page(gfp_mask); if (!bv->bv_page) { while (--bv >= bio->bi_io_vec) -- cgit v1.2.3-59-g8ed1b From 6dc4f100c175dd0511ae8674786e7c9006cdfbfa Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:19 +0800 Subject: block: allow bio_for_each_segment_all() to iterate over multi-page bvec This patch introduces one extra iterator variable to bio_for_each_segment_all(), so that bio_for_each_segment_all() can iterate over multi-page bvecs. Given it is just a mechanical & simple change on all bio_for_each_segment_all() users, this patch does the tree-wide change in one single patch, so that we can avoid using a temporary helper for this conversion. Reviewed-by: Omar Sandoval Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 27 ++++++++++++++++++--------- block/bounce.c | 6 ++++-- drivers/md/bcache/btree.c | 3 ++- drivers/md/dm-crypt.c | 3 ++- drivers/md/raid1.c | 3 ++- drivers/staging/erofs/data.c | 3 ++- drivers/staging/erofs/unzip_vle.c | 3 ++- fs/block_dev.c | 6 ++++-- fs/btrfs/compression.c | 3 ++- fs/btrfs/disk-io.c | 3 ++- fs/btrfs/extent_io.c | 9 ++++++--- fs/btrfs/inode.c | 6 ++++-- fs/btrfs/raid56.c | 3 ++- fs/crypto/bio.c | 3 ++- fs/direct-io.c | 4 +++- fs/exofs/ore.c | 3 ++- fs/exofs/ore_raid.c | 3 ++- fs/ext4/page-io.c | 3 ++- fs/ext4/readpage.c | 3 ++- fs/f2fs/data.c | 9 ++++++--- fs/gfs2/lops.c | 9 ++++++--- fs/gfs2/meta_io.c | 3 ++- fs/iomap.c | 6 ++++-- fs/mpage.c | 3 ++- fs/xfs/xfs_aops.c | 5 +++-- include/linux/bio.h | 11 +++++++++-- include/linux/bvec.h | 30 ++++++++++++++++++++++++++++++ 27 files changed, 127 insertions(+), 46 deletions(-) diff --git a/block/bio.c b/block/bio.c index 4db1008309ed..968b12fea564 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1072,8 +1072,9 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) { int i; struct bio_vec *bvec; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { ssize_t ret; ret = copy_page_from_iter(bvec->bv_page, @@ -1103,8 +1104,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) { int i; struct bio_vec *bvec; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { ssize_t ret; ret = copy_page_to_iter(bvec->bv_page, @@ -1126,8 +1128,9 @@ void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) __free_page(bvec->bv_page); } EXPORT_SYMBOL(bio_free_pages); @@ -1295,6 +1298,7 @@ struct
bio *bio_map_user_iov(struct request_queue *q, struct bio *bio; int ret; struct bio_vec *bvec; + struct bvec_iter_all iter_all; if (!iov_iter_count(iter)) return ERR_PTR(-EINVAL); @@ -1368,7 +1372,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, return bio; out_unmap: - bio_for_each_segment_all(bvec, bio, j) { + bio_for_each_segment_all(bvec, bio, j, iter_all) { put_page(bvec->bv_page); } bio_put(bio); @@ -1379,11 +1383,12 @@ static void __bio_unmap_user(struct bio *bio) { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; /* * make sure we dirty pages we wrote to */ - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { if (bio_data_dir(bio) == READ) set_page_dirty_lock(bvec->bv_page); @@ -1475,8 +1480,9 @@ static void bio_copy_kern_endio_read(struct bio *bio) char *p = bio->bi_private; struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { memcpy(p, page_address(bvec->bv_page), bvec->bv_len); p += bvec->bv_len; } @@ -1585,8 +1591,9 @@ void bio_set_pages_dirty(struct bio *bio) { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { if (!PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); } @@ -1596,8 +1603,9 @@ static void bio_release_pages(struct bio *bio) { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) put_page(bvec->bv_page); } @@ -1644,8 +1652,9 @@ void bio_check_pages_dirty(struct bio *bio) struct bio_vec *bvec; unsigned long flags; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) goto defer; } diff --git a/block/bounce.c b/block/bounce.c index ffb9e9ecfa7e..add085e28b1d 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -165,11 +165,12 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) struct bio_vec *bvec, orig_vec; int i; struct bvec_iter orig_iter = bio_orig->bi_iter; + struct bvec_iter_all iter_all; /* * free up bounce indirect pages used */ - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { orig_vec = bio_iter_iovec(bio_orig, orig_iter); if (bvec->bv_page != orig_vec.bv_page) { dec_zone_page_state(bvec->bv_page, NR_BOUNCE); @@ -294,6 +295,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, bool bounce = false; int sectors = 0; bool passthrough = bio_is_passthrough(*bio_orig); + struct bvec_iter_all iter_all; bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_PAGES) @@ -313,7 +315,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? 
NULL : &bounce_bio_set); - bio_for_each_segment_all(to, bio, i) { + bio_for_each_segment_all(to, bio, i, iter_all) { struct page *page = to->bv_page; if (page_to_pfn(page) <= q->limits.bounce_pfn) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 23cb1dc7296b..64def336f053 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -432,8 +432,9 @@ static void do_btree_node_write(struct btree *b) int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, b->bio, j) + bio_for_each_segment_all(bv, b->bio, j, iter_all) memcpy(page_address(bv->bv_page), base + j * PAGE_SIZE, PAGE_SIZE); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 47d4e0d30bf0..9a29037f5615 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1447,8 +1447,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) { unsigned int i; struct bio_vec *bv; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, clone, i) { + bio_for_each_segment_all(bv, clone, i, iter_all) { BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, &cc->page_pool); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7e63ccc4ae7b..88c61d3090b0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2112,13 +2112,14 @@ static void process_checks(struct r1bio *r1_bio) struct page **spages = get_resync_pages(sbio)->pages; struct bio_vec *bi; int page_len[RESYNC_PAGES] = { 0 }; + struct bvec_iter_all iter_all; if (sbio->bi_end_io != end_sync_read) continue; /* Now we can 'fixup' the error value */ sbio->bi_status = 0; - bio_for_each_segment_all(bi, sbio, j) + bio_for_each_segment_all(bi, sbio, j, iter_all) page_len[j] = bi->bv_len; if (!status) { diff --git a/drivers/staging/erofs/data.c b/drivers/staging/erofs/data.c index 5a55f0bfdfbb..4871ba7b7d9a 100644 --- a/drivers/staging/erofs/data.c +++ b/drivers/staging/erofs/data.c @@ -20,8 +20,9 @@ static inline void read_endio(struct bio *bio) int i; struct bio_vec *bvec; const blk_status_t err = bio->bi_status; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; /* page is already locked */ diff --git a/drivers/staging/erofs/unzip_vle.c b/drivers/staging/erofs/unzip_vle.c index 4ac1099a39c6..c057c5616b1d 100644 --- a/drivers/staging/erofs/unzip_vle.c +++ b/drivers/staging/erofs/unzip_vle.c @@ -830,8 +830,9 @@ static inline void z_erofs_vle_read_endio(struct bio *bio) #ifdef EROFS_FS_HAS_MANAGED_CACHE struct address_space *mc = NULL; #endif + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; bool cachemngd = false; diff --git a/fs/block_dev.c b/fs/block_dev.c index 58a4c1217fa8..7758adee6efe 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -211,6 +211,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, ssize_t ret; blk_qc_t qc; int i; + struct bvec_iter_all iter_all; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) @@ -260,7 +261,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, } __set_current_state(TASK_RUNNING); - bio_for_each_segment_all(bvec, &bio, i) { + bio_for_each_segment_all(bvec, &bio, i, iter_all) { if (should_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); 
put_page(bvec->bv_page); @@ -329,8 +330,9 @@ static void blkdev_bio_end_io(struct bio *bio) } else { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) put_page(bvec->bv_page); bio_put(bio); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 548057630b69..6896ea60c843 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -162,13 +162,14 @@ csum_failed: } else { int i; struct bio_vec *bvec; + struct bvec_iter_all iter_all; /* * we have verified the checksum already, set page * checked so the end_io handlers know about it */ ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, i) + bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all) SetPageChecked(bvec->bv_page); bio_endio(cb->orig_bio); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6a2a2a951705..ca1b7da6dd1b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -832,9 +832,10 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) struct bio_vec *bvec; struct btrfs_root *root; int i, ret = 0; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); if (ret) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 986ef49b0269..4ed58c9a94a9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2422,9 +2422,10 @@ static void end_bio_extent_writepage(struct bio *bio) u64 start; u64 end; int i; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2493,9 +2494,10 @@ static void end_bio_extent_readpage(struct bio *bio) int mirror; int ret; int i; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3635,9 +3637,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) struct bio_vec *bvec; struct extent_buffer *eb; int i, done; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; eb = (struct extent_buffer *)page->private; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5c349667c761..7ade5769f691 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7777,6 +7777,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) struct bio_vec *bvec; struct extent_io_tree *io_tree, *failure_tree; int i; + struct bvec_iter_all iter_all; if (bio->bi_status) goto end; @@ -7788,7 +7789,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) done->uptodate = 1; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, io_tree, done->start, bvec->bv_page, btrfs_ino(BTRFS_I(inode)), 0); @@ -7867,6 +7868,7 @@ static void btrfs_retry_endio(struct 
bio *bio) int uptodate; int ret; int i; + struct bvec_iter_all iter_all; if (bio->bi_status) goto end; @@ -7880,7 +7882,7 @@ static void btrfs_retry_endio(struct bio *bio) failure_tree = &BTRFS_I(inode)->io_failure_tree; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, bvec->bv_offset, done->start, bvec->bv_len); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index e74455eb42f9..1869ba8e5981 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1443,10 +1443,11 @@ static void set_bio_pages_uptodate(struct bio *bio) { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) SetPageUptodate(bvec->bv_page); } diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 0959044c5cee..5759bcd018cd 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -30,8 +30,9 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done) { struct bio_vec *bv; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter_all) { struct page *page = bv->bv_page; int ret = fscrypt_decrypt_page(page->mapping->host, page, PAGE_SIZE, 0, page->index); diff --git a/fs/direct-io.c b/fs/direct-io.c index ec2fb6fe6d37..9bb015bc4a83 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -551,7 +551,9 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) { bio_check_pages_dirty(bio); /* transfers ownership */ } else { - bio_for_each_segment_all(bvec, bio, i) { + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; if (dio->op == REQ_OP_READ && !PageCompound(page) && diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 5331a15a61f1..24a8e34882e9 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -420,8 +420,9 @@ static void _clear_bio(struct bio *bio) { struct bio_vec *bv; unsigned i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter_all) { unsigned this_count = bv->bv_len; if (likely(PAGE_SIZE == this_count)) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 199590f36203..e83bab54b03e 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -468,11 +468,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) /* loop on all devices all pages */ for (d = 0; d < ios->numdevs; d++) { struct bio *bio = ios->per_dev[d].bio; + struct bvec_iter_all iter_all; if (!bio) continue; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter_all) { struct page *page = bv->bv_page; SetPageUptodate(page); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 2aa62d58d8dd..cff4c4aa7a9c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -63,8 +63,9 @@ static void ext4_finish_bio(struct bio *bio) { int i; struct bio_vec *bvec; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; #ifdef CONFIG_EXT4_FS_ENCRYPTION struct page *data_page = NULL; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 6aa282ee455a..e53639784892 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -72,6 
+72,7 @@ static void mpage_end_io(struct bio *bio) { struct bio_vec *bv; int i; + struct bvec_iter_all iter_all; if (ext4_bio_encrypted(bio)) { if (bio->bi_status) { @@ -81,7 +82,7 @@ static void mpage_end_io(struct bio *bio) return; } } - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter_all) { struct page *page = bv->bv_page; if (!bio->bi_status) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f91d8630c9a2..da060b77f64d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -87,8 +87,9 @@ static void __read_end_io(struct bio *bio) struct page *page; struct bio_vec *bv; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter_all) { page = bv->bv_page; /* PG_error was set if any post_read step failed */ @@ -164,13 +165,14 @@ static void f2fs_write_end_io(struct bio *bio) struct f2fs_sb_info *sbi = bio->bi_private; struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; if (time_to_inject(sbi, FAULT_WRITE_IO)) { f2fs_show_injection_info(FAULT_WRITE_IO); bio->bi_status = BLK_STS_IOERR; } - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); @@ -347,6 +349,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, struct bio_vec *bvec; struct page *target; int i; + struct bvec_iter_all iter_all; if (!io->bio) return false; @@ -354,7 +357,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, if (!inode && !page && !ino) return true; - bio_for_each_segment_all(bvec, io->bio, i) { + bio_for_each_segment_all(bvec, io->bio, i, iter_all) { if (bvec->bv_page->mapping) target = bvec->bv_page; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 94dcab655bc0..15deefeaafd0 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -170,7 +170,8 @@ u64 gfs2_log_bmap(struct gfs2_sbd *sdp) * that is pinned in the pagecache. 
*/ -static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, +static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, + struct bio_vec *bvec, blk_status_t error) { struct buffer_head *bh, *next; @@ -208,6 +209,7 @@ static void gfs2_end_log_write(struct bio *bio) struct bio_vec *bvec; struct page *page; int i; + struct bvec_iter_all iter_all; if (bio->bi_status) { fs_err(sdp, "Error %d writing to journal, jid=%u\n", @@ -215,7 +217,7 @@ static void gfs2_end_log_write(struct bio *bio) wake_up(&sdp->sd_logd_waitq); } - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { page = bvec->bv_page; if (page_has_buffers(page)) gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); @@ -388,8 +390,9 @@ static void gfs2_end_log_read(struct bio *bio) struct page *page; struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { page = bvec->bv_page; if (bio->bi_status) { int err = blk_status_to_errno(bio->bi_status); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index be9c0bf697fe..3201342404a7 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -190,8 +190,9 @@ static void gfs2_meta_read_endio(struct bio *bio) { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; struct buffer_head *bh = page_buffers(page); unsigned int len = bvec->bv_len; diff --git a/fs/iomap.c b/fs/iomap.c index a3088fae567b..af736acd9006 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -267,8 +267,9 @@ iomap_read_end_io(struct bio *bio) int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) iomap_read_page_end_io(bvec, error); bio_put(bio); } @@ -1559,8 +1560,9 @@ static void iomap_dio_bio_end_io(struct bio *bio) } else { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) put_page(bvec->bv_page); bio_put(bio); } diff --git a/fs/mpage.c b/fs/mpage.c index c820dc9bebab..3f19da75178b 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -48,8 +48,9 @@ static void mpage_end_io(struct bio *bio) { struct bio_vec *bv; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter_all) { struct page *page = bv->bv_page; page_endio(page, bio_op(bio), blk_status_to_errno(bio->bi_status)); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 338b9d9984e0..1f1829e506e8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -62,7 +62,7 @@ xfs_find_daxdev_for_inode( static void xfs_finish_page_writeback( struct inode *inode, - struct bio_vec *bvec, + struct bio_vec *bvec, int error) { struct iomap_page *iop = to_iomap_page(bvec->bv_page); @@ -98,6 +98,7 @@ xfs_destroy_ioend( for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; /* * For the last bio, bi_private points to the ioend, so we @@ -109,7 +110,7 @@ xfs_destroy_ioend( next = bio->bi_private; /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) xfs_finish_page_writeback(inode, bvec, error); bio_put(bio); } diff --git a/include/linux/bio.h 
b/include/linux/bio.h index 7ef8a7505c0a..089370eb84d9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -128,12 +128,19 @@ static inline bool bio_full(struct bio *bio) return bio->bi_vcnt >= bio->bi_max_vecs; } +#define mp_bvec_for_each_segment(bv, bvl, i, iter_all) \ + for (bv = bvec_init_iter_all(&iter_all); \ + (iter_all.done < (bvl)->bv_len) && \ + (mp_bvec_next_segment((bvl), &iter_all), 1); \ + iter_all.done += bv->bv_len, i += 1) + /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it */ -#define bio_for_each_segment_all(bvl, bio, i) \ - for (i = 0, bvl = (bio)->bi_io_vec; i < (bio)->bi_vcnt; i++, bvl++) +#define bio_for_each_segment_all(bvl, bio, i, iter_all) \ + for (i = 0, iter_all.idx = 0; iter_all.idx < (bio)->bi_vcnt; iter_all.idx++) \ + mp_bvec_for_each_segment(bvl, &((bio)->bi_io_vec[iter_all.idx]), i, iter_all) static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, unsigned bytes) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 21f76bad7be2..30a57b68d017 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -45,6 +45,12 @@ struct bvec_iter { current bvec */ }; +struct bvec_iter_all { + struct bio_vec bv; + int idx; + unsigned done; +}; + /* * various member access, note that bio_data should of course not be used * on highmem page vectors @@ -131,6 +137,30 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, .bi_bvec_done = 0, \ } +static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all) +{ + iter_all->bv.bv_page = NULL; + iter_all->done = 0; + + return &iter_all->bv; +} + +static inline void mp_bvec_next_segment(const struct bio_vec *bvec, + struct bvec_iter_all *iter_all) +{ + struct bio_vec *bv = &iter_all->bv; + + if (bv->bv_page) { + bv->bv_page = nth_page(bv->bv_page, 1); + bv->bv_offset = 0; + } else { + bv->bv_page = bvec->bv_page; + bv->bv_offset = bvec->bv_offset; + } + bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset, + bvec->bv_len - iter_all->done); +} + /* * Get the last single-page segment from the multi-page bvec and store it * in @seg -- cgit v1.2.3-59-g8ed1b From 07173c3ec276cbb18dc0e0687d37d310e98a1480 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:20 +0800 Subject: block: enable multipage bvecs This patch pulls the trigger for multi-page bvecs. Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 22 +++++++++++++++------- fs/iomap.c | 4 ++-- fs/xfs/xfs_aops.c | 4 ++-- include/linux/bio.h | 2 +- 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/block/bio.c b/block/bio.c index 968b12fea564..83a2dfa417ca 100644 --- a/block/bio.c +++ b/block/bio.c @@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page); * @page: page to add * @len: length of the data to add * @off: offset of the data in @page + * @same_page: if %true only merge if the new data is in the same physical + * page as the last segment of the bio. * * Try to add the data at @page + @off to the last bvec of @bio. This is a * a useful optimisation for file systems with a block size smaller than the @@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page); * Return %true on success or %false on failure. 
*/ bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off) + unsigned int len, unsigned int off, bool same_page) { if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return false; if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + + bv->bv_offset + bv->bv_len - 1; + phys_addr_t page_addr = page_to_phys(page); - if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) { - bv->bv_len += len; - bio->bi_iter.bi_size += len; - return true; - } + if (vec_end_addr + 1 != page_addr + off) + return false; + if (same_page && (vec_end_addr & PAGE_MASK) != page_addr) + return false; + + bv->bv_len += len; + bio->bi_iter.bi_size += len; + return true; } return false; } @@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page); int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - if (!__bio_try_merge_page(bio, page, len, offset)) { + if (!__bio_try_merge_page(bio, page, len, offset, false)) { if (bio_full(bio)) return 0; __bio_add_page(bio, page, len, offset); diff --git a/fs/iomap.c b/fs/iomap.c index af736acd9006..0c350e658b7f 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -318,7 +318,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, */ sector = iomap_sector(iomap, pos); if (ctx->bio && bio_end_sector(ctx->bio) == sector) { - if (__bio_try_merge_page(ctx->bio, page, plen, poff)) + if (__bio_try_merge_page(ctx->bio, page, plen, poff, true)) goto done; is_contig = true; } @@ -349,7 +349,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, ctx->bio->bi_end_io = iomap_read_end_io; } - __bio_add_page(ctx->bio, page, plen, poff); + bio_add_page(ctx->bio, page, plen, poff); done: /* * Move the caller beyond our range so that it keeps making progress. diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 1f1829e506e8..b9fd44168f61 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -616,12 +616,12 @@ xfs_add_to_ioend( bdev, sector); } - if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) { + if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) { if (iop) atomic_inc(&iop->write_count); if (bio_full(wpc->ioend->io_bio)) xfs_chain_bio(wpc->ioend, wbc, bdev, sector); - __bio_add_page(wpc->ioend->io_bio, page, len, poff); + bio_add_page(wpc->ioend->io_bio, page, len, poff); } wpc->ioend->io_size += len; diff --git a/include/linux/bio.h b/include/linux/bio.h index 089370eb84d9..9f77adcfde82 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -441,7 +441,7 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off); + unsigned int len, unsigned int off, bool same_page); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); -- cgit v1.2.3-59-g8ed1b From 6861428921b51113520cd47897be6c2774e4fc58 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:21 +0800 Subject: block: always define BIO_MAX_PAGES as 256 Now multi-page bvec can cover CONFIG_THP_SWAP, so we don't need to increase BIO_MAX_PAGES for it. CONFIG_THP_SWAP needs to split one THP into normal pages and add them all to one bio.
With multi-page bvec, it just takes one bvec to hold them all. Reviewed-by: Omar Sandoval Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 9f77adcfde82..bdd11d4c2f05 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -34,15 +34,7 @@ #define BIO_BUG_ON #endif -#ifdef CONFIG_THP_SWAP -#if HPAGE_PMD_NR > 256 -#define BIO_MAX_PAGES HPAGE_PMD_NR -#else #define BIO_MAX_PAGES 256 -#endif -#else -#define BIO_MAX_PAGES 256 -#endif #define bio_prio(bio) (bio)->bi_ioprio #define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) -- cgit v1.2.3-59-g8ed1b From ac4fa1d107addb2c6b21067d8945a39316a09fc8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:22 +0800 Subject: block: document usage of bio iterator helpers Now that multi-page bvec is supported, some helpers may return data page by page, while others may return it segment by segment; this patch documents the usage. Reviewed-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- Documentation/block/biovecs.txt | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt index 25689584e6e0..ce6eccaf5df7 100644 --- a/Documentation/block/biovecs.txt +++ b/Documentation/block/biovecs.txt @@ -117,3 +117,28 @@ Other implications: size limitations and the limitations of the underlying devices. Thus there's no need to define ->merge_bvec_fn() callbacks for individual block drivers. + +Usage of helpers: +================= + +* The following helpers whose names have the suffix of "_all" can only be used +on a non-BIO_CLONED bio. They are usually used by filesystem code. Drivers +shouldn't use them because the bio may have been split before it reached the +driver. + + bio_for_each_segment_all() + bio_first_bvec_all() + bio_first_page_all() + bio_last_bvec_all() + +* The following helpers iterate over single-page segments. The passed 'struct +bio_vec' will contain a single-page IO vector during the iteration. + + bio_for_each_segment() + bio_for_each_segment_all() + +* The following helpers iterate over multi-page bvecs. The passed 'struct +bio_vec' will contain a multi-page IO vector during the iteration. + + bio_for_each_bvec() + rq_for_each_bvec() -- cgit v1.2.3-59-g8ed1b From 2705c93742e91730d335838025d75d8043861174 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:23 +0800 Subject: block: kill QUEUE_FLAG_NO_SG_MERGE Since bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting"), the physical segment count is mainly figured out in blk_queue_split() for the fast path, and the flag of BIO_SEG_VALID is set there too. Now only blk_recount_segments() and blk_recalc_rq_segments() use this flag. Basically blk_recount_segments() is bypassed in the fast path given BIO_SEG_VALID is set in blk_queue_split(). As for the other users of blk_recalc_rq_segments(): - it runs in the partial completion branch of blk_update_request(), which is an unusual case - it runs in blk_cloned_rq_check_limits(), still not a big problem if the flag is killed since dm-rq is the only user. Multi-page bvec is enabled now, so not doing S/G merging is rather pointless with the current setup of the I/O path, as it isn't going to save you a significant amount of cycles.
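An editorial aside on the segment accounting that this cleanup leans on: the sketch below is a standalone userspace model of the get_max_segment_size()/bvec_split_segs() logic added earlier in the series (block/blk-merge.c), showing why one multi-page bvec can still turn into several hardware segments. The boundary mask, maximum segment size and bvec geometry are made-up example values, not real queue limits:

#include <stdio.h>

/* models get_max_segment_size(): a segment must not cross the boundary */
static unsigned max_seg_at(unsigned long boundary_mask,
			   unsigned max_segment_size, unsigned offset)
{
	unsigned long room = boundary_mask - (boundary_mask & offset) + 1;

	return room < max_segment_size ? (unsigned)room : max_segment_size;
}

int main(void)
{
	unsigned long mask = 0xffff;	/* 64K segment boundary */
	unsigned max_seg = 32768;	/* 32K max segment size */
	unsigned offset = 512, len = 102400, nsegs = 0;	/* one ~100K bvec */

	/* the core of bvec_split_segs(), minus the virt-boundary check */
	while (len) {
		unsigned seg = max_seg_at(mask, max_seg, offset);

		if (seg > len)
			seg = len;
		printf("segment %u: offset %u, len %u\n", nsegs, offset, seg);
		offset += seg;
		len -= seg;
		nsegs++;
	}
	printf("one bvec -> %u hardware segments\n", nsegs);
	return 0;
}

With the segment count derived this way per bvec, a separate S/G-merge pass (the old NO_SG_MERGE toggle) no longer buys anything, which is what the two removal patches that follow rely on.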
Reviewed-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 31 ++++++------------------------- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 3 --- drivers/md/dm-table.c | 13 ------------- include/linux/blkdev.h | 1 - 5 files changed, 6 insertions(+), 43 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 1912499b08b7..bed065904677 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -358,8 +358,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio) EXPORT_SYMBOL(blk_queue_split); static unsigned int __blk_recalc_rq_segments(struct request_queue *q, - struct bio *bio, - bool no_sg_merge) + struct bio *bio) { struct bio_vec bv, bvprv = { NULL }; int prev = 0; @@ -385,13 +384,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, nr_phys_segs = 0; for_each_bio(bio) { bio_for_each_bvec(bv, bio, iter) { - /* - * If SG merging is disabled, each bio vector is - * a segment - */ - if (no_sg_merge) - goto new_segment; - if (prev) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) @@ -421,27 +413,16 @@ new_segment: void blk_recalc_rq_segments(struct request *rq) { - bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE, - &rq->q->queue_flags); - - rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio, - no_sg_merge); + rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio); } void blk_recount_segments(struct request_queue *q, struct bio *bio) { - unsigned short seg_cnt = bio_segments(bio); - - if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) && - (seg_cnt < queue_max_segments(q))) - bio->bi_phys_segments = seg_cnt; - else { - struct bio *nxt = bio->bi_next; + struct bio *nxt = bio->bi_next; - bio->bi_next = NULL; - bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false); - bio->bi_next = nxt; - } + bio->bi_next = NULL; + bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); + bio->bi_next = nxt; bio_set_flag(bio, BIO_SEG_VALID); } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index c782e81db627..697d6213c82b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -128,7 +128,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(DEAD), QUEUE_FLAG_NAME(INIT_DONE), - QUEUE_FLAG_NAME(NO_SG_MERGE), QUEUE_FLAG_NAME(POLL), QUEUE_FLAG_NAME(WC), QUEUE_FLAG_NAME(FUA), diff --git a/block/blk-mq.c b/block/blk-mq.c index 44d471ff8754..fa508ee31742 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2837,9 +2837,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, set->map[HCTX_TYPE_POLL].nr_queues) blk_queue_flag_set(QUEUE_FLAG_POLL, q); - if (!(set->flags & BLK_MQ_F_SG_MERGE)) - blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q); - q->sg_reserved_size = INT_MAX; INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 4b1be754cc41..ba9481f1bf3c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1698,14 +1698,6 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, return q && !blk_queue_add_random(q); } -static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); -} - static bool dm_table_all_devices_attribute(struct dm_table *t, iterate_devices_callout_fn func) 
{ @@ -1902,11 +1894,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_write_zeroes(t)) q->limits.max_write_zeroes_sectors = 0; - if (dm_table_all_devices_attribute(t, queue_supports_sg_merge)) - blk_queue_flag_clear(QUEUE_FLAG_NO_SG_MERGE, q); - else - blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q); - dm_table_verify_integrity(t); /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b6292d469ea4..faed9d9eb84c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -588,7 +588,6 @@ struct request_queue { #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_DEAD 13 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ -#define QUEUE_FLAG_NO_SG_MERGE 15 /* don't attempt to merge SG segments*/ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ #define QUEUE_FLAG_WC 17 /* Write back caching */ #define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ -- cgit v1.2.3-59-g8ed1b From 56d18f62f556b80105e38e7975975cf7465aae3e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Feb 2019 19:13:24 +0800 Subject: block: kill BLK_MQ_F_SG_MERGE QUEUE_FLAG_NO_SG_MERGE has been killed, so kill BLK_MQ_F_SG_MERGE too. Reviewed-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - drivers/block/loop.c | 2 +- drivers/block/nbd.c | 2 +- drivers/block/rbd.c | 2 +- drivers/block/skd_main.c | 1 - drivers/block/xen-blkfront.c | 2 +- drivers/md/dm-rq.c | 2 +- drivers/mmc/core/queue.c | 3 +-- drivers/scsi/scsi_lib.c | 2 +- include/linux/blk-mq.h | 1 - 10 files changed, 7 insertions(+), 11 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 697d6213c82b..c39247c5ddb6 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -249,7 +249,6 @@ static const char *const alloc_policy_name[] = { static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(SHOULD_MERGE), HCTX_FLAG_NAME(TAG_SHARED), - HCTX_FLAG_NAME(SG_MERGE), HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(NO_SCHED), }; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 8ef583197414..3d63ad036398 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1937,7 +1937,7 @@ static int loop_add(struct loop_device **l, int i) lo->tag_set.queue_depth = 128; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 7c9a949e876b..32a7ba1674b7 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1571,7 +1571,7 @@ static int nbd_dev_add(int index) nbd->tag_set.numa_node = NUMA_NO_NODE; nbd->tag_set.cmd_size = sizeof(struct nbd_cmd); nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | - BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING; + BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; err = blk_mq_alloc_tag_set(&nbd->tag_set); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1e92b61d0bd5..abe9e1c89227 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3988,7 +3988,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->tag_set.ops = &rbd_mq_ops; rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; rbd_dev->tag_set.numa_node = NUMA_NO_NODE; - rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE 
| BLK_MQ_F_SG_MERGE; + rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; rbd_dev->tag_set.nr_hw_queues = 1; rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index ab893a7571a2..7d3ad6c22ee5 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2843,7 +2843,6 @@ static int skd_cons_disk(struct skd_device *skdev) skdev->sgs_per_request * sizeof(struct scatterlist); skdev->tag_set.numa_node = NUMA_NO_NODE; skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | - BLK_MQ_F_SG_MERGE | BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO); skdev->tag_set.driver_data = skdev; rc = blk_mq_alloc_tag_set(&skdev->tag_set); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 0ed4b200fa58..d43a5677ccbc 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -977,7 +977,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, } else info->tag_set.queue_depth = BLK_RING_SIZE(info); info->tag_set.numa_node = NUMA_NO_NODE; - info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; info->tag_set.cmd_size = sizeof(struct blkif_req); info->tag_set.driver_data = info; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 4eb5f8c56535..b2f8eb2365ee 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -527,7 +527,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) md->tag_set->ops = &dm_mq_ops; md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); md->tag_set->numa_node = md->numa_node_id; - md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE; md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); md->tag_set->driver_data = md; diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 35cc138b096d..cc19e71c71d4 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -410,8 +410,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card) else mq->tag_set.queue_depth = MMC_QUEUE_DEPTH; mq->tag_set.numa_node = NUMA_NO_NODE; - mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE | - BLK_MQ_F_BLOCKING; + mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; mq->tag_set.nr_hw_queues = 1; mq->tag_set.cmd_size = sizeof(struct mmc_queue_req); mq->tag_set.driver_data = mq; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 6d65ac584eba..6cadbe945bdb 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1899,7 +1899,7 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost) shost->tag_set.queue_depth = shost->can_queue; shost->tag_set.cmd_size = cmd_size; shost->tag_set.numa_node = NUMA_NO_NODE; - shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; shost->tag_set.flags |= BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy); shost->tag_set.driver_data = shost; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0e030f5f76b6..b0c814bcc7e3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -218,7 +218,6 @@ struct blk_mq_ops { enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, - BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, -- cgit v1.2.3-59-g8ed1b From 49b1f22b567ba1d7d8174950be4398a69d0effb7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 19 Feb 2019 
11:08:19 +0800 Subject: block: avoid to READ fields of null bio rq->bio can be NULL sometimes, such as for a flush request, so don't read bio->bi_seg_front_size until this 'bio' has been checked as valid. Cc: Bart Van Assche Reported-by: Bart Van Assche Fixes: dcebd755926b0f39dd1e ("block: use bio_for_each_bvec() to compute multi-page bvec count") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index bed065904677..066b66430523 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -363,13 +363,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio_vec bv, bvprv = { NULL }; int prev = 0; unsigned int seg_size, nr_phys_segs; - unsigned front_seg_size = bio->bi_seg_front_size; + unsigned front_seg_size; struct bio *fbio, *bbio; struct bvec_iter iter; if (!bio) return 0; + front_seg_size = bio->bi_seg_front_size; + switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: -- cgit v1.2.3-59-g8ed1b From 75c10e73272484bc3a940a9c8e4ec39a7a1b8c21 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 Feb 2019 11:43:26 +0100 Subject: nvme-multipath: round-robin I/O policy Implement a simple round-robin I/O policy for multipathing. Path selection is done in two rounds: first iterating across all optimized paths, and, if that doesn't return any valid paths, then iterating over all optimized and non-optimized paths. If no paths are found, use the existing algorithm. Also add a sysfs attribute 'iopolicy' to switch between the current NUMA-aware I/O policy and the 'round-robin' I/O policy. Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 +++ drivers/nvme/host/multipath.c | 86 ++++++++++++++++++++++++++++++++++++++++++- drivers/nvme/host/nvme.h | 9 +++++ 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 34758cca7836..cba58d995b30 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2328,6 +2328,9 @@ static struct attribute *nvme_subsys_attrs[] = { &subsys_attr_serial.attr, &subsys_attr_firmware_rev.attr, &subsys_attr_subsysnqn.attr, +#ifdef CONFIG_NVME_MULTIPATH + &subsys_attr_iopolicy.attr, +#endif NULL, }; @@ -2380,6 +2383,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); subsys->vendor_id = le16_to_cpu(id->vid); subsys->cmic = id->cmic; +#ifdef CONFIG_NVME_MULTIPATH + subsys->iopolicy = NVME_IOPOLICY_NUMA; +#endif subsys->dev.class = nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index b9fff3b8ed1b..1f7fe1bd2936 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -141,7 +141,10 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) test_bit(NVME_NS_ANA_PENDING, &ns->flags)) continue; - distance = node_distance(node, ns->ctrl->numa_node); + if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) + distance = node_distance(node, ns->ctrl->numa_node); + else + distance = LOCAL_DISTANCE; switch (ns->ana_state) { case NVME_ANA_OPTIMIZED: @@ -168,6 +171,47 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) return found; } +static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, + struct nvme_ns *ns) +{ + ns =
list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns, + siblings); + if (ns) + return ns; + return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings); +} + +static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, + int node, struct nvme_ns *old) +{ + struct nvme_ns *ns, *found, *fallback = NULL; + + if (list_is_singular(&head->list)) + return old; + + for (ns = nvme_next_ns(head, old); + ns != old; + ns = nvme_next_ns(head, ns)) { + if (ns->ctrl->state != NVME_CTRL_LIVE || + test_bit(NVME_NS_ANA_PENDING, &ns->flags)) + continue; + + if (ns->ana_state == NVME_ANA_OPTIMIZED) { + found = ns; + goto out; + } + if (ns->ana_state == NVME_ANA_NONOPTIMIZED) + fallback = ns; + } + + if (!fallback) + return NULL; + found = fallback; +out: + rcu_assign_pointer(head->current_path[node], found); + return found; +} + static inline bool nvme_path_is_optimized(struct nvme_ns *ns) { return ns->ctrl->state == NVME_CTRL_LIVE && @@ -180,6 +224,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) struct nvme_ns *ns; ns = srcu_dereference(head->current_path[node], &head->srcu); + if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns) + ns = nvme_round_robin_path(head, node, ns); if (unlikely(!ns || !nvme_path_is_optimized(ns))) ns = __nvme_find_path(head, node); return ns; @@ -471,6 +517,44 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl) cancel_work_sync(&ctrl->ana_work); } +#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \ + struct device_attribute subsys_attr_##_name = \ + __ATTR(_name, _mode, _show, _store) + +static const char *nvme_iopolicy_names[] = { + [NVME_IOPOLICY_NUMA] = "numa", + [NVME_IOPOLICY_RR] = "round-robin", +}; + +static ssize_t nvme_subsys_iopolicy_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + return sprintf(buf, "%s\n", + nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); +} + +static ssize_t nvme_subsys_iopolicy_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + int i; + + for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { + if (sysfs_streq(buf, nvme_iopolicy_names[i])) { + WRITE_ONCE(subsys->iopolicy, i); + return count; + } + } + + return -EINVAL; +} +SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, + nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store); + static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 23db2d99b53a..8c646ab26677 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -252,6 +252,11 @@ struct nvme_ctrl { unsigned long discard_page_busy; }; +enum nvme_iopolicy { + NVME_IOPOLICY_NUMA, + NVME_IOPOLICY_RR, +}; + struct nvme_subsystem { int instance; struct device dev; @@ -271,6 +276,9 @@ struct nvme_subsystem { u8 cmic; u16 vendor_id; struct ida ns_ida; +#ifdef CONFIG_NVME_MULTIPATH + enum nvme_iopolicy iopolicy; +#endif }; /* @@ -491,6 +499,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; +extern struct device_attribute subsys_attr_iopolicy; #else static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) -- cgit v1.2.3-59-g8ed1b From 5bc373ff254002b5ce1b7e7f37f8ed7074e45d2a Mon Sep 17 00:00:00 2001 From: Bart 
Van Assche Date: Thu, 14 Feb 2019 14:50:52 -0800 Subject: nvmet: fix indentation This patch avoids that smatch complains about inconsistent indentation. Fixes: a07b4970f464 ("nvmet: add a generic NVMe target") # v4.10 Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/discovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index d2cb71a0b419..a34cf4986a49 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -331,7 +331,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) cmd->get_log_page.lid); req->error_loc = offsetof(struct nvme_get_log_page_command, lid); - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } case nvme_admin_identify: req->data_len = NVME_IDENTIFY_DATA_SIZE; -- cgit v1.2.3-59-g8ed1b From a467fc55fc509e034e3f839dc1f6ceed74462da9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Feb 2019 14:50:53 -0800 Subject: nvme-fabrics: document the poll function argument This patch avoids that the kernel-doc tool reports a warning when building with W=1. Fixes: 26c682274e0a ("nvme-fabrics: allow nvmf_connect_io_queue to poll") # v5.0-rc1 Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 3eb908c50e1a..70c09abcfcbf 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -430,6 +430,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue); * @qid: NVMe I/O queue number for the new I/O connection between * host and target (note qid == 0 is illegal as this is * the Admin queue, per NVMe standard). + * @poll: Whether or not to poll for the completion of the connect cmd. * * This function issues a fabrics-protocol connection * of a NVMe I/O queue (via NVMe Fabrics "Connect" command) -- cgit v1.2.3-59-g8ed1b From e895fedf12dc0663a925b54eb0961fc927208097 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Feb 2019 14:50:54 -0800 Subject: nvme-pci: check kstrtoint() return value in queue_count_set() This patch avoids that the compiler complains about 'ret' being set but not being used when building with W=1. Fixes: 3b6592f70ad7 ("nvme: utilize two queue maps, one for reads and one for writes") # v5.0-rc1 Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 022ea1ee63f8..84ed1bbce86b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -157,6 +157,8 @@ static int queue_count_set(const char *val, const struct kernel_param *kp) int n = 0, ret; ret = kstrtoint(val, 10, &n); + if (ret) + return ret; if (n > num_possible_cpus()) n = num_possible_cpus(); -- cgit v1.2.3-59-g8ed1b From d84c4b024ac36c778fd6020fb8560447365a5478 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Feb 2019 14:50:55 -0800 Subject: nvme: unexport nvme_delete_ctrl_sync() Since nvme_delete_ctrl_sync() is not called from any other kernel module, unexport it. 
Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 +-- drivers/nvme/host/nvme.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cba58d995b30..2c7a7a4f532f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -177,7 +177,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_delete_ctrl); -int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) +static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) { int ret = 0; @@ -192,7 +192,6 @@ int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) nvme_put_ctrl(ctrl); return ret; } -EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync); static inline bool nvme_ns_has_pi(struct nvme_ns *ns) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8c646ab26677..1c5878f886c6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -466,7 +466,6 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); -int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, void *log, size_t size, u64 offset); -- cgit v1.2.3-59-g8ed1b From a686ed75c0fb1ee2b87920aedc2027491da9fe6d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Feb 2019 14:50:56 -0800 Subject: nvme: introduce a helper function for controller deletion This patch does not change any functionality but makes the next patch in this series easier to read. Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2c7a7a4f532f..9ec88253ebcd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -151,11 +151,8 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); -static void nvme_delete_ctrl_work(struct work_struct *work) +static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) { - struct nvme_ctrl *ctrl = - container_of(work, struct nvme_ctrl, delete_work); - dev_info(ctrl->device, "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn); @@ -167,6 +164,14 @@ static void nvme_delete_ctrl_work(struct work_struct *work) nvme_put_ctrl(ctrl); } +static void nvme_delete_ctrl_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, delete_work); + + nvme_do_delete_ctrl(ctrl); +} + int nvme_delete_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) -- cgit v1.2.3-59-g8ed1b From b9c77583b0a242e02615b3c295d613e7fe4df415 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Feb 2019 14:50:57 -0800 Subject: nvme: avoid that deleting a controller triggers a circular locking complaint Rework nvme_delete_ctrl_sync() such that it does not have to wait for queued work. 
This patch avoids that test nvme/008 triggers the following complaint: WARNING: possible circular locking dependency detected 5.0.0-rc6-dbg+ #10 Not tainted ------------------------------------------------------ nvme/7918 is trying to acquire lock: 000000009a1a7b69 ((work_completion)(&ctrl->delete_work)){+.+.}, at: __flush_work+0x379/0x410 but task is already holding lock: 00000000ef5a45b4 (kn->count#389){++++}, at: kernfs_remove_self+0x196/0x210 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (kn->count#389){++++}: lock_acquire+0xc5/0x1e0 __kernfs_remove+0x42a/0x4a0 kernfs_remove_by_name_ns+0x45/0x90 remove_files.isra.1+0x3a/0x90 sysfs_remove_group+0x5c/0xc0 sysfs_remove_groups+0x39/0x60 device_remove_attrs+0x68/0xb0 device_del+0x24d/0x570 cdev_device_del+0x1a/0x50 nvme_delete_ctrl_work+0xbd/0xe0 process_one_work+0x4f1/0xa40 worker_thread+0x67/0x5b0 kthread+0x1cf/0x1f0 ret_from_fork+0x24/0x30 -> #0 ((work_completion)(&ctrl->delete_work)){+.+.}: __lock_acquire+0x1323/0x17b0 lock_acquire+0xc5/0x1e0 __flush_work+0x399/0x410 flush_work+0x10/0x20 nvme_delete_ctrl_sync+0x65/0x70 nvme_sysfs_delete+0x4f/0x60 dev_attr_store+0x3e/0x50 sysfs_kf_write+0x87/0xa0 kernfs_fop_write+0x186/0x240 __vfs_write+0xd7/0x430 vfs_write+0xfa/0x260 ksys_write+0xab/0x130 __x64_sys_write+0x43/0x50 do_syscall_64+0x71/0x210 entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(kn->count#389); lock((work_completion)(&ctrl->delete_work)); lock(kn->count#389); lock((work_completion)(&ctrl->delete_work)); *** DEADLOCK *** 3 locks held by nvme/7918: #0: 00000000e2223b44 (sb_writers#6){.+.+}, at: vfs_write+0x1eb/0x260 #1: 000000003404976f (&of->mutex){+.+.}, at: kernfs_fop_write+0x128/0x240 #2: 00000000ef5a45b4 (kn->count#389){++++}, at: kernfs_remove_self+0x196/0x210 stack backtrace: CPU: 4 PID: 7918 Comm: nvme Not tainted 5.0.0-rc6-dbg+ #10 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 Call Trace: dump_stack+0x86/0xca print_circular_bug.isra.36.cold.54+0x173/0x1d5 check_prev_add.constprop.45+0x996/0x1110 __lock_acquire+0x1323/0x17b0 lock_acquire+0xc5/0x1e0 __flush_work+0x399/0x410 flush_work+0x10/0x20 nvme_delete_ctrl_sync+0x65/0x70 nvme_sysfs_delete+0x4f/0x60 dev_attr_store+0x3e/0x50 sysfs_kf_write+0x87/0xa0 kernfs_fop_write+0x186/0x240 __vfs_write+0xd7/0x430 vfs_write+0xfa/0x260 ksys_write+0xab/0x130 __x64_sys_write+0x43/0x50 do_syscall_64+0x71/0x210 entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Bart Van Assche Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9ec88253ebcd..170ae6c66423 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -191,9 +191,10 @@ static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) * can free the controller. */ nvme_get_ctrl(ctrl); - ret = nvme_delete_ctrl(ctrl); + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) + ret = -EBUSY; if (!ret) - flush_work(&ctrl->delete_work); + nvme_do_delete_ctrl(ctrl); nvme_put_ctrl(ctrl); return ret; } -- cgit v1.2.3-59-g8ed1b From ab4ab09cbd8a417c1530196a9f72797ca9af0258 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 19 Feb 2019 13:13:57 +0100 Subject: nvme: return error from nvme_alloc_ns() nvme_alloc_ns() might fail, so we should be returning an error code. 
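The conversion follows the usual kernel goto-unwind shape: record an error code at the point of failure, then jump to a label that releases everything acquired so far, unwinding in reverse order of acquisition. A minimal sketch of that shape, with hypothetical helpers rather than the actual driver code:

static int alloc_two_resources(struct foo **out)
{
        struct foo *f;
        int ret;

        f = kzalloc(sizeof(*f), GFP_KERNEL);
        if (!f)
                return -ENOMEM;         /* nothing to unwind yet */

        f->q = create_queue();          /* hypothetical helper */
        if (IS_ERR(f->q)) {
                ret = PTR_ERR(f->q);    /* preserve the real error */
                goto out_free_f;
        }

        ret = register_thing(f);        /* hypothetical helper */
        if (ret)
                goto out_destroy_q;

        *out = f;
        return 0;

out_destroy_q:
        destroy_queue(f->q);            /* unwind in reverse order */
out_free_f:
        kfree(f);
        return ret;
}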
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 170ae6c66423..127abc12489d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3222,21 +3222,23 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) return 0; } -static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; + int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) - return; + return -ENOMEM; ns->queue = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ns->queue)) + if (IS_ERR(ns->queue)) { + ret = PTR_ERR(ns->queue); goto out_free_ns; + } blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) @@ -3252,20 +3254,27 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_set_queue_limits(ctrl, ns->queue); id = nvme_identify_ns(ctrl, nsid); - if (!id) + if (!id) { + ret = -EIO; goto out_free_queue; + } - if (id->ncap == 0) + if (id->ncap == 0) { + ret = -EINVAL; goto out_free_id; + } - if (nvme_init_ns_head(ns, nsid, id)) + ret = nvme_init_ns_head(ns, nsid, id); + if (ret) goto out_free_id; nvme_setup_streams_ns(ctrl, ns); nvme_set_disk_name(disk_name, ns, ctrl, &flags); disk = alloc_disk_node(0, node); - if (!disk) + if (!disk) { + ret = -ENOMEM; goto out_unlink_ns; + } disk->fops = &nvme_fops; disk->private_data = ns; @@ -3277,7 +3286,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) __nvme_revalidate_disk(disk, id); if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - if (nvme_nvm_register(ns, disk_name, node)) { + ret = nvme_nvm_register(ns, disk_name, node); + if (ret) { dev_warn(ctrl->device, "LightNVM init failure\n"); goto out_put_disk; } @@ -3295,7 +3305,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_fault_inject_init(ns); kfree(id); - return; + return 0; out_put_disk: put_disk(ns->disk); out_unlink_ns: @@ -3308,6 +3318,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); + return ret; } static void nvme_ns_remove(struct nvme_ns *ns) -- cgit v1.2.3-59-g8ed1b From fadccd8fc2d06cf7fd222245d7e04b00fae946cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:37:13 +0100 Subject: nvme_ioctl.h: remove duplicate GPL boilerplate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already have an SPDX header, so no need to duplicate the information. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- include/linux/nvme.h | 10 +--------- include/uapi/linux/nvme_ioctl.h | 9 --------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bbcc83886899..baa49e6a23cc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1,15 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for the NVM Express interface * Copyright (c) 2011-2014, Intel Corporation.
- * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_NVME_H diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index 6e74b1eaf541..1c215ea1798e 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -2,15 +2,6 @@ /* * Definitions for the NVM Express ioctl interface * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _UAPI_LINUX_NVME_IOCTL_H -- cgit v1.2.3-59-g8ed1b From 055d045a7aaeef326f8ab6845519da3157887830 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:37:42 +0100 Subject: nvme-tcp.h: fix SPDX header For .h files we need to use /* */ style comments. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- include/linux/nvme-tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h index 03d87c0550a9..959e0bd9a913 100644 --- a/include/linux/nvme-tcp.h +++ b/include/linux/nvme-tcp.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * NVMe over Fabrics TCP protocol header. * Copyright (c) 2018 Lightbits Labs. All rights reserved. -- cgit v1.2.3-59-g8ed1b From 9002c4e5ff006c62de09fe2b6966403bdf96afa1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:31:03 +0100 Subject: nvme-fabrics: convert to SPDX identifiers Update license to use SPDX-License-Identifier instead of verbose license text. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/fabrics.c | 10 +--------- drivers/nvme/host/fabrics.h | 10 +--------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 70c09abcfcbf..d4cb826f58ff 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVMe over Fabrics common host code. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 478343b73e38..3044d8b99a24 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -1,15 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * NVMe over Fabrics common host code. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _NVME_FABRICS_H #define _NVME_FABRICS_H 1 -- cgit v1.2.3-59-g8ed1b From 8638b2461475ad4c35a957156ecf2425b9b82e85 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:33:28 +0100 Subject: nvme-fc: convert to SPDX identifiers Update license to use SPDX-License-Identifier instead of verbose license text. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 14 +------------- include/linux/nvme-fc-driver.h | 10 +--------- include/linux/nvme-fc.h | 14 +------------- 3 files changed, 3 insertions(+), 35 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 89accc76d71c..b29b12498a1a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016 Avago Technologies. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful. - * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, - * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A - * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO - * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID. - * See the GNU General Public License for more details, a copy of which - * can be found in the file COPYING included with this package - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 91745cc3704c..2bb349035431 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2016, Avago Technologies - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _NVME_FC_DRIVER_H diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index 36cca93a5ff2..067c9fea64fe 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -1,18 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2016 Avago Technologies. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful. - * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, - * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A - * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO - * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID. - * See the GNU General Public License for more details, a copy of which - * can be found in the file COPYING included with this package - * */ /* -- cgit v1.2.3-59-g8ed1b From 5d8762d5684ab997c7ccf2457c8beec7ef972ceb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:34:21 +0100 Subject: nvme-rdma: convert to SPDX identifiers Update license to use SPDX-License-Identifier instead of verbose license text. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 10 +--------- include/linux/nvme-rdma.h | 10 +--------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ac365366c2ec..7c0d29185249 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVMe over Fabrics RDMA host code. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index a72fd04aa5e1..3aa97b98dc89 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2015 Mellanox Technologies. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_NVME_RDMA_H -- cgit v1.2.3-59-g8ed1b From 115aa7abd7463006c4c22082c7722bae1b475e63 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:34:50 +0100 Subject: nvme-lightnvm: convert to SPDX identifiers Update license to use SPDX-License-Identifier instead of verbose license text. 
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/lightnvm.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index b759c25c89c8..949e29e1d782 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -1,23 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * nvme-lightnvm.c - LightNVM NVMe device * * Copyright (C) 2014-2015 IT University of Copenhagen * Initial release: Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. - * */ #include "nvme.h" -- cgit v1.2.3-59-g8ed1b From 5f37396dffb89dcf4aceaf5dbdf4f133695d5afb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:36:08 +0100 Subject: nvme-pci: convert to SPDX identifiers Update license to use SPDX-License-Identifier instead of verbose license text. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 84ed1bbce86b..f54718b63637 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVM Express device driver * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include -- cgit v1.2.3-59-g8ed1b From bc50ad7501dd3629af9aa423ed0d1eae0061bcf1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:36:29 +0100 Subject: nvme: convert to SPDX identifiers Update license to use SPDX-License-Identifier instead of verbose license text. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/core.c | 10 +--------- drivers/nvme/host/fault_inject.c | 2 +- drivers/nvme/host/multipath.c | 10 +--------- drivers/nvme/host/nvme.h | 10 +--------- drivers/nvme/host/trace.c | 10 +--------- drivers/nvme/host/trace.h | 10 +--------- 6 files changed, 6 insertions(+), 46 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 127abc12489d..07bf2bff3a76 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVM Express device driver * Copyright (c) 2011-2014, Intel Corporation. 
- * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c index 02632266ac06..4cfd2c9222d4 100644 --- a/drivers/nvme/host/fault_inject.c +++ b/drivers/nvme/host/fault_inject.c @@ -1,8 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fault injection support for nvme. * * Copyright (c) 2018, Oracle and/or its affiliates - * */ #include diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1f7fe1bd2936..2839bb70badf 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2017-2018 Christoph Hellwig. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1c5878f886c6..b91f1838bbd5 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _NVME_H diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 5566dda3237a..58456de78bb2 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVM Express device driver tracepoints * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
*/ #include diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 3564120aa7b3..244d7c177e5a 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -1,15 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * NVM Express device driver tracepoints * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #undef TRACE_SYSTEM -- cgit v1.2.3-59-g8ed1b From 4f80fc77fc14d0d1da28573f5116aded2932f5ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 11:35:04 +0100 Subject: nvmet-fc: convert to SPDX identifiers Update license to use SPDX-License-Identifier instead of verbose license text. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/target/fc.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index f98f5c5bea26..1e9654f04c60 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016 Avago Technologies. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful. - * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, - * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A - * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO - * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID. - * See the GNU General Public License for more details, a copy of which - * can be found in the file COPYING included with this package - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -- cgit v1.2.3-59-g8ed1b From a4b74fcc2982dc82de1086fda0e66555d8425592 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 11:35:19 +0100 Subject: nvmet-fcloop: convert to SPDX identifiers Update license to use SPDX-License-Identifier instead of verbose license text. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/target/fcloop.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 291f4121f516..381b5a90c48b 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1,17 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016 Avago Technologies. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful. 
- * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, - * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A - * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO - * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID. - * See the GNU General Public License for more details, a copy of which - * can be found in the file COPYING included with this package */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -- cgit v1.2.3-59-g8ed1b From d0ad69043d730d38b445f81b1ffefccc2c497ff0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 11:35:42 +0100 Subject: nvme-loop: convert to SPDX identifiers Update license to use SPDX-License-Identifier instead of verbose license text. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/target/loop.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 4aac1b4a8112..b9f623ab01f3 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVMe over Fabrics loopback device. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -- cgit v1.2.3-59-g8ed1b From 3641bd323fb145710b3ee4c522670c77410984a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 11:35:54 +0100 Subject: nvmet-rdma: convert to SPDX identifiers Update license to use SPDX-License-Identifier instead of verbose license text. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/target/rdma.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index a884e3a0e8af..ef893addf341 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVMe over Fabrics RDMA target. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -- cgit v1.2.3-59-g8ed1b From 77141dc6ceffdbf4dd3470911c838309e23cd55c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 11:36:11 +0100 Subject: nvmet: convert to SPDX identifiers Update license to use SPDX-License-Identifier instead of verbose license text. 
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/target/admin-cmd.c | 10 +--------- drivers/nvme/target/configfs.c | 10 +--------- drivers/nvme/target/core.c | 10 +--------- drivers/nvme/target/discovery.c | 10 +--------- drivers/nvme/target/fabrics-cmd.c | 10 +--------- drivers/nvme/target/io-cmd-bdev.c | 10 +--------- drivers/nvme/target/nvmet.h | 10 +--------- 7 files changed, 7 insertions(+), 63 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 11baeb14c388..76250181fee0 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVMe admin command implementation. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 618bbd006544..adb79545cdd7 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Configfs interface for the NVMe target. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 88d260f31835..d44ede147263 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Common code for the NVMe target. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index a34cf4986a49..c872b47a88f3 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Discovery service for the NVMe over Fabrics target. * Copyright (C) 2016 Intel Corporation. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 6cf1fd9eb32e..3a76ebc3d155 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVMe Fabrics command implementation. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index b6d030d3259f..71dfedbadc26 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVMe I/O command implementation. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 3e4719fdba85..51e49efd7849 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _NVMET_H -- cgit v1.2.3-59-g8ed1b From 34e08191b146a363d72be1a488d55ecdc01a5c7d Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 20 Feb 2019 20:13:34 -0800 Subject: nvme-rdma: use nr_phys_segments when map rq to sgl Use blk_rq_nr_phys_segments() instead of blk_rq_payload_bytes() to check if a command contains data to be mapped. This fixes the case where a struct request contains LBAs, but it has no payload, such as Write Zeroes support. 
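The distinction matters because blk_rq_payload_bytes() reports how many bytes the command affects, which is non-zero for Write Zeroes even though no data pages are attached, while blk_rq_nr_phys_segments() reports how many segments are actually available for scatterlist mapping. A minimal sketch of the intended check, using the real blk-mq accessors but a hypothetical wrapper name:

/*
 * Decide whether a request carries a data buffer that must be mapped.
 * A Write Zeroes command has a non-zero payload size but zero physical
 * segments, so the segment count is the correct test before building
 * an sgl.
 */
static bool rq_has_mappable_data(struct request *rq)
{
        return blk_rq_nr_phys_segments(rq) != 0;
}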
Fixes: 6e02318eaea5 ("nvme: add support for the Write Zeroes command") Reported-by: Ming Lei Signed-off-by: Chaitanya Kulkarni Tested-by: Ming Lei Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 7c0d29185249..11a5ecae78c8 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1142,7 +1142,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; - if (!blk_rq_payload_bytes(rq)) + if (!blk_rq_nr_phys_segments(rq)) return; if (req->mr) { @@ -1265,7 +1265,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, c->common.flags |= NVME_CMD_SGL_METABUF; - if (!blk_rq_payload_bytes(rq)) + if (!blk_rq_nr_phys_segments(rq)) return nvme_rdma_set_sg_null(c); req->sg_table.sgl = req->first_sgl; -- cgit v1.2.3-59-g8ed1b From 8f4e80da764ec1ca44c83f3e17dbc9bf0209bccc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 21 Feb 2019 23:43:36 +0800 Subject: block: bounce: make sure that bvec table is updated Block bounce needs to allocate new page for doing IO, and the new page has to be updated to bvec table. Commit 6dc4f100c switches __blk_queue_bounce() to use the new bio_for_each_segment_all() interface. Unfortunately the new bio_for_each_segment_all() can't be used to update bvec table. This patch fixes this issue by retrieving bvec from the table directly, then the new allocated page can be updated to the bio. This way is safe because the cloned bio is single page bvec. Fixes: 6dc4f100c ("block: allow bio_for_each_segment_all() to iterate over multi-page bvec") Cc: Christoph Hellwig Cc: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bounce.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/bounce.c b/block/bounce.c index add085e28b1d..47eb7e936e22 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -295,7 +295,6 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, bool bounce = false; int sectors = 0; bool passthrough = bio_is_passthrough(*bio_orig); - struct bvec_iter_all iter_all; bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_PAGES) @@ -315,7 +314,12 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : &bounce_bio_set); - bio_for_each_segment_all(to, bio, i, iter_all) { + /* + * Bvec table can't be updated by bio_for_each_segment_all(), + * so retrieve bvec from the table directly. This way is safe + * because the 'bio' is single-page bvec. + */ + for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { struct page *page = to->bv_page; if (page_to_pfn(page) <= q->limits.bounce_pfn) -- cgit v1.2.3-59-g8ed1b From 40853d6fc619a6fd3d3177c3973a2eac9b598a80 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 22 Feb 2019 22:10:19 +0800 Subject: loop: do not print warn message if partition scan is successful Do not print warn message when the partition scan returns 0. 
Fixes: d57f3374ba48 ("loop: Move special partition reread handling in loop_clr_fd()") Signed-off-by: Dongli Zhang Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 3d63ad036398..e25df5ea9792 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1115,8 +1115,9 @@ out_unlock: err = __blkdev_reread_part(bdev); else err = blkdev_reread_part(bdev); - pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", - __func__, lo_number, err); + if (err) + pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", + __func__, lo_number, err); /* Device is gone, no point in returning error */ err = 0; } -- cgit v1.2.3-59-g8ed1b From 758a58d0bc67457f1215321a536226654a830eeb Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 22 Feb 2019 22:10:20 +0800 Subject: loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part() Commit 0da03cab87e6 ("loop: Fix deadlock when calling blkdev_reread_part()") moves blkdev_reread_part() out of the loop_ctl_mutex. However, GENHD_FL_NO_PART_SCAN is set before __blkdev_reread_part(). As a result, __blkdev_reread_part() will fail the check of GENHD_FL_NO_PART_SCAN and will not rescan the loop device to delete all partitions. Below are steps to reproduce the issue: step1 # dd if=/dev/zero of=tmp.raw bs=1M count=100 step2 # losetup -P /dev/loop0 tmp.raw step3 # parted /dev/loop0 mklabel gpt step4 # parted -a none -s /dev/loop0 mkpart primary 64s 1 step5 # losetup -d /dev/loop0 Step5 will not be able to delete /dev/loop0p1 (introduced by step4) and there is below kernel warning message: [ 464.414043] __loop_clr_fd: partition scan of loop0 failed (rc=-22) This patch sets GENHD_FL_NO_PART_SCAN after blkdev_reread_part(). Fixes: 0da03cab87e6 ("loop: Fix deadlock when calling blkdev_reread_part()") Signed-off-by: Dongli Zhang Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e25df5ea9792..1e6edd568214 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1089,16 +1089,12 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); } mapping_set_gfp_mask(filp->f_mapping, gfp); - lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); blk_mq_unfreeze_queue(lo->lo_queue); partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; lo_number = lo->lo_number; - lo->lo_flags = 0; - if (!part_shift) - lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; loop_unprepare_queue(lo); out_unlock: mutex_unlock(&loop_ctl_mutex); @@ -1121,6 +1117,23 @@ out_unlock: /* Device is gone, no point in returning error */ err = 0; } + + /* + * lo->lo_state is set to Lo_unbound here after above partscan has + * finished. + * + * There cannot be anybody else entering __loop_clr_fd() as + * lo->lo_backing_file is already cleared and Lo_rundown state + * protects us from all the other places trying to change the 'lo' + * device. + */ + mutex_lock(&loop_ctl_mutex); + lo->lo_flags = 0; + if (!part_shift) + lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; + lo->lo_state = Lo_unbound; + mutex_unlock(&loop_ctl_mutex); + /* * Need not hold loop_ctl_mutex to fput backing file. 
* Calling fput holding loop_ctl_mutex triggers a circular -- cgit v1.2.3-59-g8ed1b From fb7e160019f4abb4082740bfeb27a38f6389c745 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Nov 2018 16:37:38 +0100 Subject: fs: add an iopoll method to struct file_operations This new method is used to explicitly poll for I/O completion for an iocb. It must be called for any iocb submitted asynchronously (that is with a non-null ki_complete) which has the IOCB_HIPRI flag set. The method is assisted by a new ki_cookie field in struct iocb to store the polling cookie. Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/filesystems/vfs.txt | 3 +++ include/linux/fs.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 8dc8e9c2913f..761c6fd24a53 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -857,6 +857,7 @@ struct file_operations { ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); + int (*iopoll)(struct kiocb *kiocb, bool spin); int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); @@ -902,6 +903,8 @@ otherwise noted. write_iter: possibly asynchronous write with iov_iter as source + iopoll: called when aio wants to poll for completions on HIPRI iocbs + iterate: called when the VFS needs to read the directory contents iterate_shared: called when the VFS needs to read the directory contents diff --git a/include/linux/fs.h b/include/linux/fs.h index 29d8e2cfed0e..dedcc2e9265c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -310,6 +310,7 @@ struct kiocb { int ki_flags; u16 ki_hint; u16 ki_ioprio; /* See linux/ioprio.h */ + unsigned int ki_cookie; /* for ->iopoll */ } __randomize_layout; static inline bool is_sync_kiocb(struct kiocb *kiocb) @@ -1787,6 +1788,7 @@ struct file_operations { ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); + int (*iopoll)(struct kiocb *kiocb, bool spin); int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); -- cgit v1.2.3-59-g8ed1b From eae83ce10b4713d9f4f3419af16436f89c1a7172 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Nov 2018 08:31:52 -0700 Subject: block: wire up block device iopoll method Just call blk_poll on the iocb cookie; we can derive the block device from the inode trivially.
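In sketch form, a submitter that issued a HIPRI iocb would call the new method in a loop until its completion has been reaped; this is a hypothetical caller for illustration, not the actual aio code, and it assumes *completed is set from the iocb's ->ki_complete callback:

static int spin_for_completion(struct kiocb *iocb, bool *completed)
{
        struct file *file = iocb->ki_filp;
        int ret;

        if (!(iocb->ki_flags & IOCB_HIPRI) || !file->f_op->iopoll)
                return -EOPNOTSUPP;

        while (!READ_ONCE(*completed)) {
                /* spin=true: poll hard until something completes */
                ret = file->f_op->iopoll(iocb, true);
                if (ret < 0)
                        return ret;
        }
        return 0;
}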
Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 7758adee6efe..1fe498b08f1b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -294,6 +294,14 @@ struct blkdev_dio { static struct bio_set blkdev_dio_pool; +static int blkdev_iopoll(struct kiocb *kiocb, bool wait) +{ + struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); + struct request_queue *q = bdev_get_queue(bdev); + + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); +} + static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; @@ -408,10 +416,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); if (!nr_pages) { - if (iocb->ki_flags & IOCB_HIPRI) + bool polled = false; + + if (iocb->ki_flags & IOCB_HIPRI) { bio->bi_opf |= REQ_HIPRI; + polled = true; + } qc = submit_bio(bio); + + if (polled) + WRITE_ONCE(iocb->ki_cookie, qc); break; } @@ -2078,6 +2093,7 @@ const struct file_operations def_blk_fops = { .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, + .iopoll = blkdev_iopoll, .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, -- cgit v1.2.3-59-g8ed1b From 0bbb280d7b767e7c86a5adfc87c76a6f09ab0423 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 21 Dec 2018 09:10:46 -0700 Subject: block: add bio_set_polled() helper For the upcoming async polled IO, we can't sleep allocating requests. If we do, then we introduce a deadlock where the submitter already has async polled IO in-flight, but can't wait for them to complete since polled requests must be actively found and reaped. Utilize the helper in the blockdev DIRECT_IO code. Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 ++-- include/linux/bio.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 1fe498b08f1b..e9faa52bb489 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -248,7 +248,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, task_io_account_write(ret); } if (iocb->ki_flags & IOCB_HIPRI) - bio.bi_opf |= REQ_HIPRI; + bio_set_polled(&bio, iocb); qc = submit_bio(&bio); for (;;) { @@ -419,7 +419,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bool polled = false; if (iocb->ki_flags & IOCB_HIPRI) { - bio->bi_opf |= REQ_HIPRI; + bio_set_polled(bio, iocb); polled = true; } diff --git a/include/linux/bio.h b/include/linux/bio.h index bdd11d4c2f05..bb6090aa165d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -826,5 +826,19 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, #endif /* CONFIG_BLK_DEV_INTEGRITY */ +/* + * Mark a bio as polled. Note that for async polled IO, the caller must + * expect -EWOULDBLOCK if we cannot allocate a request (or other resources). + * We cannot block waiting for requests on polled IO, as those completions + * must be found by the caller. This is different than IRQ driven IO, where + * it's safe to wait for IO to complete.
+ */ +static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) +{ + bio->bi_opf |= REQ_HIPRI; + if (!is_sync_kiocb(kiocb)) + bio->bi_opf |= REQ_NOWAIT; +} + #endif /* CONFIG_BLOCK */ #endif /* __LINUX_BIO_H */ -- cgit v1.2.3-59-g8ed1b From 81214bab582eeda068e7904d57b6a3095e8f3855 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 4 Dec 2018 11:12:08 -0700 Subject: iomap: wire up the iopoll method Store the request queue the last bio was submitted to in the iocb private data in addition to the cookie so that we find the right block device. Also refactor the common direct I/O bio submission code into a nice little helper. Signed-off-by: Christoph Hellwig Modified to use bio_set_polled(). Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/gfs2/file.c | 2 ++ fs/iomap.c | 43 ++++++++++++++++++++++++++++--------------- fs/xfs/xfs_file.c | 1 + include/linux/iomap.h | 1 + 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index a2dea5bc0427..58a768e59712 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1280,6 +1280,7 @@ const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, + .iopoll = iomap_dio_iopoll, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, @@ -1310,6 +1311,7 @@ const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, + .iopoll = iomap_dio_iopoll, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, diff --git a/fs/iomap.c b/fs/iomap.c index 6982d3d2bcc6..97cb9d486a7d 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1464,6 +1464,28 @@ struct iomap_dio { }; }; +int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) +{ + struct request_queue *q = READ_ONCE(kiocb->private); + + if (!q) + return 0; + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin); +} +EXPORT_SYMBOL_GPL(iomap_dio_iopoll); + +static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, + struct bio *bio) +{ + atomic_inc(&dio->ref); + + if (dio->iocb->ki_flags & IOCB_HIPRI) + bio_set_polled(bio, dio->iocb); + + dio->submit.last_queue = bdev_get_queue(iomap->bdev); + dio->submit.cookie = submit_bio(bio); +} + static ssize_t iomap_dio_complete(struct iomap_dio *dio) { struct kiocb *iocb = dio->iocb; @@ -1577,7 +1599,7 @@ static void iomap_dio_bio_end_io(struct bio *bio) } } -static blk_qc_t +static void iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, unsigned len) { @@ -1591,15 +1613,10 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - if (dio->iocb->ki_flags & IOCB_HIPRI) - flags |= REQ_HIPRI; - get_page(page); __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, flags); - - atomic_inc(&dio->ref); - return submit_bio(bio); + iomap_dio_submit_bio(dio, iomap, bio); } static loff_t @@ -1702,9 +1719,6 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio_set_pages_dirty(bio); } - if (dio->iocb->ki_flags & IOCB_HIPRI) - bio->bi_opf |= REQ_HIPRI; - iov_iter_advance(dio->submit.iter, n); dio->size += n; @@ -1712,11 +1726,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, copied += n; nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); - - atomic_inc(&dio->ref); - - dio->submit.last_queue = bdev_get_queue(iomap->bdev); - dio->submit.cookie = submit_bio(bio); + 
iomap_dio_submit_bio(dio, iomap, bio); } while (nr_pages); /* @@ -1927,6 +1937,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio->flags & IOMAP_DIO_WRITE_FUA) dio->flags &= ~IOMAP_DIO_NEED_SYNC; + WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie); + WRITE_ONCE(iocb->private, dio->submit.last_queue); + /* * We are about to drop our additional submission reference, which * might be the last reference to the dio. There are three three diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e47425071e65..60c2da41f0fc 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1203,6 +1203,7 @@ const struct file_operations xfs_file_operations = { .write_iter = xfs_file_write_iter, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .iopoll = iomap_dio_iopoll, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 9a4258154b25..0fefb5455bda 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -162,6 +162,7 @@ typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret, unsigned flags); ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, iomap_dio_end_io_t end_io); +int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); #ifdef CONFIG_SWAP struct file; -- cgit v1.2.3-59-g8ed1b From 4d633062c1c0794a6b3836b7b55afba4599736e8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Feb 2019 20:40:10 +0800 Subject: block: introduce bvec_nth_page() Single-page bvecs can often be seen in small BS workloads, so introduce bvec_nth_page() to avoid calling nth_page() unnecessarily, since nth_page() does not look cheap. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- include/linux/bvec.h | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 066b66430523..c7e8a8273460 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -483,7 +483,7 @@ static unsigned blk_bvec_map_sg(struct request_queue *q, offset = (total + bvec->bv_offset) % PAGE_SIZE; idx = (total + bvec->bv_offset) / PAGE_SIZE; - pg = nth_page(bvec->bv_page, idx); + pg = bvec_nth_page(bvec->bv_page, idx); sg_set_page(*sg, pg, seg_size, offset); diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 30a57b68d017..4376f683c08a 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -51,6 +51,11 @@ struct bvec_iter_all { unsigned done; }; +static inline struct page *bvec_nth_page(struct page *page, int idx) +{ + return idx == 0 ?
page : nth_page(page, idx); +} + /* * various member access, note that bio_data should of course not be used * on highmem page vectors */ @@ -87,8 +92,8 @@ struct bvec_iter_all { PAGE_SIZE - bvec_iter_offset((bvec), (iter))) #define bvec_iter_page(bvec, iter) \ - nth_page(mp_bvec_iter_page((bvec), (iter)), \ - mp_bvec_iter_page_idx((bvec), (iter))) + bvec_nth_page(mp_bvec_iter_page((bvec), (iter)), \ + mp_bvec_iter_page_idx((bvec), (iter))) #define bvec_iter_bvec(bvec, iter) \ ((struct bio_vec) { \ @@ -171,7 +176,7 @@ static inline void mp_bvec_last_segment(const struct bio_vec *bvec, unsigned total = bvec->bv_offset + bvec->bv_len; unsigned last_page = (total - 1) / PAGE_SIZE; - seg->bv_page = nth_page(bvec->bv_page, last_page); + seg->bv_page = bvec_nth_page(bvec->bv_page, last_page); /* the whole segment is inside the last page */ if (bvec->bv_offset >= last_page * PAGE_SIZE) { -- cgit v1.2.3-59-g8ed1b From 48d7727cae1209235700ed90f8f11426027b333b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Feb 2019 20:40:11 +0800 Subject: block: optimize __blk_segment_map_sg() for single-page bvec Introduce a fast path for single-page bvec IO so that blk_bvec_map_sg() can be avoided. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index c7e8a8273460..c1ad8abbd9d6 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -447,7 +447,7 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, return biovec_phys_mergeable(q, &end_bv, &nxt_bv); } -static struct scatterlist *blk_next_sg(struct scatterlist **sg, +static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) { if (!*sg) @@ -512,7 +512,12 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, (*sg)->length += nbytes; } else { new_segment: - (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg); + if (bvec->bv_offset + bvec->bv_len <= PAGE_SIZE) { + *sg = blk_next_sg(sg, sglist); + sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); + (*nsegs) += 1; + } else + (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg); } *bvprv = *bvec; } -- cgit v1.2.3-59-g8ed1b From bbcbbd567cc15823a6e9d4e2c5899ea3defa7b6d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Feb 2019 20:40:12 +0800 Subject: block: optimize blk_bio_segment_split for single-page bvec Introduce a fast path for single-page bvec IO so that we can avoid calling bvec_split_segs() unnecessarily.
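The fast-path condition in both of these patches is simply that the bvec does not cross a page boundary; as an illustrative helper (assumed name, not introduced by either patch):

	/*
	 * Illustrative only: a bvec occupies a single page exactly when
	 * its data, starting at bv_offset, still ends within that page.
	 */
	static inline bool bvec_fits_in_one_page(const struct bio_vec *bv)
	{
		return bv->bv_offset + bv->bv_len <= PAGE_SIZE;
	}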
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index c1ad8abbd9d6..9402a7c3ba22 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -286,10 +286,16 @@ new_segment: bvprv = bv; bvprvp = &bvprv; - if (bvec_split_segs(q, &bv, &nsegs, &seg_size, - &front_seg_size, &sectors)) + if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) { + nsegs++; + seg_size = bv.bv_len; + sectors += bv.bv_len >> 9; + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; + } else if (bvec_split_segs(q, &bv, &nsegs, &seg_size, + &front_seg_size, &sectors)) { goto split; - + } } do_split = false; -- cgit v1.2.3-59-g8ed1b From 594b9a89af8e7629e95a4cd844d188361be32790 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Feb 2019 20:40:13 +0800 Subject: block: introduce mp_bvec_for_each_page() for iterating over pages mp_bvec_for_each_segment() is a bit big for the iteration, so introduce a light-weight helper for iterating over pages, which saves 32 bytes of stack space. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bvec.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 4376f683c08a..87e82e503a52 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -188,4 +188,9 @@ static inline void mp_bvec_last_segment(const struct bio_vec *bvec, } } +#define mp_bvec_for_each_page(pg, bv, i) \ + for (i = (bv)->bv_offset / PAGE_SIZE; \ + (i <= (((bv)->bv_offset + (bv)->bv_len - 1) / PAGE_SIZE)) && \ + (pg = bvec_nth_page((bv)->bv_page, i)); i += 1) + #endif /* __LINUX_BVEC_ITER_H */ -- cgit v1.2.3-59-g8ed1b From 5b88a17cfdeba75e0092bab2c79aaf7d9e7db482 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Feb 2019 11:00:18 -0500 Subject: block: optimize bvec iteration in bvec_iter_advance There is no need to only iterate in chunks of PAGE_SIZE or less in bvec_iter_advance, given that the callers pass in the chunk length that they are operating on - either that already is less than PAGE_SIZE because they do classic page-based iteration, or it is larger because the caller operates on multi-page bvecs. This should help shave off a few cycles in the I/O hot path. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bvec.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 87e82e503a52..f6275c4da13a 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -112,14 +112,15 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, } while (bytes) { - unsigned iter_len = bvec_iter_len(bv, *iter); - unsigned len = min(bytes, iter_len); + const struct bio_vec *cur = bv + iter->bi_idx; + unsigned len = min3(bytes, iter->bi_size, + cur->bv_len - iter->bi_bvec_done); bytes -= len; iter->bi_size -= len; iter->bi_bvec_done += len; - if (iter->bi_bvec_done == __bvec_iter_bvec(bv, *iter)->bv_len) { + if (iter->bi_bvec_done == cur->bv_len) { iter->bi_bvec_done = 0; iter->bi_idx++; } -- cgit v1.2.3-59-g8ed1b From 7d76f8562f4c42a5515741375790843fe4b8df83 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Wed, 27 Feb 2019 21:35:01 +0800 Subject: blk-mq: use HCTX_TYPE_DEFAULT but not 0 to index blk_mq_tag_set->map Replace set->map[0] with set->map[HCTX_TYPE_DEFAULT] to avoid hardcoding.
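For reference, the named index comes from enum hctx_type in include/linux/blk-mq.h, where HCTX_TYPE_DEFAULT happens to be 0 today, which is why the bare index worked. An illustrative lookup helper (assumed name, sketch only):

	/*
	 * Sketch: resolve the hw queue index for a CPU through the named
	 * default map, so the code stays correct even if enum hctx_type
	 * is ever reordered or extended.
	 */
	static unsigned int cpu_to_default_hctx_idx(struct blk_mq_tag_set *set,
						    unsigned int cpu)
	{
		return set->map[HCTX_TYPE_DEFAULT].mq_map[cpu];
	}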
Signed-off-by: Dongli Zhang Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index fa508ee31742..fa024bce2b38 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2061,7 +2061,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags; int node; - node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); + node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -2117,7 +2117,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, size_t rq_size, left; int node; - node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); + node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -2416,7 +2416,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) * If the cpu isn't present, the cpu is mapped to first hctx. */ for_each_possible_cpu(i) { - hctx_idx = set->map[0].mq_map[i]; + hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && !__blk_mq_alloc_rq_map(set, hctx_idx)) { @@ -2426,7 +2426,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ - set->map[0].mq_map[i] = 0; + set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0; } ctx = per_cpu_ptr(q->queue_ctx, i); @@ -2733,7 +2733,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, int node; struct blk_mq_hw_ctx *hctx; - node = blk_mq_hw_queue_to_node(&set->map[0], i); + node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); /* * If the hw queue has been mapped to another numa node, * we need to realloc the hctx. If allocation fails, fallback @@ -2964,7 +2964,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) return set->ops->map_queues(set); } else { BUG_ON(set->nr_maps > 1); - return blk_mq_map_queues(&set->map[0]); + return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } } @@ -3234,7 +3234,7 @@ fallback: pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); set->nr_hw_queues = prev_nr_hw_queues; - blk_mq_map_queues(&set->map[0]); + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); goto fallback; } blk_mq_map_swqueue(q); -- cgit v1.2.3-59-g8ed1b From dce30ca9e3b676fb288c33c1f4725a0621361185 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 26 Feb 2019 11:51:50 +0100 Subject: fs: fix guard_bio_eod to check for real EOD errors guard_bio_eod() can truncate a segment in a bio to allow it to do IO on odd last sectors of a device. It already checks if the IO starts past EOD, but it does not consider the possibility that an IO request starting within device boundaries can contain more than one segment past EOD. In such cases, truncated_bytes can be bigger than PAGE_SIZE, and will underflow bvec->bv_len. Fix this by checking if truncated_bytes is lower than PAGE_SIZE. This situation has been found on filesystems such as isofs and vfat, which don't check the device size before mount. If the device is smaller than the filesystem itself, a readahead on such a filesystem that spans EOD can trigger this situation, leading to a call to zero_user() with a wrong size, possibly corrupting memory.
I didn't see any crash, nor did I let the system run long enough to check whether memory corruption would be hit somewhere, but adding instrumentation to guard_bio_eod() to check the truncated_bytes size was enough to see the error. The following script can trigger the error.

MNT=/mnt
IMG=./DISK.img
DEV=/dev/loop0

mkfs.vfat $IMG
mount $IMG $MNT
cp -R /etc $MNT &> /dev/null
umount $MNT
losetup -D
losetup --find --show --sizelimit 16247280 $IMG
mount $DEV $MNT
find $MNT -type f -exec cat {} + >/dev/null

Kudos to Eric Sandeen for coming up with the reproducer above. Reviewed-by: Ming Lei Signed-off-by: Carlos Maiolino Signed-off-by: Jens Axboe --- fs/buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index 89a4e42b9aad..ce357602f471 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3027,6 +3027,13 @@ void guard_bio_eod(int op, struct bio *bio) /* Uhhuh. We've got a bio that straddles the device size! */ truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); + /* + * The bio contains more than one segment which spans EOD, just return + * and let IO layer turn it into an EIO + */ + if (truncated_bytes > bvec->bv_len) + return; + /* Truncate the bio.. */ bio->bi_iter.bi_size -= truncated_bytes; bvec->bv_len -= truncated_bytes; -- cgit v1.2.3-59-g8ed1b From 4d7c1d3fd7c7eda7dea351f071945e843a46c145 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 20 Feb 2019 21:27:05 +0800 Subject: block: fix NULL pointer dereference in register_disk If __device_add_disk-->bdi_register_owner-->bdi_register--> bdi_register_va-->device_create_vargs fails, bdi->dev is still NULL, yet __device_add_disk-->register_disk will dereference bdi->dev->kobj. This patch fixes that. Signed-off-by: zhengbin Signed-off-by: Jens Axboe --- block/genhd.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 1dd8fd6613b8..78b82d26aa9c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -655,10 +655,12 @@ exit: kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); disk_part_iter_exit(&piter); - err = sysfs_create_link(&ddev->kobj, - &disk->queue->backing_dev_info->dev->kobj, - "bdi"); - WARN_ON(err); + if (disk->queue->backing_dev_info->dev) { + err = sysfs_create_link(&ddev->kobj, + &disk->queue->backing_dev_info->dev->kobj, + "bdi"); + WARN_ON(err); + } } /** -- cgit v1.2.3-59-g8ed1b From bf7c7a04014678c6880642936e3d420cdfe453bc Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Fri, 22 Feb 2019 20:00:01 +0100 Subject: null_blk: fix checking for REQ_FUA null_handle_bio() erroneously uses the bio_op() macro, which masks the request flag bits (including REQ_FUA) out of bi_opf, so the check always fails. Fix by checking bio->bi_opf directly.
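The distinction, as a sketch (based on the blk_types.h layout, where bio_op() keeps only the low REQ_OP_* bits of bi_opf while flags such as REQ_FUA live in the high bits):

	/*
	 * Illustrative helper: testing bio_op(bio) against REQ_FUA can
	 * never match, since bio_op() strips the flag bits; the flag has
	 * to be tested on bi_opf itself.
	 */
	static inline bool bio_wants_fua(const struct bio *bio)
	{
		return bio->bi_opf & REQ_FUA;	/* not bio_op(bio) & REQ_FUA */
	}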
Signed-off-by: Heinz Mauelshagen Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 83c38a6217d7..417a9f15c116 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1104,7 +1104,7 @@ static int null_handle_bio(struct nullb_cmd *cmd) len = bvec.bv_len; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(bio_op(bio)), sector, - bio_op(bio) & REQ_FUA); + bio->bi_opf & REQ_FUA); if (err) { spin_unlock_irq(&nullb->lock); return err; -- cgit v1.2.3-59-g8ed1b From 6dc8746d7124c14bd86679ead7e64614263212c9 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 18 Feb 2019 08:42:32 +0000 Subject: floppy: remove set but not used variable 'q' Fixes gcc '-Wunused-but-set-variable' warning: drivers/block/floppy.c: In function 'request_done': drivers/block/floppy.c:2233:24: warning: variable 'q' set but not used [-Wunused-but-set-variable] It's never used and can be removed. Acked-by: Jiri Kosina Signed-off-by: YueHaibing Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 6f2856c6d0f2..04d47683eddd 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2230,7 +2230,6 @@ static void floppy_end_request(struct request *req, blk_status_t error) static void request_done(int uptodate) { struct request *req = current_req; - struct request_queue *q; int block; char msg[sizeof("request done ") + sizeof(int) * 3]; @@ -2243,8 +2242,6 @@ static void request_done(int uptodate) return; } - q = req->q; - if (uptodate) { /* maintain values for invalidation on geometry * change */ -- cgit v1.2.3-59-g8ed1b From cd46eb89dff7f6390eaeb11013bb23aae196bbfc Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 19 Feb 2019 13:14:07 +0800 Subject: nbd: propagate genlmsg_reply return code genlmsg_reply can fail, so propagate its return code. Signed-off-by: Li RongQing Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 32a7ba1674b7..90ba9f4c03f3 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2118,8 +2118,7 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info) } nla_nest_end(reply, dev_list); genlmsg_end(reply, reply_head); - genlmsg_reply(reply, info); - ret = 0; + ret = genlmsg_reply(reply, info); out: mutex_unlock(&nbd_index_mutex); return ret; -- cgit v1.2.3-59-g8ed1b From dfc76d11dd455a63d50e63bf0a7edc690b3a37d0 Mon Sep 17 00:00:00 2001 From: Keyur Patel Date: Sun, 17 Feb 2019 10:21:56 -0500 Subject: block: Replace function name in string with __func__ Replace the hard-coded function name register_blkdev with __func__ to improve robustness and to conform to the Linux kernel coding style. Issue found using checkpatch.
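The general pattern (illustrative sketch; example_register is a made-up name): __func__ expands to the enclosing function's name at compile time, so log messages stay accurate across renames:

	static int example_register(void)
	{
		pr_err("%s: failed to get major\n", __func__);
		return -EBUSY;
	}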
Signed-off-by: Keyur Patel Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 78b82d26aa9c..703267865f14 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -365,8 +365,8 @@ int register_blkdev(unsigned int major, const char *name) } if (index == 0) { - printk("register_blkdev: failed to get major for %s\n", - name); + printk("%s: failed to get major for %s\n", + __func__, name); ret = -EBUSY; goto out; } @@ -375,8 +375,8 @@ int register_blkdev(unsigned int major, const char *name) } if (major >= BLKDEV_MAJOR_MAX) { - pr_err("register_blkdev: major requested (%u) is greater than the maximum (%u) for %s\n", - major, BLKDEV_MAJOR_MAX-1, name); + pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n", + __func__, major, BLKDEV_MAJOR_MAX-1, name); ret = -EINVAL; goto out; -- cgit v1.2.3-59-g8ed1b From aaeee62c841cc1e48231e1d60c304d2da9c4e41c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 2 Mar 2019 16:43:44 +0800 Subject: block: fix updating bio's front segment size When the current bvec can be merged into the 1st segment, the bio's front segment size has to be updated. However, dcebd755926b doesn't consider that case, so the bio's front segment size may not be correct. This patch fixes this issue. Cc: Christoph Hellwig Cc: Omar Sandoval Fixes: dcebd755926b ("block: use bio_for_each_bvec() to compute multi-page bvec count") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 9402a7c3ba22..22467f475ab4 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -277,6 +277,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bvprvp = &bvprv; sectors += bv.bv_len >> 9; + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; + continue; } new_segment: @@ -401,6 +404,11 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, seg_size += bv.bv_len; bvprv = bv; + + if (nr_phys_segs == 1 && seg_size > + front_seg_size) + front_seg_size = seg_size; + continue; } new_segment: -- cgit v1.2.3-59-g8ed1b