diff options
Diffstat (limited to 'drivers/nvme/host/tcp.c')
-rw-r--r-- | drivers/nvme/host/tcp.c | 136 |
1 files changed, 106 insertions, 30 deletions
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 49d4373b84eb..c15a92163c1f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -20,6 +20,16 @@ struct nvme_tcp_queue; +/* Define the socket priority to use for connections were it is desirable + * that the NIC consider performing optimized packet processing or filtering. + * A non-zero value being sufficient to indicate general consideration of any + * possible optimization. Making it a module param allows for alternative + * values that may be unique for some NIC implementations. + */ +static int so_priority; +module_param(so_priority, int, 0644); +MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); + enum nvme_tcp_send_state { NVME_TCP_SEND_CMD_PDU = 0, NVME_TCP_SEND_H2C_PDU, @@ -164,16 +174,14 @@ static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) { struct request *rq; - unsigned int bytes; if (unlikely(nvme_tcp_async_req(req))) return false; /* async events don't have a request */ rq = blk_mq_rq_from_pdu(req); - bytes = blk_rq_payload_bytes(rq); - return rq_data_dir(rq) == WRITE && bytes && - bytes <= nvme_tcp_inline_data_size(req->queue); + return rq_data_dir(rq) == WRITE && req->data_len && + req->data_len <= nvme_tcp_inline_data_size(req->queue); } static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) @@ -1017,8 +1025,15 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) if (req->state == NVME_TCP_SEND_DDGST) ret = nvme_tcp_try_send_ddgst(req); done: - if (ret == -EAGAIN) + if (ret == -EAGAIN) { ret = 0; + } else if (ret < 0) { + dev_err(queue->ctrl->ctrl.device, + "failed to send request %d\n", ret); + if (ret != -EPIPE && ret != -ECONNRESET) + nvme_tcp_fail_request(queue->request); + nvme_tcp_done_send_req(queue); + } return ret; } @@ -1049,25 +1064,16 @@ static void nvme_tcp_io_work(struct work_struct *w) int result; result = nvme_tcp_try_send(queue); - if (result > 0) { + if (result > 0) pending = true; - } else if (unlikely(result < 0)) { - dev_err(queue->ctrl->ctrl.device, - "failed to send request %d\n", result); - - /* - * Fail the request unless peer closed the connection, - * in which case error recovery flow will complete all. - */ - if ((result != -EPIPE) && (result != -ECONNRESET)) - nvme_tcp_fail_request(queue->request); - nvme_tcp_done_send_req(queue); - return; - } + else if (unlikely(result < 0)) + break; result = nvme_tcp_try_recv(queue); if (result > 0) pending = true; + else if (unlikely(result < 0)) + return; if (!pending) return; @@ -1248,13 +1254,67 @@ free_icreq: return ret; } +static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue) +{ + return nvme_tcp_queue_id(queue) == 0; +} + +static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT]; +} + +static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; +} + +static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + !nvme_tcp_read_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ] + + ctrl->io_queues[HCTX_TYPE_POLL]; +} + +static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + int n = 0; + + if (nvme_tcp_default_queue(queue)) + n = qid - 1; + else if (nvme_tcp_read_queue(queue)) + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1; + else if (nvme_tcp_poll_queue(queue)) + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - + ctrl->io_queues[HCTX_TYPE_READ] - 1; + queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); +} + static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, size_t queue_size) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; struct linger sol = { .l_onoff = 1, .l_linger = 0 }; - int ret, opt, rcv_pdu_size, n; + int ret, opt, rcv_pdu_size; queue->ctrl = ctrl; INIT_LIST_HEAD(&queue->send_list); @@ -1309,6 +1369,17 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, goto err_sock; } + if (so_priority > 0) { + ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, + (char *)&so_priority, sizeof(so_priority)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set SO_PRIORITY sock opt, ret %d\n", + ret); + goto err_sock; + } + } + /* Set socket type of service */ if (nctrl->opts->tos >= 0) { opt = nctrl->opts->tos; @@ -1322,11 +1393,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } queue->sock->sk->sk_allocation = GFP_ATOMIC; - if (!qid) - n = 0; - else - n = (qid - 1) % num_online_cpus(); - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + nvme_tcp_set_queue_io_cpu(queue); queue->request = NULL; queue->data_remaining = 0; queue->ddgst_remaining = 0; @@ -1861,8 +1928,13 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) } if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { - /* state change failure is ok if we're in DELETING state */ + /* + * state change failure is ok if we're in DELETING state, + * unless we're during creation of a new controller to + * avoid races with teardown flow. + */ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; } @@ -2090,7 +2162,9 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, c->common.flags |= NVME_CMD_SGL_METABUF; - if (rq_data_dir(rq) == WRITE && req->data_len && + if (!blk_rq_nr_phys_segments(rq)) + nvme_tcp_set_sg_null(c); + else if (rq_data_dir(rq) == WRITE && req->data_len <= nvme_tcp_inline_data_size(queue)) nvme_tcp_set_sg_inline(queue, c, req->data_len); else @@ -2117,7 +2191,8 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, req->data_sent = 0; req->pdu_len = 0; req->pdu_sent = 0; - req->data_len = blk_rq_payload_bytes(rq); + req->data_len = blk_rq_nr_phys_segments(rq) ? + blk_rq_payload_bytes(rq) : 0; req->curr_bio = rq->bio; if (rq_data_dir(rq) == WRITE && @@ -2224,6 +2299,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) struct nvme_tcp_queue *queue = hctx->driver_data; struct sock *sk = queue->sock->sk; + if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) + return 0; + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) sk_busy_loop(sk, true); nvme_tcp_try_recv(queue); @@ -2359,8 +2437,6 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - nvme_get_ctrl(&ctrl->ctrl); - mutex_lock(&nvme_tcp_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); mutex_unlock(&nvme_tcp_ctrl_mutex); |