Diffstat (limited to 'drivers/nvme/host/tcp.c')
-rw-r--r-- | drivers/nvme/host/tcp.c | 144
1 file changed, 101 insertions(+), 43 deletions(-)
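The diff below wires the nvme-tcp host driver into the block layer's polled-completion path (dedicated HCTX_TYPE_POLL queues driven by net busy-polling), lets the fabrics options request poll queues and an IP type-of-service value, adds a separate fabrics_q next to the admin queue, and drains outstanding requests during teardown. As a point of reference only, the sketch below shows how such a polled path is usually exercised from user space once a namespace is connected with poll queues; the device path, I/O size, and the assumption that the controller was connected with nr_poll_queues > 0 are illustrative and are not part of this change.

/*
 * User-space sketch (not part of this diff): issue a polled read so that the
 * block layer's hipri path, and therefore the driver's HCTX_TYPE_POLL queues,
 * get exercised. Assumes /dev/nvme0n1 is an nvme-tcp namespace connected with
 * poll queues; polled completion also requires O_DIRECT.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	const char *dev = "/dev/nvme0n1";	/* assumed device path */
	struct iovec iov;
	ssize_t ret;
	void *buf;
	int fd;

	fd = open(dev, O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* O_DIRECT needs an aligned buffer; 4096 covers typical LBA sizes */
	if (posix_memalign(&buf, 4096, 4096)) {
		close(fd);
		return 1;
	}
	iov.iov_base = buf;
	iov.iov_len = 4096;

	/* RWF_HIPRI asks the kernel to poll for the completion */
	ret = preadv2(fd, &iov, 1, 0, RWF_HIPRI);
	if (ret < 0)
		perror("preadv2");
	else
		printf("read %zd bytes via polled I/O\n", ret);

	free(buf);
	close(fd);
	return ret < 0 ? 1 : 0;
}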
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 606b13d35d16..4ffd5957637a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -13,6 +13,7 @@
 #include <net/tcp.h>
 #include <linux/blk-mq.h>
 #include <crypto/hash.h>
+#include <net/busy_poll.h>
 
 #include "nvme.h"
 #include "fabrics.h"
@@ -72,6 +73,7 @@ struct nvme_tcp_queue {
 	int			pdu_offset;
 	size_t			data_remaining;
 	size_t			ddgst_remaining;
+	unsigned int		nr_cqe;
 
 	/* send state */
 	struct nvme_tcp_request *request;
@@ -438,6 +440,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 	}
 
 	nvme_end_request(rq, cqe->status, cqe->result);
+	queue->nr_cqe++;
 
 	return 0;
 }
@@ -608,23 +611,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 
 	switch (hdr->type) {
 	case nvme_tcp_c2h_data:
-		ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
-		break;
+		return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
 	case nvme_tcp_rsp:
 		nvme_tcp_init_recv_ctx(queue);
-		ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
-		break;
+		return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
 	case nvme_tcp_r2t:
 		nvme_tcp_init_recv_ctx(queue);
-		ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
-		break;
+		return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
 	default:
 		dev_err(queue->ctrl->ctrl.device,
 			"unsupported pdu type (%d)\n", hdr->type);
 		return -EINVAL;
 	}
-
-	return ret;
 }
 
 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
@@ -701,8 +699,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 			nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
 			queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
 		} else {
-			if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS)
+			if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 				nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+				queue->nr_cqe++;
+			}
 			nvme_tcp_init_recv_ctx(queue);
 		}
 	}
@@ -742,6 +742,7 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
 				pdu->command_id);
 
 		nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+		queue->nr_cqe++;
 	}
 
 	nvme_tcp_init_recv_ctx(queue);
@@ -841,7 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
 
 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 {
-	nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR);
+	nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
 }
 
 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
@@ -1023,14 +1024,16 @@ done:
 
 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
 {
-	struct sock *sk = queue->sock->sk;
+	struct socket *sock = queue->sock;
+	struct sock *sk = sock->sk;
 	read_descriptor_t rd_desc;
 	int consumed;
 
 	rd_desc.arg.data = queue;
 	rd_desc.count = 1;
 	lock_sock(sk);
-	consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
+	queue->nr_cqe = 0;
+	consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
 	release_sock(sk);
 	return consumed;
 }
@@ -1255,7 +1258,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	queue->queue_size = queue_size;
 
 	if (qid > 0)
-		queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
+		queue->cmnd_capsule_len = nctrl->ioccsz * 16;
 	else
 		queue->cmnd_capsule_len = sizeof(struct nvme_command) +
 						NVME_TCP_ADMIN_CCSZ;
@@ -1263,7 +1266,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
 			IPPROTO_TCP, &queue->sock);
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
 			"failed to create socket: %d\n", ret);
 		return ret;
 	}
@@ -1273,7 +1276,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
 			(char *)&opt, sizeof(opt));
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
 			"failed to set TCP_SYNCNT sock opt %d\n", ret);
 		goto err_sock;
 	}
@@ -1283,7 +1286,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_NODELAY,
 			(char *)&opt, sizeof(opt));
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
 			"failed to set TCP_NODELAY sock opt %d\n", ret);
 		goto err_sock;
 	}
@@ -1296,11 +1299,23 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
 			(char *)&sol, sizeof(sol));
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
 			"failed to set SO_LINGER sock opt %d\n", ret);
 		goto err_sock;
 	}
 
+	/* Set socket type of service */
+	if (nctrl->opts->tos >= 0) {
+		opt = nctrl->opts->tos;
+		ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
+				(char *)&opt, sizeof(opt));
+		if (ret) {
+			dev_err(nctrl->device,
+				"failed to set IP_TOS sock opt %d\n", ret);
+			goto err_sock;
+		}
+	}
+
 	queue->sock->sk->sk_allocation = GFP_ATOMIC;
 	if (!qid)
 		n = 0;
@@ -1314,11 +1329,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	queue->pdu_offset = 0;
 	sk_set_memalloc(queue->sock->sk);
 
-	if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
+	if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
 		ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
 			sizeof(ctrl->src_addr));
 		if (ret) {
-			dev_err(ctrl->ctrl.device,
+			dev_err(nctrl->device,
 				"failed to bind queue %d socket %d\n",
 				qid, ret);
 			goto err_sock;
@@ -1330,7 +1345,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	if (queue->hdr_digest || queue->data_digest) {
 		ret = nvme_tcp_alloc_crypto(queue);
 		if (ret) {
-			dev_err(ctrl->ctrl.device,
+			dev_err(nctrl->device,
 				"failed to allocate queue %d crypto\n", qid);
 			goto err_sock;
 		}
@@ -1344,13 +1359,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 		goto err_crypto;
 	}
 
-	dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
+	dev_dbg(nctrl->device, "connecting queue %d\n",
 			nvme_tcp_queue_id(queue));
 
 	ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
 		sizeof(ctrl->addr), 0);
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
 			"failed to connect socket: %d\n", ret);
 		goto err_rcv_pdu;
 	}
@@ -1371,6 +1386,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
 	queue->sock->sk->sk_state_change = nvme_tcp_state_change;
 	queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+	queue->sock->sk->sk_ll_usec = 1;
 	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
 
 	return 0;
@@ -1469,7 +1485,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->driver_data = ctrl;
 		set->nr_hw_queues = nctrl->queue_count - 1;
 		set->timeout = NVME_IO_TIMEOUT;
-		set->nr_maps = 2 /* default + read */;
+		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
 	}
 
 	ret = blk_mq_alloc_tag_set(set);
@@ -1568,6 +1584,7 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
 
 	nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
 	nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
+	nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
 
 	return nr_io_queues;
 }
@@ -1599,6 +1616,12 @@ static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
 			min(opts->nr_io_queues, nr_io_queues);
 		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
 	}
+
+	if (opts->nr_poll_queues && nr_io_queues) {
+		/* map dedicated poll queues only if we have queues left */
+		ctrl->io_queues[HCTX_TYPE_POLL] =
+			min(opts->nr_poll_queues, nr_io_queues);
+	}
 }
 
 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
@@ -1680,6 +1703,7 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
 	nvme_tcp_stop_queue(ctrl, 0);
 	if (remove) {
 		blk_cleanup_queue(ctrl->admin_q);
+		blk_cleanup_queue(ctrl->fabrics_q);
 		blk_mq_free_tag_set(ctrl->admin_tagset);
 	}
 	nvme_tcp_free_admin_queue(ctrl);
@@ -1700,10 +1724,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
 			goto out_free_queue;
 		}
 
+		ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
+		if (IS_ERR(ctrl->fabrics_q)) {
+			error = PTR_ERR(ctrl->fabrics_q);
+			goto out_free_tagset;
+		}
+
 		ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
 		if (IS_ERR(ctrl->admin_q)) {
 			error = PTR_ERR(ctrl->admin_q);
-			goto out_free_tagset;
+			goto out_cleanup_fabrics_q;
 		}
 	}
@@ -1711,19 +1741,12 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
 	if (error)
 		goto out_cleanup_queue;
 
-	error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
-	if (error) {
-		dev_err(ctrl->device,
-			"prop_get NVME_REG_CAP failed\n");
-		goto out_stop_queue;
-	}
-
-	ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
-
-	error = nvme_enable_ctrl(ctrl, ctrl->cap);
+	error = nvme_enable_ctrl(ctrl);
 	if (error)
 		goto out_stop_queue;
 
+	blk_mq_unquiesce_queue(ctrl->admin_q);
+
 	error = nvme_init_identify(ctrl);
 	if (error)
 		goto out_stop_queue;
@@ -1735,6 +1758,9 @@ out_stop_queue:
 out_cleanup_queue:
 	if (new)
 		blk_cleanup_queue(ctrl->admin_q);
+out_cleanup_fabrics_q:
+	if (new)
+		blk_cleanup_queue(ctrl->fabrics_q);
 out_free_tagset:
 	if (new)
 		blk_mq_free_tag_set(ctrl->admin_tagset);
@@ -1748,10 +1774,13 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
 {
 	blk_mq_quiesce_queue(ctrl->admin_q);
 	nvme_tcp_stop_queue(ctrl, 0);
-	if (ctrl->admin_tagset)
+	if (ctrl->admin_tagset) {
 		blk_mq_tagset_busy_iter(ctrl->admin_tagset,
 			nvme_cancel_request, ctrl);
-	blk_mq_unquiesce_queue(ctrl->admin_q);
+		blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
+	}
+	if (remove)
+		blk_mq_unquiesce_queue(ctrl->admin_q);
 	nvme_tcp_destroy_admin_queue(ctrl, remove);
 }
@@ -1762,9 +1791,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
 		return;
 	nvme_stop_queues(ctrl);
 	nvme_tcp_stop_io_queues(ctrl);
-	if (ctrl->tagset)
+	if (ctrl->tagset) {
 		blk_mq_tagset_busy_iter(ctrl->tagset,
 			nvme_cancel_request, ctrl);
+		blk_mq_tagset_wait_completed_request(ctrl->tagset);
+	}
 	if (remove)
 		nvme_start_queues(ctrl);
 	nvme_tcp_destroy_io_queues(ctrl, remove);
@@ -1793,7 +1824,7 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
 {
 	struct nvmf_ctrl_options *opts = ctrl->opts;
-	int ret = -EINVAL;
+	int ret;
 
 	ret = nvme_tcp_configure_admin_queue(ctrl, new);
 	if (ret)
@@ -1876,6 +1907,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 	/* unquiesce to fail fast pending requests */
 	nvme_start_queues(ctrl);
 	nvme_tcp_teardown_admin_queue(ctrl, false);
+	blk_mq_unquiesce_queue(ctrl->admin_q);
 
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
 		/* state change failure is ok if we're in DELETING state */
@@ -1892,10 +1924,11 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
 	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
 
 	nvme_tcp_teardown_io_queues(ctrl, shutdown);
+	blk_mq_quiesce_queue(ctrl->admin_q);
 	if (shutdown)
 		nvme_shutdown_ctrl(ctrl);
 	else
-		nvme_disable_ctrl(ctrl, ctrl->cap);
+		nvme_disable_ctrl(ctrl);
 	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
 }
@@ -2151,14 +2184,36 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
 	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
 
+	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
+		/* map dedicated poll queues only if we have queues left */
+		set->map[HCTX_TYPE_POLL].nr_queues =
+				ctrl->io_queues[HCTX_TYPE_POLL];
+		set->map[HCTX_TYPE_POLL].queue_offset =
+			ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+			ctrl->io_queues[HCTX_TYPE_READ];
+		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
+	}
+
 	dev_info(ctrl->ctrl.device,
-		"mapped %d/%d default/read queues.\n",
+		"mapped %d/%d/%d default/read/poll queues.\n",
 		ctrl->io_queues[HCTX_TYPE_DEFAULT],
-		ctrl->io_queues[HCTX_TYPE_READ]);
+		ctrl->io_queues[HCTX_TYPE_READ],
+		ctrl->io_queues[HCTX_TYPE_POLL]);
 
 	return 0;
 }
 
+static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
+{
+	struct nvme_tcp_queue *queue = hctx->driver_data;
+	struct sock *sk = queue->sock->sk;
+
+	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue))
+		sk_busy_loop(sk, true);
+	nvme_tcp_try_recv(queue);
+	return queue->nr_cqe;
+}
+
 static struct blk_mq_ops nvme_tcp_mq_ops = {
 	.queue_rq	= nvme_tcp_queue_rq,
 	.complete	= nvme_complete_rq,
@@ -2167,6 +2222,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = {
 	.init_hctx	= nvme_tcp_init_hctx,
 	.timeout	= nvme_tcp_timeout,
 	.map_queues	= nvme_tcp_map_queues,
+	.poll		= nvme_tcp_poll,
 };
 
 static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
@@ -2220,7 +2276,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
 
 	INIT_LIST_HEAD(&ctrl->list);
 	ctrl->ctrl.opts = opts;
-	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
+	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
+				opts->nr_poll_queues + 1;
 	ctrl->ctrl.sqsize = opts->queue_size - 1;
 	ctrl->ctrl.kato = opts->kato;
@@ -2314,7 +2371,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
 			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
 			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
-			  NVMF_OPT_NR_WRITE_QUEUES,
+			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
+			  NVMF_OPT_TOS,
 	.create_ctrl	= nvme_tcp_create_ctrl,
 };
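The per-queue socket is primed for busy polling above (sk->sk_ll_usec = 1) so that sk_busy_loop() called from nvme_tcp_poll() spins on the receive path instead of sleeping. A rough user-space analogue of that kernel-side knob, shown purely as an illustration and not taken from this driver, is the SO_BUSY_POLL socket option; the 50 microsecond budget below is an arbitrary example value, and raising it above the current setting normally requires CAP_NET_ADMIN.

/*
 * Illustration only: enable receive busy polling on an ordinary TCP socket.
 * SO_BUSY_POLL comes from the kernel's socket UAPI headers, pulled in here
 * via <sys/socket.h> on current glibc.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	unsigned int busy_poll_usec = 50;	/* example budget, in usec */
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL,
		       &busy_poll_usec, sizeof(busy_poll_usec)) < 0)
		perror("setsockopt(SO_BUSY_POLL)");
	else
		printf("busy polling enabled for up to %u usec per read\n",
		       busy_poll_usec);

	close(fd);
	return 0;
}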