aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/drivers/nvme/host/tcp.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/tcp.c')
-rw-r--r--drivers/nvme/host/tcp.c117
1 files changed, 55 insertions, 62 deletions
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index c15a92163c1f..1843110ec34f 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -60,6 +60,7 @@ struct nvme_tcp_request {
enum nvme_tcp_queue_flags {
NVME_TCP_Q_ALLOCATED = 0,
NVME_TCP_Q_LIVE = 1,
+ NVME_TCP_Q_POLLING = 2,
};
enum nvme_tcp_recv_state {
@@ -75,6 +76,7 @@ struct nvme_tcp_queue {
int io_cpu;
spinlock_t lock;
+ struct mutex send_mutex;
struct list_head send_list;
/* recv state */
@@ -131,6 +133,7 @@ static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
static struct workqueue_struct *nvme_tcp_wq;
static struct blk_mq_ops nvme_tcp_mq_ops;
static struct blk_mq_ops nvme_tcp_admin_mq_ops;
+static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
{
@@ -257,15 +260,29 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
}
}
-static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
+static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
+ bool sync)
{
struct nvme_tcp_queue *queue = req->queue;
+ bool empty;
spin_lock(&queue->lock);
+ empty = list_empty(&queue->send_list) && !queue->request;
list_add_tail(&req->entry, &queue->send_list);
spin_unlock(&queue->lock);
- queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ /*
+ * if we're the first on the send_list and we can try to send
+ * directly, otherwise queue io_work. Also, only do that if we
+ * are on the same cpu, so we don't introduce contention.
+ */
+ if (queue->io_cpu == smp_processor_id() &&
+ sync && empty && mutex_trylock(&queue->send_mutex)) {
+ nvme_tcp_try_send(queue);
+ mutex_unlock(&queue->send_mutex);
+ } else {
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ }
}
static inline struct nvme_tcp_request *
@@ -578,7 +595,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
req->state = NVME_TCP_SEND_H2C_PDU;
req->offset = 0;
- nvme_tcp_queue_request(req);
+ nvme_tcp_queue_request(req, false);
return 0;
}
@@ -794,11 +811,12 @@ static void nvme_tcp_data_ready(struct sock *sk)
{
struct nvme_tcp_queue *queue;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
- if (likely(queue && queue->rd_enabled))
+ if (likely(queue && queue->rd_enabled) &&
+ !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void nvme_tcp_write_space(struct sock *sk)
@@ -867,7 +885,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
if (last && !queue->data_digest)
flags |= MSG_EOR;
else
- flags |= MSG_MORE;
+ flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
/* can't zcopy slab pages */
if (unlikely(PageSlab(page))) {
@@ -906,11 +924,16 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
struct nvme_tcp_queue *queue = req->queue;
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
bool inline_data = nvme_tcp_has_inline_data(req);
- int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) + hdgst - req->offset;
+ int flags = MSG_DONTWAIT;
int ret;
+ if (inline_data)
+ flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ else
+ flags |= MSG_EOR;
+
if (queue->hdr_digest && !req->offset)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
@@ -949,7 +972,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
offset_in_page(pdu) + req->offset, len,
- MSG_DONTWAIT | MSG_MORE);
+ MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
if (unlikely(ret <= 0))
return ret;
@@ -1063,11 +1086,14 @@ static void nvme_tcp_io_work(struct work_struct *w)
bool pending = false;
int result;
- result = nvme_tcp_try_send(queue);
- if (result > 0)
- pending = true;
- else if (unlikely(result < 0))
- break;
+ if (mutex_trylock(&queue->send_mutex)) {
+ result = nvme_tcp_try_send(queue);
+ mutex_unlock(&queue->send_mutex);
+ if (result > 0)
+ pending = true;
+ else if (unlikely(result < 0))
+ break;
+ }
result = nvme_tcp_try_recv(queue);
if (result > 0)
@@ -1313,12 +1339,12 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
- struct linger sol = { .l_onoff = 1, .l_linger = 0 };
- int ret, opt, rcv_pdu_size;
+ int ret, rcv_pdu_size;
queue->ctrl = ctrl;
INIT_LIST_HEAD(&queue->send_list);
spin_lock_init(&queue->lock);
+ mutex_init(&queue->send_mutex);
INIT_WORK(&queue->io_work, nvme_tcp_io_work);
queue->queue_size = queue_size;
@@ -1337,60 +1363,24 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
}
/* Single syn retry */
- opt = 1;
- ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
- (char *)&opt, sizeof(opt));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set TCP_SYNCNT sock opt %d\n", ret);
- goto err_sock;
- }
+ tcp_sock_set_syncnt(queue->sock->sk, 1);
/* Set TCP no delay */
- opt = 1;
- ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
- TCP_NODELAY, (char *)&opt, sizeof(opt));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set TCP_NODELAY sock opt %d\n", ret);
- goto err_sock;
- }
+ tcp_sock_set_nodelay(queue->sock->sk);
/*
* Cleanup whatever is sitting in the TCP transmit queue on socket
* close. This is done to prevent stale data from being sent should
* the network connection be restored before TCP times out.
*/
- ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
- (char *)&sol, sizeof(sol));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set SO_LINGER sock opt %d\n", ret);
- goto err_sock;
- }
+ sock_no_linger(queue->sock->sk);
- if (so_priority > 0) {
- ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY,
- (char *)&so_priority, sizeof(so_priority));
- if (ret) {
- dev_err(ctrl->ctrl.device,
- "failed to set SO_PRIORITY sock opt, ret %d\n",
- ret);
- goto err_sock;
- }
- }
+ if (so_priority > 0)
+ sock_set_priority(queue->sock->sk, so_priority);
/* Set socket type of service */
- if (nctrl->opts->tos >= 0) {
- opt = nctrl->opts->tos;
- ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
- (char *)&opt, sizeof(opt));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set IP_TOS sock opt %d\n", ret);
- goto err_sock;
- }
- }
+ if (nctrl->opts->tos >= 0)
+ ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
queue->sock->sk->sk_allocation = GFP_ATOMIC;
nvme_tcp_set_queue_io_cpu(queue);
@@ -1543,6 +1533,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = 2; /* connect + keep-alive */
set->numa_node = NUMA_NO_NODE;
+ set->flags = BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = 1;
@@ -1554,7 +1545,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = 1; /* fabric connect */
set->numa_node = NUMA_NO_NODE;
- set->flags = BLK_MQ_F_SHOULD_MERGE;
+ set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
@@ -2113,7 +2104,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
- nvme_tcp_queue_request(&ctrl->async_req);
+ nvme_tcp_queue_request(&ctrl->async_req, true);
}
static enum blk_eh_timer_return
@@ -2244,7 +2235,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(rq);
- nvme_tcp_queue_request(req);
+ nvme_tcp_queue_request(req, true);
return BLK_STS_OK;
}
@@ -2302,9 +2293,11 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
return 0;
+ set_bit(NVME_TCP_Q_POLLING, &queue->flags);
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
nvme_tcp_try_recv(queue);
+ clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
return queue->nr_cqe;
}