diff options
Diffstat (limited to 'net/sunrpc/xprtrdma')
-rw-r--r-- | net/sunrpc/xprtrdma/Makefile | 2 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/backchannel.c | 28 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/frwr_ops.c | 469 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/module.c | 1 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 302 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma.c | 199 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 170 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_pcl.c | 306 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 683 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_rw.c | 863 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_sendto.c | 1016 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_transport.c | 174 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c | 142 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 913 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h | 156 |
15 files changed, 2980 insertions, 2444 deletions
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 8ed0377d7a18..55b21bae866d 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ - module.o + svc_rdma_pcl.o module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 1a0ae0c61353..e4d84a13c566 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015-2020, Oracle and/or its affiliates. * - * Support for backward direction RPCs on RPC/RDMA. + * Support for reverse-direction RPCs on RPC/RDMA. */ #include <linux/sunrpc/xprt.h> @@ -13,10 +13,6 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - #undef RPCRDMA_BACKCHANNEL_DEBUG /** @@ -44,10 +40,10 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; size_t maxmsg; - maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv); + maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv); maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE); return maxmsg - RPCRDMA_HDRLEN_MIN; } @@ -82,7 +78,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) &rqst->rq_snd_buf, rpcrdma_noch_pullup)) return -EIO; - trace_xprtrdma_cb_reply(rqst); + trace_xprtrdma_cb_reply(r_xprt, rqst); return 0; } @@ -115,7 +111,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (rc < 0) goto failed_marshal; - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + if (frwr_send(r_xprt, req)) goto drop_connection; return 0; @@ -155,9 +151,11 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) { struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_rep *rep = req->rl_reply; struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - rpcrdma_recv_buffer_put(req->rl_reply); + rpcrdma_rep_put(&r_xprt->rx_buf, rep); req->rl_reply = NULL; spin_lock(&xprt->bc_pa_lock); @@ -190,8 +188,8 @@ create_req: if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS) return NULL; - size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE); - req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); + size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE); + req = rpcrdma_req_create(r_xprt, size); if (!req) return NULL; if (rpcrdma_req_setup(r_xprt, req)) { @@ -208,7 +206,7 @@ create_req: } /** - * rpcrdma_bc_receive_call - Handle a backward direction call + * rpcrdma_bc_receive_call - Handle a reverse-direction Call * @r_xprt: transport receiving the call * @rep: receive buffer containing the call * @@ -260,7 +258,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, */ req = rpcr_to_rdmar(rqst); req->rl_reply = rep; - trace_xprtrdma_cb_call(rqst); + trace_xprtrdma_cb_call(r_xprt, rqst); /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 125297c9aa3e..ffbf99894970 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -40,51 +40,56 @@ * New MRs are created on demand. */ -#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif +static void frwr_cid_init(struct rpcrdma_ep *ep, + struct rpcrdma_mr *mr) +{ + struct rpc_rdma_cid *cid = &mr->mr_cid; + + cid->ci_queue_id = ep->re_attr.send_cq->res.id; + cid->ci_completion_id = mr->mr_ibmr->res.id; +} + +static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) +{ + if (mr->mr_device) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + mr->mr_device = NULL; + } +} /** - * frwr_release_mr - Destroy one MR - * @mr: MR allocated by frwr_init_mr + * frwr_mr_release - Destroy one MR + * @mr: MR allocated by frwr_mr_init * */ -void frwr_release_mr(struct rpcrdma_mr *mr) +void frwr_mr_release(struct rpcrdma_mr *mr) { int rc; - rc = ib_dereg_mr(mr->frwr.fr_mr); + frwr_mr_unmap(mr->mr_xprt, mr); + + rc = ib_dereg_mr(mr->mr_ibmr); if (rc) trace_xprtrdma_frwr_dereg(mr, rc); kfree(mr->mr_sg); kfree(mr); } -static void frwr_mr_recycle(struct rpcrdma_mr *mr) +static void frwr_mr_put(struct rpcrdma_mr *mr) { - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - trace_xprtrdma_mr_recycle(mr); - - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; - } - - spin_lock(&r_xprt->rx_buf.rb_lock); - list_del(&mr->mr_all); - r_xprt->rx_stats.mrs_recycled++; - spin_unlock(&r_xprt->rx_buf.rb_lock); + frwr_mr_unmap(mr->mr_xprt, mr); - frwr_release_mr(mr); + /* The MR is returned to the req's MR free list instead + * of to the xprt's MR free list. No spinlock is needed. + */ + rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); } /* frwr_reset - Place MRs back on the free list @@ -102,76 +107,70 @@ void frwr_reset(struct rpcrdma_req *req) struct rpcrdma_mr *mr; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** - * frwr_init_mr - Initialize one MR - * @ia: interface adapter + * frwr_mr_init - Initialize one MR + * @r_xprt: controlling transport instance * @mr: generic MR to prepare for FRWR * * Returns zero if successful. Otherwise a negative errno * is returned. */ -int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - unsigned int depth = ia->ri_max_frwr_depth; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + unsigned int depth = ep->re_max_fr_depth; struct scatterlist *sg; struct ib_mr *frmr; - int rc; - frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + sg = kcalloc_node(depth, sizeof(*sg), XPRTRDMA_GFP_FLAGS, + ibdev_to_node(ep->re_id->device)); + if (!sg) + return -ENOMEM; + + frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth); if (IS_ERR(frmr)) goto out_mr_err; - sg = kcalloc(depth, sizeof(*sg), GFP_NOFS); - if (!sg) - goto out_list_err; - - mr->frwr.fr_mr = frmr; - mr->mr_dir = DMA_NONE; + mr->mr_xprt = r_xprt; + mr->mr_ibmr = frmr; + mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); - init_completion(&mr->frwr.fr_linv_done); + init_completion(&mr->mr_linv_done); + frwr_cid_init(ep, mr); sg_init_table(sg, depth); mr->mr_sg = sg; return 0; out_mr_err: - rc = PTR_ERR(frmr); - trace_xprtrdma_frwr_alloc(mr, rc); - return rc; - -out_list_err: - ib_dereg_mr(frmr); - return -ENOMEM; + kfree(sg); + trace_xprtrdma_frwr_alloc(mr, PTR_ERR(frmr)); + return PTR_ERR(frmr); } /** * frwr_query_device - Prepare a transport for use with FRWR - * @r_xprt: controlling transport instance + * @ep: endpoint to fill in * @device: RDMA device to query * * On success, sets: - * ep->rep_attr - * ep->rep_max_requests - * ia->ri_max_rdma_segs - * - * And these FRWR-related fields: - * ia->ri_max_frwr_depth - * ia->ri_mrtype + * ep->re_attr + * ep->re_max_requests + * ep->re_max_rdma_segs + * ep->re_max_fr_depth + * ep->re_mrtype * * Return values: * On success, returns zero. * %-EINVAL - the device does not support FRWR memory registration * %-ENOMEM - the device is not sufficiently capable for NFS/RDMA */ -int frwr_query_device(struct rpcrdma_xprt *r_xprt, - const struct ib_device *device) +int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) { const struct ib_device_attr *attrs = &device->attrs; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; int max_qp_wr, depth, delta; unsigned int max_sge; @@ -188,23 +187,23 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge); return -ENOMEM; } - ep->rep_attr.cap.max_send_sge = max_sge; - ep->rep_attr.cap.max_recv_sge = 1; + ep->re_attr.cap.max_send_sge = max_sge; + ep->re_attr.cap.max_recv_sge = 1; - ia->ri_mrtype = IB_MR_TYPE_MEM_REG; - if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) - ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; + ep->re_mrtype = IB_MR_TYPE_MEM_REG; + if (attrs->kernel_cap_flags & IBK_SG_GAPS_REG) + ep->re_mrtype = IB_MR_TYPE_SG_GAPS; /* Quirk: Some devices advertise a large max_fast_reg_page_list_len * capability, but perform optimally when the MRs are not larger * than a page. */ if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS) - ia->ri_max_frwr_depth = attrs->max_sge_rd; + ep->re_max_fr_depth = attrs->max_sge_rd; else - ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len; - if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS) - ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS; + ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len; + if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS) + ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS; /* Add room for frwr register and invalidate WRs. * 1. FRWR reg WR for head @@ -220,11 +219,11 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, /* Calculate N if the device max FRWR depth is smaller than * RPCRDMA_MAX_DATA_SEGS. */ - if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) { - delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth; + if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth; do { depth += 2; /* FRWR reg + invalidate */ - delta -= ia->ri_max_frwr_depth; + delta -= ep->re_max_fr_depth; } while (delta > 0); } @@ -233,34 +232,35 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, max_qp_wr -= 1; if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) return -ENOMEM; - if (ep->rep_max_requests > max_qp_wr) - ep->rep_max_requests = max_qp_wr; - ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; - if (ep->rep_attr.cap.max_send_wr > max_qp_wr) { - ep->rep_max_requests = max_qp_wr / depth; - if (!ep->rep_max_requests) + if (ep->re_max_requests > max_qp_wr) + ep->re_max_requests = max_qp_wr; + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; + if (ep->re_attr.cap.max_send_wr > max_qp_wr) { + ep->re_max_requests = max_qp_wr / depth; + if (!ep->re_max_requests) return -ENOMEM; - ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; } - ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ - ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests; - ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ - - ia->ri_max_rdma_segs = - DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth); + ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ + ep->re_attr.cap.max_recv_wr = ep->re_max_requests; + ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH; + ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ + + ep->re_max_rdma_segs = + DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth); /* Reply chunks require segments for head and tail buffers */ - ia->ri_max_rdma_segs += 2; - if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) - ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; + ep->re_max_rdma_segs += 2; + if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) + ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; /* Ensure the underlying device is capable of conveying the * largest r/wsize NFS will ask for. This guarantees that * failing over from one RDMA device to another will not * break NFS I/O. */ - if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS) + if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS) return -ENOMEM; return 0; @@ -286,41 +286,36 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_reg_wr *reg_wr; int i, n, dma_nents; struct ib_mr *ibmr; u8 key; - if (nsegs > ia->ri_max_frwr_depth) - nsegs = ia->ri_max_frwr_depth; + if (nsegs > ep->re_max_fr_depth) + nsegs = ep->re_max_fr_depth; for (i = 0; i < nsegs;) { - if (seg->mr_page) - sg_set_page(&mr->mr_sg[i], - seg->mr_page, - seg->mr_len, - offset_in_page(seg->mr_offset)); - else - sg_set_buf(&mr->mr_sg[i], seg->mr_offset, - seg->mr_len); + sg_set_page(&mr->mr_sg[i], seg->mr_page, + seg->mr_len, seg->mr_offset); ++seg; ++i; - if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS) + if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) continue; - if ((i < nsegs && offset_in_page(seg->mr_offset)) || + if ((i < nsegs && seg->mr_offset) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } mr->mr_dir = rpcrdma_data_dir(writing); mr->mr_nents = i; - dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents, + dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); if (!dma_nents) goto out_dmamap_err; + mr->mr_device = ep->re_id->device; - ibmr = mr->frwr.fr_mr; + ibmr = mr->mr_ibmr; n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); if (n != dma_nents) goto out_mapmr_err; @@ -330,7 +325,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, key = (u8)(ibmr->rkey & 0x000000FF); ib_update_fast_reg_key(ibmr, ++key); - reg_wr = &mr->frwr.fr_regwr; + reg_wr = &mr->mr_regwr; reg_wr->mr = ibmr; reg_wr->key = ibmr->rkey; reg_wr->access = writing ? @@ -345,7 +340,6 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, return seg; out_dmamap_err: - mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); return ERR_PTR(-EIO); @@ -356,54 +350,73 @@ out_mapmr_err: /** * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed FastReg WR * + * Each flushed MR gets destroyed after the QP has drained. */ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_fastreg(wc, frwr); - /* The MR will get recycled when the associated req is retransmitted */ + trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid); + + rpcrdma_flush_disconnect(cq->cq_context, wc); } /** - * frwr_send - post Send WR containing the RPC Call message - * @ia: interface adapter - * @req: Prepared RPC Call + * frwr_send - post Send WRs containing the RPC Call message + * @r_xprt: controlling transport instance + * @req: prepared RPC Call * * For FRWR, chain any FastReg WRs to the Send WR. Only a * single ib_post_send call is needed to register memory * and then post the Send WR. * - * Returns the result of ib_post_send. + * Returns the return code from ib_post_send. + * + * Caller must hold the transport send lock to ensure that the + * pointers to the transport's rdma_cm_id and QP are stable. */ -int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { - struct ib_send_wr *post_wr; + struct ib_send_wr *post_wr, *send_wr = &req->rl_wr; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rpcrdma_mr *mr; + unsigned int num_wrs; + int ret; - post_wr = &req->rl_wr; + num_wrs = 1; + post_wr = send_wr; list_for_each_entry(mr, &req->rl_registered, mr_list) { - struct rpcrdma_frwr *frwr; - - frwr = &mr->frwr; - - frwr->fr_cqe.done = frwr_wc_fastreg; - frwr->fr_regwr.wr.next = post_wr; - frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; - frwr->fr_regwr.wr.num_sge = 0; - frwr->fr_regwr.wr.opcode = IB_WR_REG_MR; - frwr->fr_regwr.wr.send_flags = 0; + trace_xprtrdma_mr_fastreg(mr); + + mr->mr_cqe.done = frwr_wc_fastreg; + mr->mr_regwr.wr.next = post_wr; + mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe; + mr->mr_regwr.wr.num_sge = 0; + mr->mr_regwr.wr.opcode = IB_WR_REG_MR; + mr->mr_regwr.wr.send_flags = 0; + post_wr = &mr->mr_regwr.wr; + ++num_wrs; + } - post_wr = &frwr->fr_regwr.wr; + if ((kref_read(&req->rl_kref) > 1) || num_wrs > ep->re_send_count) { + send_wr->send_flags |= IB_SEND_SIGNALED; + ep->re_send_count = min_t(unsigned int, ep->re_send_batch, + num_wrs - ep->re_send_count); + } else { + send_wr->send_flags &= ~IB_SEND_SIGNALED; + ep->re_send_count -= num_wrs; } - return ib_post_send(ia->ri_id->qp, post_wr, NULL); + trace_xprtrdma_post_send(req); + ret = ib_post_send(ep->re_id->qp, post_wr, NULL); + if (ret) + trace_xprtrdma_post_send_err(r_xprt, req, ret); + return ret; } /** @@ -419,56 +432,54 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_mr_remoteinv(mr); - rpcrdma_mr_put(mr); + trace_xprtrdma_mr_reminv(mr); + frwr_mr_put(mr); break; /* only one invalidated MR per RPC */ } } -static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) +static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr) { - if (wc->status != IB_WC_SUCCESS) - frwr_mr_recycle(mr); - else - rpcrdma_mr_put(mr); + if (likely(wc->status == IB_WC_SUCCESS)) + frwr_mr_put(mr); } /** * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * */ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li(wc, &mr->mr_cid); + frwr_mr_done(wc, mr); + + rpcrdma_flush_disconnect(cq->cq_context, wc); } /** * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * * Awaken anyone waiting for an MR to finish being fenced. */ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_wake(wc, frwr); - __frwr_release_mr(wc, mr); - complete(&frwr->fr_linv_done); + trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid); + frwr_mr_done(wc, mr); + complete(&mr->mr_linv_done); + + rpcrdma_flush_disconnect(cq->cq_context, wc); } /** @@ -485,8 +496,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, **prev, *last; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; - struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; int rc; @@ -495,85 +506,86 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ - frwr = NULL; prev = &first; - while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { - + mr = rpcrdma_mr_pop(&req->rl_registered); + do { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; - frwr = &mr->frwr; - frwr->fr_cqe.done = frwr_wc_localinv; - last = &frwr->fr_invwr; + last = &mr->mr_invwr; last->next = NULL; - last->wr_cqe = &frwr->fr_cqe; + last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; last->num_sge = 0; last->opcode = IB_WR_LOCAL_INV; last->send_flags = IB_SEND_SIGNALED; last->ex.invalidate_rkey = mr->mr_handle; + last->wr_cqe->done = frwr_wc_localinv; + *prev = last; prev = &last->next; - } + } while ((mr = rpcrdma_mr_pop(&req->rl_registered))); + + mr = container_of(last, struct rpcrdma_mr, mr_invwr); /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain * are complete. */ - frwr->fr_cqe.done = frwr_wc_localinv_wake; - reinit_completion(&frwr->fr_linv_done); + last->wr_cqe->done = frwr_wc_localinv_wake; + reinit_completion(&mr->mr_linv_done); /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us - * unless ri_id->qp is a valid pointer. + * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will * not happen, so don't wait in that case. */ if (bad_wr != first) - wait_for_completion(&frwr->fr_linv_done); + wait_for_completion(&mr->mr_linv_done); if (!rc) return; - /* Recycle MRs in the LOCAL_INV chain that did not get posted. + /* On error, the MRs get destroyed once the QP has drained. */ + trace_xprtrdma_post_linv_err(req, rc); + + /* Force a connection loss to ensure complete recovery. */ - trace_xprtrdma_post_linv(req, rc); - while (bad_wr) { - frwr = container_of(bad_wr, struct rpcrdma_frwr, - fr_invwr); - mr = container_of(frwr, struct rpcrdma_mr, frwr); - bad_wr = bad_wr->next; - - list_del_init(&mr->mr_list); - frwr_mr_recycle(mr); - } + rpcrdma_force_disconnect(ep); } /** * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * */ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); - struct rpcrdma_rep *rep = mr->mr_req->rl_reply; + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); + struct rpcrdma_rep *rep; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_done(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li_done(wc, &mr->mr_cid); - /* Ensure @rep is generated before __frwr_release_mr */ + /* Ensure that @rep is generated before the MR is released */ + rep = mr->mr_req->rl_reply; smp_rmb(); + + if (wc->status != IB_WC_SUCCESS) { + if (rep) + rpcrdma_unpin_rqst(rep); + rpcrdma_flush_disconnect(cq->cq_context, wc); + return; + } + frwr_mr_put(mr); rpcrdma_complete_rqst(rep); } @@ -590,66 +602,95 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, *last, **prev; - const struct ib_send_wr *bad_wr; - struct rpcrdma_frwr *frwr; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rpcrdma_mr *mr; int rc; /* Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ - frwr = NULL; prev = &first; - while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { - + mr = rpcrdma_mr_pop(&req->rl_registered); + do { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; - frwr = &mr->frwr; - frwr->fr_cqe.done = frwr_wc_localinv; - last = &frwr->fr_invwr; + last = &mr->mr_invwr; last->next = NULL; - last->wr_cqe = &frwr->fr_cqe; + last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; last->num_sge = 0; last->opcode = IB_WR_LOCAL_INV; last->send_flags = IB_SEND_SIGNALED; last->ex.invalidate_rkey = mr->mr_handle; + last->wr_cqe->done = frwr_wc_localinv; + *prev = last; prev = &last->next; - } + } while ((mr = rpcrdma_mr_pop(&req->rl_registered))); /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain * are complete. The last completion will wake up the * RPC waiter. */ - frwr->fr_cqe.done = frwr_wc_localinv_done; + last->wr_cqe->done = frwr_wc_localinv_done; /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us - * unless ri_id->qp is a valid pointer. + * unless re_id->qp is a valid pointer. */ - bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, NULL); if (!rc) return; - /* Recycle MRs in the LOCAL_INV chain that did not get posted. - */ - trace_xprtrdma_post_linv(req, rc); - while (bad_wr) { - frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); - mr = container_of(frwr, struct rpcrdma_mr, frwr); - bad_wr = bad_wr->next; - - frwr_mr_recycle(mr); - } + /* On error, the MRs get destroyed once the QP has drained. */ + trace_xprtrdma_post_linv_err(req, rc); /* The final LOCAL_INV WR in the chain is supposed to - * do the wake. If it was never posted, the wake will - * not happen, so wake here in that case. + * do the wake. If it was never posted, the wake does + * not happen. Unpin the rqst in preparation for its + * retransmission. + */ + rpcrdma_unpin_rqst(req->rl_reply); + + /* Force a connection loss to ensure complete recovery. */ - rpcrdma_complete_rqst(req->rl_reply); + rpcrdma_force_disconnect(ep); +} + +/** + * frwr_wp_create - Create an MR for padding Write chunks + * @r_xprt: transport resources to use + * + * Return 0 on success, negative errno on failure. + */ +int frwr_wp_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rpcrdma_mr_seg seg; + struct rpcrdma_mr *mr; + + mr = rpcrdma_mr_get(r_xprt); + if (!mr) + return -EAGAIN; + mr->mr_req = NULL; + ep->re_write_pad_mr = mr; + + seg.mr_len = XDR_UNIT; + seg.mr_page = virt_to_page(ep->re_write_pad); + seg.mr_offset = offset_in_page(ep->re_write_pad); + if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr))) + return -EIO; + trace_xprtrdma_mr_fastreg(mr); + + mr->mr_cqe.done = frwr_wc_fastreg; + mr->mr_regwr.wr.next = NULL; + mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe; + mr->mr_regwr.wr.num_sge = 0; + mr->mr_regwr.wr.opcode = IB_WR_REG_MR; + mr->mr_regwr.wr.send_flags = 0; + + return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL); } diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 620327c01302..45c5b41ac8dc 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("svcrdma"); MODULE_ALIAS("xprtrdma"); +MODULE_ALIAS("rpcrdma6"); static void __exit rpc_rdma_cleanup(void) { diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 28020ec104d4..190a4de239c8 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2014-2020, Oracle and/or its affiliates. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -54,10 +54,6 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - /* Returns size of largest RPC-over-RDMA header in a Call message * * The largest Call header contains a full-size Read list and a @@ -71,7 +67,7 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Read list size */ - size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); + size += maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ size += sizeof(__be32); /* segment count */ @@ -94,7 +90,7 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Write list size */ - size = sizeof(__be32); /* segment count */ + size += sizeof(__be32); /* segment count */ size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ @@ -103,21 +99,20 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) /** * rpcrdma_set_max_header_sizes - Initialize inline payload sizes - * @r_xprt: transport instance to initialize + * @ep: endpoint to initialize * * The max_inline fields contain the maximum size of an RPC message * so the marshaling code doesn't have to repeat this calculation * for every RPC. */ -void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) +void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep) { - unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + unsigned int maxsegs = ep->re_max_rdma_segs; - ep->rep_max_inline_send = - ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs); - ep->rep_max_inline_recv = - ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs); + ep->re_max_inline_send = + ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs); + ep->re_max_inline_recv = + ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs); } /* The client can send a request inline as long as the RPCRDMA header @@ -132,9 +127,10 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { struct xdr_buf *xdr = &rqst->rq_snd_buf; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned int count, remaining, offset; - if (xdr->len > r_xprt->rx_ep.rep_max_inline_send) + if (xdr->len > ep->re_max_inline_send) return false; if (xdr->page_len) { @@ -145,7 +141,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, remaining -= min_t(unsigned int, PAGE_SIZE - offset, remaining); offset = 0; - if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge) + if (++count > ep->re_attr.cap.max_send_sge) return false; } } @@ -162,7 +158,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv; + return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv; } /* The client is required to provide a Reply chunk if the maximum @@ -176,12 +172,35 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, const struct xdr_buf *buf = &rqst->rq_rcv_buf; return (buf->head[0].iov_len + buf->tail[0].iov_len) < - r_xprt->rx_ep.rep_max_inline_recv; + r_xprt->rx_ep->re_max_inline_recv; +} + +/* ACL likes to be lazy in allocating pages. For TCP, these + * pages can be allocated during receive processing. Not true + * for RDMA, which must always provision receive buffers + * up front. + */ +static noinline int +rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) +{ + struct page **ppages; + int len; + + len = buf->page_len; + ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); + while (len > 0) { + if (!*ppages) + *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + if (!*ppages) + return -ENOBUFS; + ppages++; + len -= PAGE_SIZE; + } + + return 0; } -/* Split @vec on page boundaries into SGEs. FMR registers pages, not - * a byte range. Other modes coalesce these SGEs into a single MR - * when they can. +/* Convert @vec to a single SGL element. * * Returns pointer to next available SGE, and bumps the total number * of SGEs consumed. @@ -190,22 +209,11 @@ static struct rpcrdma_mr_seg * rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, unsigned int *n) { - u32 remaining, page_offset; - char *base; - - base = vec->iov_base; - page_offset = offset_in_page(base); - remaining = vec->iov_len; - while (remaining) { - seg->mr_page = NULL; - seg->mr_offset = base; - seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); - remaining -= seg->mr_len; - base += seg->mr_len; - ++seg; - ++(*n); - page_offset = 0; - } + seg->mr_page = virt_to_page(vec->iov_base); + seg->mr_offset = offset_in_page(vec->iov_base); + seg->mr_len = vec->iov_len; + ++seg; + ++(*n); return seg; } @@ -233,17 +241,8 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); while (len) { - /* ACL likes to be lazy in allocating pages - ACLs - * are small by default but can get huge. - */ - if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { - if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); - if (!*ppages) - return -ENOBUFS; - } seg->mr_page = *ppages; - seg->mr_offset = (char *)page_base; + seg->mr_offset = page_base; seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); len -= seg->mr_len; ++ppages; @@ -252,22 +251,11 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, page_base = 0; } - /* When encoding a Read chunk, the tail iovec contains an - * XDR pad and may be omitted. - */ - if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) - goto out; - - /* When encoding a Write chunk, some servers need to see an - * extra segment for non-XDR-aligned Write chunks. The upper - * layer provides space in the tail iovec that may be used - * for this purpose. - */ - if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) + if (type == rpcrdma_readch || type == rpcrdma_writech) goto out; if (xdrbuf->tail[0].iov_len) - seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); + rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); out: if (unlikely(n > RPCRDMA_MAX_SEGS)) @@ -275,40 +263,6 @@ out: return n; } -static inline int -encode_item_present(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, sizeof(*p)); - if (unlikely(!p)) - return -EMSGSIZE; - - *p = xdr_one; - return 0; -} - -static inline int -encode_item_not_present(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, sizeof(*p)); - if (unlikely(!p)) - return -EMSGSIZE; - - *p = xdr_zero; - return 0; -} - -static void -xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) -{ - *iptr++ = cpu_to_be32(mr->mr_handle); - *iptr++ = cpu_to_be32(mr->mr_length); - xdr_encode_hyper(iptr, mr->mr_offset); -} - static int encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) { @@ -318,7 +272,7 @@ encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) if (unlikely(!p)) return -EMSGSIZE; - xdr_encode_rdma_segment(p, mr); + xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset); return 0; } @@ -333,8 +287,8 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return -EMSGSIZE; *p++ = xdr_one; /* Item present */ - *p++ = cpu_to_be32(position); - xdr_encode_rdma_segment(p, mr); + xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length, + mr->mr_offset); return 0; } @@ -349,7 +303,6 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, *mr = rpcrdma_mr_get(r_xprt); if (!*mr) goto out_getmr_err; - trace_xprtrdma_mr_get(req); (*mr)->mr_req = req; } @@ -357,7 +310,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); out_getmr_err: - trace_xprtrdma_nomrs(req); + trace_xprtrdma_nomrs_err(r_xprt, req); xprt_wait_for_buffer_space(&r_xprt->rx_xprt); rpcrdma_mrs_refresh(r_xprt); return ERR_PTR(-EAGAIN); @@ -414,7 +367,9 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, } while (nsegs); done: - return encode_item_not_present(xdr); + if (xdr_stream_encode_item_absent(xdr) < 0) + return -EMSGSIZE; + return 0; } /* Register and XDR encode the Write list. Supports encoding a list @@ -438,6 +393,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rpcrdma_mr_seg *seg; struct rpcrdma_mr *mr; int nsegs, nchunks; @@ -453,7 +409,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, if (nsegs < 0) return nsegs; - if (encode_item_present(xdr) < 0) + if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; segcount = xdr_reserve_space(xdr, sizeof(*segcount)); if (unlikely(!segcount)) @@ -476,11 +432,25 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, nsegs -= mr->mr_nents; } while (nsegs); + if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) { + if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0) + return -EMSGSIZE; + + trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr, + nsegs); + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; + nchunks++; + nsegs -= mr->mr_nents; + } + /* Update count of segments in this Write chunk */ *segcount = cpu_to_be32(nchunks); done: - return encode_item_not_present(xdr); + if (xdr_stream_encode_item_absent(xdr) < 0) + return -EMSGSIZE; + return 0; } /* Register and XDR encode the Reply chunk. Supports encoding an array @@ -506,15 +476,18 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, int nsegs, nchunks; __be32 *segcount; - if (wtype != rpcrdma_replych) - return encode_item_not_present(xdr); + if (wtype != rpcrdma_replych) { + if (xdr_stream_encode_item_absent(xdr) < 0) + return -EMSGSIZE; + return 0; + } seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) return nsegs; - if (encode_item_present(xdr) < 0) + if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; segcount = xdr_reserve_space(xdr, sizeof(*segcount)); if (unlikely(!segcount)) @@ -894,6 +867,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) __be32 *p; int ret; + if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) { + ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf); + if (ret) + return ret; + } + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), rqst); @@ -911,8 +890,8 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) * or privacy, direct data placement of individual data items * is not allowed. */ - ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & - RPCAUTH_AUTH_DATATOUCH); + ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH, + &rqst->rq_cred->cr_auth->au_flags); /* * Chunks needed for results? @@ -1142,6 +1121,7 @@ static bool rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) #if defined(CONFIG_SUNRPC_BACKCHANNEL) { + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct xdr_stream *xdr = &rep->rr_stream; __be32 *p; @@ -1152,11 +1132,11 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) p = xdr_inline_decode(xdr, 0); /* Chunk lists */ - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; /* RPC header */ @@ -1165,19 +1145,19 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) if (*p != cpu_to_be32(RPC_CALL)) return false; + /* No bc service. */ + if (xprt->bc_serv == NULL) + return false; + /* Now that we are sure this is a backchannel call, * advance to the RPC header. */ p = xdr_inline_decode(xdr, 3 * sizeof(*p)); if (unlikely(!p)) - goto out_short; + return true; rpcrdma_bc_receive_call(r_xprt, rep); return true; - -out_short: - pr_warn("RPC/RDMA short backward direction call\n"); - return true; } #else /* CONFIG_SUNRPC_BACKCHANNEL */ { @@ -1195,10 +1175,7 @@ static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) if (unlikely(!p)) return -EIO; - handle = be32_to_cpup(p++); - *length = be32_to_cpup(p++); - xdr_decode_hyper(p, &offset); - + xdr_decode_rdma_segment(p, &handle, length, &offset); trace_xprtrdma_decode_seg(handle, *length, offset); return 0; } @@ -1234,7 +1211,7 @@ static int decode_read_list(struct xdr_stream *xdr) p = xdr_inline_decode(xdr, sizeof(*p)); if (unlikely(!p)) return -EIO; - if (unlikely(*p != xdr_zero)) + if (unlikely(xdr_item_is_present(p))) return -EIO; return 0; } @@ -1253,7 +1230,7 @@ static int decode_write_list(struct xdr_stream *xdr, u32 *length) p = xdr_inline_decode(xdr, sizeof(*p)); if (unlikely(!p)) return -EIO; - if (*p == xdr_zero) + if (xdr_item_is_absent(p)) break; if (!first) return -EIO; @@ -1275,7 +1252,7 @@ static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) return -EIO; *length = 0; - if (*p != xdr_zero) + if (xdr_item_is_present(p)) if (decode_write_chunk(xdr, length)) return -EIO; return 0; @@ -1352,29 +1329,47 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) break; - dprintk("RPC: %s: server reports " - "version error (%u-%u), xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(*(p + 1)), - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_vers(rqst, p, p + 1); break; case err_chunk: - dprintk("RPC: %s: server reports " - "header decoding error, xid %08x\n", __func__, - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_chunk(rqst); break; default: - dprintk("RPC: %s: server reports " - "unrecognized error %d, xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_unrecognized(rqst, p); } - r_xprt->rx_stats.bad_reply_count++; - return -EREMOTEIO; + return -EIO; +} + +/** + * rpcrdma_unpin_rqst - Release rqst without completing it + * @rep: RPC/RDMA Receive context + * + * This is done when a connection is lost so that a Reply + * can be dropped and its matching Call can be subsequently + * retransmitted on a new connection. + */ +void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep) +{ + struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt; + struct rpc_rqst *rqst = rep->rr_rqst; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + + req->rl_reply = NULL; + rep->rr_rqst = NULL; + + spin_lock(&xprt->queue_lock); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->queue_lock); } -/* Perform XID lookup, reconstruction of the RPC reply, and - * RPC completion while holding the transport lock to ensure - * the rep, rqst, and rq_task pointers remain stable. +/** + * rpcrdma_complete_rqst - Pass completed rqst back to RPC + * @rep: RPC/RDMA Receive context + * + * Reconstruct the RPC reply and complete the transaction + * while @rqst is still pinned to ensure the rep, rqst, and + * rq_task pointers remain stable. */ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) { @@ -1406,13 +1401,11 @@ out: spin_unlock(&xprt->queue_lock); return; -/* If the incoming reply terminated a pending RPC, the next - * RPC call will post a replacement receive buffer as it is - * being marshaled. - */ out_badheader: - trace_xprtrdma_reply_hdr(rep); + trace_xprtrdma_reply_hdr_err(rep); r_xprt->rx_stats.bad_reply_count++; + rqst->rq_task->tk_status = status; + status = 0; goto out; } @@ -1476,21 +1469,20 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (credits == 0) credits = 1; /* don't deadlock */ - else if (credits > r_xprt->rx_ep.rep_max_requests) - credits = r_xprt->rx_ep.rep_max_requests; + else if (credits > r_xprt->rx_ep->re_max_requests) + credits = r_xprt->rx_ep->re_max_requests; + rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1), + false); if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); - rpcrdma_post_recvs(r_xprt, false); req = rpcr_to_rdmar(rqst); - if (req->rl_reply) { - trace_xprtrdma_leaked_rep(rqst, req->rl_reply); - rpcrdma_recv_buffer_put(req->rl_reply); - } + if (unlikely(req->rl_reply)) + rpcrdma_rep_put(buf, req->rl_reply); req->rl_reply = rep; rep->rr_rqst = rqst; - trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); + trace_xprtrdma_reply(rqst->rq_task, rep, credits); if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) frwr_reminv(rep, &req->rl_registered); @@ -1502,17 +1494,17 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) return; out_badversion: - trace_xprtrdma_reply_vers(rep); + trace_xprtrdma_reply_vers_err(rep); goto out; out_norqst: spin_unlock(&xprt->queue_lock); - trace_xprtrdma_reply_rqst(rep); + trace_xprtrdma_reply_rqst_err(rep); goto out; out_shortreply: - trace_xprtrdma_reply_short(rep); + trace_xprtrdma_reply_short_err(rep); out: - rpcrdma_recv_buffer_put(rep); + rpcrdma_rep_put(buf, rep); } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 97bca509a391..5bc20e9d09cd 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -62,52 +62,47 @@ static unsigned int max_max_requests = 16384; unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; +static unsigned int svcrdma_stat_unused; +static unsigned int zero; -atomic_t rdma_stat_recv; -atomic_t rdma_stat_read; -atomic_t rdma_stat_write; -atomic_t rdma_stat_sq_starve; -atomic_t rdma_stat_rq_starve; -atomic_t rdma_stat_rq_poll; -atomic_t rdma_stat_rq_prod; -atomic_t rdma_stat_sq_poll; -atomic_t rdma_stat_sq_prod; +struct percpu_counter svcrdma_stat_read; +struct percpu_counter svcrdma_stat_recv; +struct percpu_counter svcrdma_stat_sq_starve; +struct percpu_counter svcrdma_stat_write; -/* - * This function implements reading and resetting an atomic_t stat - * variable through read/write to a proc file. Any write to the file - * resets the associated statistic to zero. Any read returns it's - * current value. - */ -static int read_reset_stat(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +enum { + SVCRDMA_COUNTER_BUFSIZ = sizeof(unsigned long long), +}; + +static int svcrdma_counter_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { - atomic_t *stat = (atomic_t *)table->data; - - if (!stat) - return -EINVAL; - - if (write) - atomic_set(stat, 0); - else { - char str_buf[32]; - int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); - if (len >= 32) - return -EFAULT; - len = strlen(str_buf); - if (*ppos > len) { - *lenp = 0; - return 0; - } - len -= *ppos; - if (len > *lenp) - len = *lenp; - if (len && copy_to_user(buffer, str_buf, len)) - return -EFAULT; - *lenp = len; - *ppos += len; + struct percpu_counter *stat = (struct percpu_counter *)table->data; + char tmp[SVCRDMA_COUNTER_BUFSIZ + 1]; + int len; + + if (write) { + percpu_counter_set(stat, 0); + return 0; } + + len = snprintf(tmp, SVCRDMA_COUNTER_BUFSIZ, "%lld\n", + percpu_counter_sum_positive(stat)); + if (len >= SVCRDMA_COUNTER_BUFSIZ) + return -EFAULT; + len = strlen(tmp); + if (*ppos > len) { + *lenp = 0; + return 0; + } + len -= *ppos; + if (len > *lenp) + len = *lenp; + if (len) + memcpy(buffer, tmp, len); + *lenp = len; + *ppos += len; + return 0; } @@ -143,66 +138,76 @@ static struct ctl_table svcrdma_parm_table[] = { { .procname = "rdma_stat_read", - .data = &rdma_stat_read, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_read, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_recv", - .data = &rdma_stat_recv, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_recv, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_write", - .data = &rdma_stat_write, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_write, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_sq_starve", - .data = &rdma_stat_sq_starve, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_sq_starve, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_rq_starve", - .data = &rdma_stat_rq_starve, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_rq_poll", - .data = &rdma_stat_rq_poll, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_rq_prod", - .data = &rdma_stat_rq_prod, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_sq_poll", - .data = &rdma_stat_sq_poll, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_sq_prod", - .data = &rdma_stat_sq_prod, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { }, }; @@ -225,27 +230,69 @@ static struct ctl_table svcrdma_root_table[] = { { }, }; +static void svc_rdma_proc_cleanup(void) +{ + if (!svcrdma_table_header) + return; + unregister_sysctl_table(svcrdma_table_header); + svcrdma_table_header = NULL; + + percpu_counter_destroy(&svcrdma_stat_write); + percpu_counter_destroy(&svcrdma_stat_sq_starve); + percpu_counter_destroy(&svcrdma_stat_recv); + percpu_counter_destroy(&svcrdma_stat_read); +} + +static int svc_rdma_proc_init(void) +{ + int rc; + + if (svcrdma_table_header) + return 0; + + rc = percpu_counter_init(&svcrdma_stat_read, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_write, 0, GFP_KERNEL); + if (rc) + goto out_err; + + svcrdma_table_header = register_sysctl_table(svcrdma_root_table); + return 0; + +out_err: + percpu_counter_destroy(&svcrdma_stat_sq_starve); + percpu_counter_destroy(&svcrdma_stat_recv); + percpu_counter_destroy(&svcrdma_stat_read); + return rc; +} + void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); - if (svcrdma_table_header) { - unregister_sysctl_table(svcrdma_table_header); - svcrdma_table_header = NULL; - } svc_unreg_xprt_class(&svc_rdma_class); + svc_rdma_proc_cleanup(); } int svc_rdma_init(void) { + int rc; + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); dprintk("\tmax_requests : %u\n", svcrdma_max_requests); dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); - if (!svcrdma_table_header) - svcrdma_table_header = - register_sysctl_table(svcrdma_root_table); + rc = svc_rdma_proc_init(); + if (rc) + return rc; /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 908e78bb87c6..aa2227a7e552 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2015-2018 Oracle. All rights reserved. * - * Support for backward direction RPCs on RPC/RDMA (server-side). + * Support for reverse-direction RPCs on RPC/RDMA (server-side). */ #include <linux/sunrpc/svc_rdma.h> @@ -10,59 +10,34 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - -#undef SVCRDMA_BACKCHANNEL_DEBUG - /** - * svc_rdma_handle_bc_reply - Process incoming backchannel reply - * @xprt: controlling backchannel transport - * @rdma_resp: pointer to incoming transport header - * @rcvbuf: XDR buffer into which to decode the reply + * svc_rdma_handle_bc_reply - Process incoming backchannel Reply + * @rqstp: resources for handling the Reply + * @rctxt: Received message * - * Returns: - * %0 if @rcvbuf is filled in, xprt_complete_rqst called, - * %-EAGAIN if server should call ->recvfrom again. */ -int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, - struct xdr_buf *rcvbuf) +void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *rctxt) { + struct svc_xprt *sxprt = rqstp->rq_xprt; + struct rpc_xprt *xprt = sxprt->xpt_bc_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct xdr_buf *rcvbuf = &rqstp->rq_arg; struct kvec *dst, *src = &rcvbuf->head[0]; + __be32 *rdma_resp = rctxt->rc_recv_buf; struct rpc_rqst *req; u32 credits; - size_t len; - __be32 xid; - __be32 *p; - int ret; - - p = (__be32 *)src->iov_base; - len = src->iov_len; - xid = *rdma_resp; - -#ifdef SVCRDMA_BACKCHANNEL_DEBUG - pr_info("%s: xid=%08x, length=%zu\n", - __func__, be32_to_cpu(xid), len); - pr_info("%s: RPC/RDMA: %*ph\n", - __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp); - pr_info("%s: RPC: %*ph\n", - __func__, (int)len, p); -#endif - - ret = -EAGAIN; - if (src->iov_len < 24) - goto out_shortreply; spin_lock(&xprt->queue_lock); - req = xprt_lookup_rqst(xprt, xid); + req = xprt_lookup_rqst(xprt, *rdma_resp); if (!req) - goto out_notfound; + goto out_unlock; dst = &req->rq_private_buf.head[0]; memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); - if (dst->iov_len < len) + if (dst->iov_len < src->iov_len) goto out_unlock; - memcpy(dst->iov_base, p, len); + memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_pin_rqst(req); spin_unlock(&xprt->queue_lock); @@ -71,34 +46,20 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_bc_max_requests) credits = r_xprt->rx_buf.rb_bc_max_requests; - spin_lock(&xprt->transport_lock); xprt->cwnd = credits << RPC_CWNDSHIFT; spin_unlock(&xprt->transport_lock); spin_lock(&xprt->queue_lock); - ret = 0; xprt_complete_rqst(req->rq_task, rcvbuf->len); xprt_unpin_rqst(req); rcvbuf->len = 0; out_unlock: spin_unlock(&xprt->queue_lock); -out: - return ret; - -out_shortreply: - dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", - xprt, src->iov_len); - goto out; - -out_notfound: - dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", - xprt, be32_to_cpu(xid)); - goto out_unlock; } -/* Send a backwards direction RPC call. +/* Send a reverse-direction RPC Call. * * Caller holds the connection's mutex and has already marshaled * the RPC/RDMA request. @@ -113,11 +74,17 @@ out_notfound: */ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst, - struct svc_rdma_send_ctxt *ctxt) + struct svc_rdma_send_ctxt *sctxt) { + struct svc_rdma_recv_ctxt *rctxt; int ret; - ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqst->rq_snd_buf, NULL); + rctxt = svc_rdma_recv_ctxt_get(rdma); + if (!rctxt) + return -EIO; + + ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf); + svc_rdma_recv_ctxt_put(rdma, rctxt); if (ret < 0) return -EIO; @@ -125,8 +92,14 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, * the rq_buffer before all retransmits are complete. */ get_page(virt_to_page(rqst->rq_buffer)); - ctxt->sc_send_wr.opcode = IB_WR_SEND; - return svc_rdma_send(rdma, &ctxt->sc_send_wr); + sctxt->sc_send_wr.opcode = IB_WR_SEND; + ret = svc_rdma_send(rdma, sctxt); + if (ret < 0) + return ret; + + ret = wait_for_completion_killable(&sctxt->sc_done); + svc_rdma_send_ctxt_put(rdma, sctxt); + return ret; } /* Server-side transport endpoint wants a whole page for its send @@ -146,12 +119,12 @@ xprt_rdma_bc_allocate(struct rpc_task *task) return -EINVAL; } - page = alloc_page(RPCRDMA_DEF_GFP); + page = alloc_page(GFP_NOIO | __GFP_NOWARN); if (!page) return -ENOMEM; rqst->rq_buffer = page_address(page); - rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP); + rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, GFP_NOIO | __GFP_NOWARN); if (!rqst->rq_rbuffer) { put_page(page); return -ENOMEM; @@ -181,7 +154,9 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) if (!ctxt) goto drop_connection; - p = ctxt->sc_xprt_buf; + p = xdr_reserve_space(&ctxt->sc_stream, RPCRDMA_HDRLEN_MIN); + if (!p) + goto put_ctxt; *p++ = rqst->rq_xid; *p++ = rpcrdma_version; *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); @@ -189,67 +164,55 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) *p++ = xdr_zero; *p++ = xdr_zero; *p = xdr_zero; - svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_MIN); - -#ifdef SVCRDMA_BACKCHANNEL_DEBUG - pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); -#endif rqst->rq_xtime = ktime_get(); rc = svc_rdma_bc_sendto(rdma, rqst, ctxt); - if (rc) { - svc_rdma_send_ctxt_put(rdma, ctxt); - goto drop_connection; - } + if (rc) + goto put_ctxt; return 0; +put_ctxt: + svc_rdma_send_ctxt_put(rdma, ctxt); + drop_connection: - dprintk("svcrdma: failed to send bc call\n"); return -ENOTCONN; } -/* Send an RPC call on the passive end of a transport - * connection. +/** + * xprt_rdma_bc_send_request - Send a reverse-direction Call + * @rqst: rpc_rqst containing Call message to be sent + * + * Return values: + * %0 if the message was sent successfully + * %ENOTCONN if the message was not sent */ -static int -xprt_rdma_bc_send_request(struct rpc_rqst *rqst) +static int xprt_rdma_bc_send_request(struct rpc_rqst *rqst) { struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; - struct svcxprt_rdma *rdma; + struct svcxprt_rdma *rdma = + container_of(sxprt, struct svcxprt_rdma, sc_xprt); int ret; - dprintk("svcrdma: sending bc call with xid: %08x\n", - be32_to_cpu(rqst->rq_xid)); - - mutex_lock(&sxprt->xpt_mutex); - - ret = -ENOTCONN; - rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) { - ret = rpcrdma_bc_send_request(rdma, rqst); - if (ret == -ENOTCONN) - svc_close_xprt(sxprt); - } - - mutex_unlock(&sxprt->xpt_mutex); + if (test_bit(XPT_DEAD, &sxprt->xpt_flags)) + return -ENOTCONN; - if (ret < 0) - return ret; - return 0; + ret = rpcrdma_bc_send_request(rdma, rqst); + if (ret == -ENOTCONN) + svc_xprt_close(sxprt); + return ret; } static void xprt_rdma_bc_close(struct rpc_xprt *xprt) { - dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + xprt_disconnect_done(xprt); xprt->cwnd = RPC_CWNDSHIFT; } static void xprt_rdma_bc_put(struct rpc_xprt *xprt) { - dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); - + xprt_rdma_free_addresses(xprt); xprt_free(xprt); } @@ -283,26 +246,21 @@ xprt_setup_rdma_bc(struct xprt_create *args) struct rpc_xprt *xprt; struct rpcrdma_xprt *new_xprt; - if (args->addrlen > sizeof(xprt->addr)) { - dprintk("RPC: %s: address too large\n", __func__); + if (args->addrlen > sizeof(xprt->addr)) return ERR_PTR(-EBADF); - } xprt = xprt_alloc(args->net, sizeof(*new_xprt), RPCRDMA_MAX_BC_REQUESTS, RPCRDMA_MAX_BC_REQUESTS); - if (!xprt) { - dprintk("RPC: %s: couldn't allocate rpc_xprt\n", - __func__); + if (!xprt) return ERR_PTR(-ENOMEM); - } xprt->timeout = &xprt_rdma_bc_timeout; xprt_set_bound(xprt); xprt_set_connected(xprt); - xprt->bind_timeout = RPCRDMA_BIND_TO; - xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; - xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + xprt->bind_timeout = 0; + xprt->reestablish_timeout = 0; + xprt->idle_timeout = 0; xprt->prot = XPRT_TRANSPORT_BC_RDMA; xprt->ops = &xprt_rdma_bc_procs; diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c new file mode 100644 index 000000000000..b63cfeaa2923 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Oracle. All rights reserved. + */ + +#include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/rpc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +/** + * pcl_free - Release all memory associated with a parsed chunk list + * @pcl: parsed chunk list + * + */ +void pcl_free(struct svc_rdma_pcl *pcl) +{ + while (!list_empty(&pcl->cl_chunks)) { + struct svc_rdma_chunk *chunk; + + chunk = pcl_first_chunk(pcl); + list_del(&chunk->ch_list); + kfree(chunk); + } +} + +static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position) +{ + struct svc_rdma_chunk *chunk; + + chunk = kmalloc(struct_size(chunk, ch_segments, segcount), GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->ch_position = position; + chunk->ch_length = 0; + chunk->ch_payload_length = 0; + chunk->ch_segcount = 0; + return chunk; +} + +static struct svc_rdma_chunk * +pcl_lookup_position(struct svc_rdma_pcl *pcl, u32 position) +{ + struct svc_rdma_chunk *pos; + + pcl_for_each_chunk(pos, pcl) { + if (pos->ch_position == position) + return pos; + } + return NULL; +} + +static void pcl_insert_position(struct svc_rdma_pcl *pcl, + struct svc_rdma_chunk *chunk) +{ + struct svc_rdma_chunk *pos; + + pcl_for_each_chunk(pos, pcl) { + if (pos->ch_position > chunk->ch_position) + break; + } + __list_add(&chunk->ch_list, pos->ch_list.prev, &pos->ch_list); + pcl->cl_count++; +} + +static void pcl_set_read_segment(const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_chunk *chunk, + u32 handle, u32 length, u64 offset) +{ + struct svc_rdma_segment *segment; + + segment = &chunk->ch_segments[chunk->ch_segcount]; + segment->rs_handle = handle; + segment->rs_length = length; + segment->rs_offset = offset; + + trace_svcrdma_decode_rseg(&rctxt->rc_cid, chunk, segment); + + chunk->ch_length += length; + chunk->ch_segcount++; +} + +/** + * pcl_alloc_call - Construct a parsed chunk list for the Call body + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Assumptions: + * - The incoming Read list has already been sanity checked. + * - cl_count is already set to the number of segments in + * the un-decoded list. + * - The list might not be in order by position. + * + * Return values: + * %true: Parsed chunk list was successfully constructed, and + * cl_count is updated to be the number of chunks (ie. + * unique positions) in the Read list. + * %false: Memory allocation failed. + */ +bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + struct svc_rdma_pcl *pcl = &rctxt->rc_call_pcl; + unsigned int i, segcount = pcl->cl_count; + + pcl->cl_count = 0; + for (i = 0; i < segcount; i++) { + struct svc_rdma_chunk *chunk; + u32 position, handle, length; + u64 offset; + + p++; /* skip the list discriminator */ + p = xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position != 0) + continue; + + if (pcl_is_empty(pcl)) { + chunk = pcl_alloc_chunk(segcount, position); + if (!chunk) + return false; + pcl_insert_position(pcl, chunk); + } else { + chunk = list_first_entry(&pcl->cl_chunks, + struct svc_rdma_chunk, + ch_list); + } + + pcl_set_read_segment(rctxt, chunk, handle, length, offset); + } + + return true; +} + +/** + * pcl_alloc_read - Construct a parsed chunk list for normal Read chunks + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Assumptions: + * - The incoming Read list has already been sanity checked. + * - cl_count is already set to the number of segments in + * the un-decoded list. + * - The list might not be in order by position. + * + * Return values: + * %true: Parsed chunk list was successfully constructed, and + * cl_count is updated to be the number of chunks (ie. + * unique position values) in the Read list. + * %false: Memory allocation failed. + * + * TODO: + * - Check for chunk range overlaps + */ +bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + struct svc_rdma_pcl *pcl = &rctxt->rc_read_pcl; + unsigned int i, segcount = pcl->cl_count; + + pcl->cl_count = 0; + for (i = 0; i < segcount; i++) { + struct svc_rdma_chunk *chunk; + u32 position, handle, length; + u64 offset; + + p++; /* skip the list discriminator */ + p = xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position == 0) + continue; + + chunk = pcl_lookup_position(pcl, position); + if (!chunk) { + chunk = pcl_alloc_chunk(segcount, position); + if (!chunk) + return false; + pcl_insert_position(pcl, chunk); + } + + pcl_set_read_segment(rctxt, chunk, handle, length, offset); + } + + return true; +} + +/** + * pcl_alloc_write - Construct a parsed chunk list from a Write list + * @rctxt: Ingress receive context + * @pcl: Parsed chunk list to populate + * @p: Start of an un-decoded Write list + * + * Assumptions: + * - The incoming Write list has already been sanity checked, and + * - cl_count is set to the number of chunks in the un-decoded list. + * + * Return values: + * %true: Parsed chunk list was successfully constructed. + * %false: Memory allocation failed. + */ +bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_pcl *pcl, __be32 *p) +{ + struct svc_rdma_segment *segment; + struct svc_rdma_chunk *chunk; + unsigned int i, j; + u32 segcount; + + for (i = 0; i < pcl->cl_count; i++) { + p++; /* skip the list discriminator */ + segcount = be32_to_cpup(p++); + + chunk = pcl_alloc_chunk(segcount, 0); + if (!chunk) + return false; + list_add_tail(&chunk->ch_list, &pcl->cl_chunks); + + for (j = 0; j < segcount; j++) { + segment = &chunk->ch_segments[j]; + p = xdr_decode_rdma_segment(p, &segment->rs_handle, + &segment->rs_length, + &segment->rs_offset); + trace_svcrdma_decode_wseg(&rctxt->rc_cid, chunk, j); + + chunk->ch_length += segment->rs_length; + chunk->ch_segcount++; + } + } + return true; +} + +static int pcl_process_region(const struct xdr_buf *xdr, + unsigned int offset, unsigned int length, + int (*actor)(const struct xdr_buf *, void *), + void *data) +{ + struct xdr_buf subbuf; + + if (!length) + return 0; + if (xdr_buf_subsegment(xdr, &subbuf, offset, length)) + return -EMSGSIZE; + return actor(&subbuf, data); +} + +/** + * pcl_process_nonpayloads - Process non-payload regions inside @xdr + * @pcl: Chunk list to process + * @xdr: xdr_buf to process + * @actor: Function to invoke on each non-payload region + * @data: Arguments for @actor + * + * This mechanism must ignore not only result payloads that were already + * sent via RDMA Write, but also XDR padding for those payloads that + * the upper layer has added. + * + * Assumptions: + * The xdr->len and ch_position fields are aligned to 4-byte multiples. + * + * Returns: + * On success, zero, + * %-EMSGSIZE on XDR buffer overflow, or + * The return value of @actor + */ +int pcl_process_nonpayloads(const struct svc_rdma_pcl *pcl, + const struct xdr_buf *xdr, + int (*actor)(const struct xdr_buf *, void *), + void *data) +{ + struct svc_rdma_chunk *chunk, *next; + unsigned int start; + int ret; + + chunk = pcl_first_chunk(pcl); + + /* No result payloads were generated */ + if (!chunk || !chunk->ch_payload_length) + return actor(xdr, data); + + /* Process the region before the first result payload */ + ret = pcl_process_region(xdr, 0, chunk->ch_position, actor, data); + if (ret < 0) + return ret; + + /* Process the regions between each middle result payload */ + while ((next = pcl_next_chunk(pcl, chunk))) { + if (!next->ch_payload_length) + break; + + start = pcl_chunk_end_offset(chunk); + ret = pcl_process_region(xdr, start, next->ch_position - start, + actor, data); + if (ret < 0) + return ret; + + chunk = next; + } + + /* Process the region after the last result payload */ + start = pcl_chunk_end_offset(chunk); + ret = pcl_process_region(xdr, start, xdr->len - start, actor, data); + if (ret < 0) + return ret; + + return 0; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 96bccd398469..5242ad121450 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -89,10 +89,10 @@ * svc_rdma_recvfrom call returns. * * During the second svc_rdma_recvfrom call, RDMA Read sink pages - * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst - * (see rdma_read_complete() below). + * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst. */ +#include <linux/slab.h> #include <linux/spinlock.h> #include <asm/unaligned.h> #include <rdma/ib_verbs.h> @@ -106,8 +106,6 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc); static inline struct svc_rdma_recv_ctxt * @@ -117,6 +115,13 @@ svc_rdma_next_recv_ctxt(struct list_head *list) rc_list); } +static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_rq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { @@ -135,6 +140,12 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) goto fail2; + svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid); + pcl_init(&ctxt->rc_call_pcl); + pcl_init(&ctxt->rc_read_pcl); + pcl_init(&ctxt->rc_write_pcl); + pcl_init(&ctxt->rc_reply_pcl); + ctxt->rc_recv_wr.next = NULL; ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge; @@ -180,8 +191,13 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) } } -static struct svc_rdma_recv_ctxt * -svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) +/** + * svc_rdma_recv_ctxt_get - Allocate a recv_ctxt + * @rdma: controlling svcxprt_rdma + * + * Returns a recv_ctxt or (rarely) NULL if none are available. + */ +struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; struct llist_node *node; @@ -211,10 +227,10 @@ out_empty: void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { - unsigned int i; - - for (i = 0; i < ctxt->rc_page_count; i++) - put_page(ctxt->rc_pages[i]); + pcl_free(&ctxt->rc_call_pcl); + pcl_free(&ctxt->rc_read_pcl); + pcl_free(&ctxt->rc_write_pcl); + pcl_free(&ctxt->rc_reply_pcl); if (!ctxt->rc_temp) llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); @@ -222,32 +238,68 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, svc_rdma_recv_ctxt_destroy(rdma, ctxt); } -static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, - struct svc_rdma_recv_ctxt *ctxt) +/** + * svc_rdma_release_rqst - Release transport-specific per-rqst resources + * @rqstp: svc_rqst being released + * + * Ensure that the recv_ctxt is released whether or not a Reply + * was sent. For example, the client could close the connection, + * or svc_process could drop an RPC, before the Reply is sent. + */ +void svc_rdma_release_rqst(struct svc_rqst *rqstp) { - int ret; - - svc_xprt_get(&rdma->sc_xprt); - ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); - trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret); - if (ret) - goto err_post; - return 0; + struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt; + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); -err_post: - svc_rdma_recv_ctxt_put(rdma, ctxt); - svc_xprt_put(&rdma->sc_xprt); - return ret; + rqstp->rq_xprt_ctxt = NULL; + if (ctxt) + svc_rdma_recv_ctxt_put(rdma, ctxt); } -static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) +static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, + unsigned int wanted, bool temp) { + const struct ib_recv_wr *bad_wr = NULL; struct svc_rdma_recv_ctxt *ctxt; + struct ib_recv_wr *recv_chain; + int ret; - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - return -ENOMEM; - return __svc_rdma_post_recv(rdma, ctxt); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + return false; + + recv_chain = NULL; + while (wanted--) { + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + break; + + trace_svcrdma_post_recv(ctxt); + ctxt->rc_temp = temp; + ctxt->rc_recv_wr.next = recv_chain; + recv_chain = &ctxt->rc_recv_wr; + rdma->sc_pending_recvs++; + } + if (!recv_chain) + return false; + + ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr); + if (ret) + goto err_free; + return true; + +err_free: + trace_svcrdma_rq_post_err(rdma, ret); + while (bad_wr) { + ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt, + rc_recv_wr); + bad_wr = bad_wr->next; + svc_rdma_recv_ctxt_put(rdma, ctxt); + } + /* Since we're destroying the xprt, no need to reset + * sc_pending_recvs. */ + return false; } /** @@ -258,20 +310,7 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { - struct svc_rdma_recv_ctxt *ctxt; - unsigned int i; - int ret; - - for (i = 0; i < rdma->sc_max_requests; i++) { - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - return false; - ctxt->rc_temp = true; - ret = __svc_rdma_post_recv(rdma, ctxt); - if (ret) - return false; - } - return true; + return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true); } /** @@ -279,8 +318,6 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) * @cq: Completion Queue context * @wc: Work Completion object * - * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that - * the Receive completion handler could be running. */ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { @@ -288,22 +325,30 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_recv_ctxt *ctxt; - trace_svcrdma_wc_receive(wc); + rdma->sc_pending_recvs--; /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); if (wc->status != IB_WC_SUCCESS) goto flushed; - - if (svc_rdma_post_recv(rdma)) - goto post_err; + trace_svcrdma_wc_recv(wc, &ctxt->rc_cid); + + /* If receive posting fails, the connection is about to be + * lost anyway. The server will not be able to send a reply + * for this RPC, and the client will retransmit this RPC + * anyway when it reconnects. + * + * Therefore we drop the Receive, even if status was SUCCESS + * to reduce the likelihood of replayed requests once the + * client reconnects. + */ + if (rdma->sc_pending_recvs < rdma->sc_max_requests) + if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false)) + goto dropped; /* All wc fields are now known to be valid */ ctxt->rc_byte_len = wc->byte_len; - ib_dma_sync_single_for_cpu(rdma->sc_pd->device, - ctxt->rc_recv_sge.addr, - wc->byte_len, DMA_FROM_DEVICE); spin_lock(&rdma->sc_rq_dto_lock); list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q); @@ -312,15 +357,16 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) spin_unlock(&rdma->sc_rq_dto_lock); if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags)) svc_xprt_enqueue(&rdma->sc_xprt); - goto out; + return; flushed: -post_err: + if (wc->status == IB_WC_WR_FLUSH_ERR) + trace_svcrdma_wc_recv_flush(wc, &ctxt->rc_cid); + else + trace_svcrdma_wc_recv_err(wc, &ctxt->rc_cid); +dropped: svc_rdma_recv_ctxt_put(rdma, ctxt); - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_xprt_enqueue(&rdma->sc_xprt); -out: - svc_xprt_put(&rdma->sc_xprt); + svc_xprt_deferred_close(&rdma->sc_xprt); } /** @@ -332,10 +378,6 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; - while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) { - list_del(&ctxt->rc_list); - svc_rdma_recv_ctxt_put(rdma, ctxt); - } while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) { list_del(&ctxt->rc_list); svc_rdma_recv_ctxt_put(rdma, ctxt); @@ -357,122 +399,183 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, arg->len = ctxt->rc_byte_len; } -/* This accommodates the largest possible Write chunk, - * in one segment. +/** + * xdr_count_read_segments - Count number of Read segments in Read list + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Before allocating anything, ensure the ingress Read list is safe + * to use. + * + * The segment count is limited to how many segments can fit in the + * transport header without overflowing the buffer. That's about 40 + * Read segments for a 1KB inline threshold. + * + * Return values: + * %true: Read list is valid. @rctxt's xdr_stream is updated to point + * to the first byte past the Read list. rc_read_pcl and + * rc_call_pcl cl_count fields are set to the number of + * Read segments in the list. + * %false: Read list is corrupt. @rctxt's xdr_stream is left in an + * unknown state. */ -#define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) +static bool xdr_count_read_segments(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + rctxt->rc_call_pcl.cl_count = 0; + rctxt->rc_read_pcl.cl_count = 0; + while (xdr_item_is_present(p)) { + u32 position, handle, length; + u64 offset; + + p = xdr_inline_decode(&rctxt->rc_stream, + rpcrdma_readseg_maxsz * sizeof(*p)); + if (!p) + return false; -/* This accommodates the largest possible Position-Zero - * Read chunk or Reply chunk, in one segment. - */ -#define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) + xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position) { + if (position & 3) + return false; + ++rctxt->rc_read_pcl.cl_count; + } else { + ++rctxt->rc_call_pcl.cl_count; + } + + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + } + return true; +} /* Sanity check the Read list. * - * Implementation limits: - * - This implementation supports only one Read chunk. - * * Sanity checks: - * - Read list does not overflow buffer. - * - Segment size limited by largest NFS data payload. - * - * The segment count is limited to how many segments can - * fit in the transport header without overflowing the - * buffer. That's about 40 Read segments for a 1KB inline - * threshold. - * - * Returns pointer to the following Write list. + * - Read list does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Read list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Read list. + * %false: Read list is corrupt. @rctxt's xdr_stream is left + * in an unknown state. */ -static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) +static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 position; - bool first; - - first = true; - while (*p++ != xdr_zero) { - if (first) { - position = be32_to_cpup(p++); - first = false; - } else if (be32_to_cpup(p++) != position) { - return NULL; - } - p++; /* handle */ - if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG) - return NULL; - p += 2; /* offset */ + __be32 *p; - if (p > end) - return NULL; - } - return p; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + if (!xdr_count_read_segments(rctxt, p)) + return false; + if (!pcl_alloc_call(rctxt, p)) + return false; + return pcl_alloc_read(rctxt, p); } -/* The segment count is limited to how many segments can - * fit in the transport header without overflowing the - * buffer. That's about 60 Write segments for a 1KB inline - * threshold. - */ -static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end, - u32 maxlen) +static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt) { - u32 i, segcount; + u32 segcount; + __be32 *p; - segcount = be32_to_cpup(p++); - for (i = 0; i < segcount; i++) { - p++; /* handle */ - if (be32_to_cpup(p++) > maxlen) - return NULL; - p += 2; /* offset */ + if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount)) + return false; - if (p > end) - return NULL; - } + /* A bogus segcount causes this buffer overflow check to fail. */ + p = xdr_inline_decode(&rctxt->rc_stream, + segcount * rpcrdma_segment_maxsz * sizeof(*p)); + return p != NULL; +} - return p; +/** + * xdr_count_write_chunks - Count number of Write chunks in Write list + * @rctxt: Received header and decoding state + * @p: start of an un-decoded Write list + * + * Before allocating anything, ensure the ingress Write list is + * safe to use. + * + * Return values: + * %true: Write list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Write list, and + * the number of Write chunks is in rc_write_pcl.cl_count. + * %false: Write list is corrupt. @rctxt's xdr_stream is left + * in an indeterminate state. + */ +static bool xdr_count_write_chunks(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + rctxt->rc_write_pcl.cl_count = 0; + while (xdr_item_is_present(p)) { + if (!xdr_check_write_chunk(rctxt)) + return false; + ++rctxt->rc_write_pcl.cl_count; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + } + return true; } /* Sanity check the Write list. * * Implementation limits: - * - This implementation supports only one Write chunk. + * - This implementation currently supports only one Write chunk. * * Sanity checks: - * - Write list does not overflow buffer. - * - Segment size limited by largest NFS data payload. - * - * Returns pointer to the following Reply chunk. + * - Write list does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Write list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Write list. + * %false: Write list is corrupt. @rctxt's xdr_stream is left + * in an unknown state. */ -static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end) +static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 chcount; + __be32 *p; - chcount = 0; - while (*p++ != xdr_zero) { - p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG); - if (!p) - return NULL; - if (chcount++ > 1) - return NULL; - } - return p; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + if (!xdr_count_write_chunks(rctxt, p)) + return false; + if (!pcl_alloc_write(rctxt, &rctxt->rc_write_pcl, p)) + return false; + + rctxt->rc_cur_result_payload = pcl_first_chunk(&rctxt->rc_write_pcl); + return true; } /* Sanity check the Reply chunk. * * Sanity checks: - * - Reply chunk does not overflow buffer. - * - Segment size limited by largest NFS data payload. - * - * Returns pointer to the following RPC header. + * - Reply chunk does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Reply chunk is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Reply chunk. + * %false: Reply chunk is corrupt. @rctxt's xdr_stream is left + * in an unknown state. */ -static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) +static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt) { - if (*p++ != xdr_zero) { - p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG); - if (!p) - return NULL; - } - return p; + __be32 *p; + + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + + if (!xdr_item_is_present(p)) + return true; + if (!xdr_check_write_chunk(rctxt)) + return false; + + rctxt->rc_reply_pcl.cl_count = 1; + return pcl_alloc_write(rctxt, &rctxt->rc_reply_pcl, p); } /* RPC-over-RDMA Version One private extension: Remote Invalidation. @@ -481,206 +584,147 @@ static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) * * If there is exactly one distinct R_key in the received transport * header, set rc_inv_rkey to that R_key. Otherwise, set it to zero. - * - * Perform this operation while the received transport header is - * still in the CPU cache. */ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { - __be32 inv_rkey, *p; - u32 i, segcount; + struct svc_rdma_segment *segment; + struct svc_rdma_chunk *chunk; + u32 inv_rkey; ctxt->rc_inv_rkey = 0; if (!rdma->sc_snd_w_inv) return; - inv_rkey = xdr_zero; - p = ctxt->rc_recv_buf; - p += rpcrdma_fixed_maxsz; - - /* Read list */ - while (*p++ != xdr_zero) { - p++; /* position */ - if (inv_rkey == xdr_zero) - inv_rkey = *p; - else if (inv_rkey != *p) - return; - p += 4; + inv_rkey = 0; + pcl_for_each_chunk(chunk, &ctxt->rc_call_pcl) { + pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) + return; + } } - - /* Write list */ - while (*p++ != xdr_zero) { - segcount = be32_to_cpup(p++); - for (i = 0; i < segcount; i++) { - if (inv_rkey == xdr_zero) - inv_rkey = *p; - else if (inv_rkey != *p) + pcl_for_each_chunk(chunk, &ctxt->rc_read_pcl) { + pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) return; - p += 4; } } - - /* Reply chunk */ - if (*p++ != xdr_zero) { - segcount = be32_to_cpup(p++); - for (i = 0; i < segcount; i++) { - if (inv_rkey == xdr_zero) - inv_rkey = *p; - else if (inv_rkey != *p) + pcl_for_each_chunk(chunk, &ctxt->rc_write_pcl) { + pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) return; - p += 4; } } - - ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey); + pcl_for_each_chunk(chunk, &ctxt->rc_reply_pcl) { + pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) + return; + } + } + ctxt->rc_inv_rkey = inv_rkey; } -/* On entry, xdr->head[0].iov_base points to first byte in the - * RPC-over-RDMA header. +/** + * svc_rdma_xdr_decode_req - Decode the transport header + * @rq_arg: xdr_buf containing ingress RPC/RDMA message + * @rctxt: state of decoding + * + * On entry, xdr->head[0].iov_base points to first byte of the + * RPC-over-RDMA transport header. * * On successful exit, head[0] points to first byte past the * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. + * * The length of the RPC-over-RDMA header is returned. * * Assumptions: * - The transport header is entirely contained in the head iovec. */ -static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) +static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg, + struct svc_rdma_recv_ctxt *rctxt) { - __be32 *p, *end, *rdma_argp; + __be32 *p, *rdma_argp; unsigned int hdr_len; - /* Verify that there's enough bytes for header + something */ - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) - goto out_short; - rdma_argp = rq_arg->head[0].iov_base; - if (*(rdma_argp + 1) != rpcrdma_version) - goto out_version; + xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp, NULL); - switch (*(rdma_argp + 3)) { + p = xdr_inline_decode(&rctxt->rc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); + if (unlikely(!p)) + goto out_short; + p++; + if (*p != rpcrdma_version) + goto out_version; + p += 2; + rctxt->rc_msgtype = *p; + switch (rctxt->rc_msgtype) { case rdma_msg: break; case rdma_nomsg: break; - case rdma_done: goto out_drop; - case rdma_error: goto out_drop; - default: goto out_proc; } - end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); - p = xdr_check_read_list(rdma_argp + 4, end); - if (!p) + if (!xdr_check_read_list(rctxt)) goto out_inval; - p = xdr_check_write_list(p, end); - if (!p) - goto out_inval; - p = xdr_check_reply_chunk(p, end); - if (!p) + if (!xdr_check_write_list(rctxt)) goto out_inval; - if (p > end) + if (!xdr_check_reply_chunk(rctxt)) goto out_inval; - rq_arg->head[0].iov_base = p; - hdr_len = (unsigned long)p - (unsigned long)rdma_argp; + rq_arg->head[0].iov_base = rctxt->rc_stream.p; + hdr_len = xdr_stream_pos(&rctxt->rc_stream); rq_arg->head[0].iov_len -= hdr_len; rq_arg->len -= hdr_len; - trace_svcrdma_decode_rqst(rdma_argp, hdr_len); + trace_svcrdma_decode_rqst(rctxt, rdma_argp, hdr_len); return hdr_len; out_short: - trace_svcrdma_decode_short(rq_arg->len); + trace_svcrdma_decode_short_err(rctxt, rq_arg->len); return -EINVAL; out_version: - trace_svcrdma_decode_badvers(rdma_argp); + trace_svcrdma_decode_badvers_err(rctxt, rdma_argp); return -EPROTONOSUPPORT; out_drop: - trace_svcrdma_decode_drop(rdma_argp); + trace_svcrdma_decode_drop_err(rctxt, rdma_argp); return 0; out_proc: - trace_svcrdma_decode_badproc(rdma_argp); + trace_svcrdma_decode_badproc_err(rctxt, rdma_argp); return -EINVAL; out_inval: - trace_svcrdma_decode_parse(rdma_argp); + trace_svcrdma_decode_parse_err(rctxt, rdma_argp); return -EINVAL; } -static void rdma_read_complete(struct svc_rqst *rqstp, - struct svc_rdma_recv_ctxt *head) -{ - int page_no; - - /* Move Read chunk pages to rqstp so that they will be released - * when svc_process is done with them. - */ - for (page_no = 0; page_no < head->rc_page_count; page_no++) { - put_page(rqstp->rq_pages[page_no]); - rqstp->rq_pages[page_no] = head->rc_pages[page_no]; - } - head->rc_page_count = 0; - - /* Point rq_arg.pages past header */ - rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count]; - rqstp->rq_arg.page_len = head->rc_arg.page_len; - - /* rq_respages starts after the last arg page */ - rqstp->rq_respages = &rqstp->rq_pages[page_no]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* Rebuild rq_arg head and tail. */ - rqstp->rq_arg.head[0] = head->rc_arg.head[0]; - rqstp->rq_arg.tail[0] = head->rc_arg.tail[0]; - rqstp->rq_arg.len = head->rc_arg.len; - rqstp->rq_arg.buflen = head->rc_arg.buflen; -} - -static void svc_rdma_send_error(struct svcxprt_rdma *xprt, - __be32 *rdma_argp, int status) +static void svc_rdma_send_error(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *rctxt, + int status) { - struct svc_rdma_send_ctxt *ctxt; - unsigned int length; - __be32 *p; - int ret; + struct svc_rdma_send_ctxt *sctxt; - ctxt = svc_rdma_send_ctxt_get(xprt); - if (!ctxt) + sctxt = svc_rdma_send_ctxt_get(rdma); + if (!sctxt) return; - - p = ctxt->sc_xprt_buf; - *p++ = *rdma_argp; - *p++ = *(rdma_argp + 1); - *p++ = xprt->sc_fc_credits; - *p++ = rdma_error; - switch (status) { - case -EPROTONOSUPPORT: - *p++ = err_vers; - *p++ = rpcrdma_version; - *p++ = rpcrdma_version; - trace_svcrdma_err_vers(*rdma_argp); - break; - default: - *p++ = err_chunk; - trace_svcrdma_err_chunk(*rdma_argp); - } - length = (unsigned long)p - (unsigned long)ctxt->sc_xprt_buf; - svc_rdma_sync_reply_hdr(xprt, ctxt, length); - - ctxt->sc_send_wr.opcode = IB_WR_SEND; - ret = svc_rdma_send(xprt, &ctxt->sc_send_wr); - if (ret) - svc_rdma_send_ctxt_put(xprt, ctxt); + svc_rdma_send_error_msg(rdma, sctxt, rctxt, status); } /* By convention, backchannel calls arrive via rdma_msg type @@ -688,30 +732,28 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field. */ -static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, - __be32 *rdma_resp) +static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, + struct svc_rdma_recv_ctxt *rctxt) { - __be32 *p; + __be32 *p = rctxt->rc_recv_buf; if (!xprt->xpt_bc_xprt) return false; - p = rdma_resp + 3; - if (*p++ != rdma_msg) + if (rctxt->rc_msgtype != rdma_msg) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_call_pcl)) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_read_pcl)) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_write_pcl)) return false; - - /* XID sanity */ - if (*p++ != *rdma_resp) + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) return false; - /* call direction */ - if (*p == cpu_to_be32(RPC_CALL)) + + /* RPC call direction */ + if (*(p + 8) == cpu_to_be32(RPC_CALL)) return false; return true; @@ -753,29 +795,29 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svcxprt_rdma *rdma_xprt = container_of(xprt, struct svcxprt_rdma, sc_xprt); struct svc_rdma_recv_ctxt *ctxt; - __be32 *p; int ret; + rqstp->rq_xprt_ctxt = NULL; + + ctxt = NULL; spin_lock(&rdma_xprt->sc_rq_dto_lock); - ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q); - if (ctxt) { - list_del(&ctxt->rc_list); - spin_unlock(&rdma_xprt->sc_rq_dto_lock); - rdma_read_complete(rqstp, ctxt); - goto complete; - } ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q); - if (!ctxt) { + if (ctxt) + list_del(&ctxt->rc_list); + else /* No new incoming requests, terminate the loop */ clear_bit(XPT_DATA, &xprt->xpt_flags); - spin_unlock(&rdma_xprt->sc_rq_dto_lock); - return 0; - } - list_del(&ctxt->rc_list); spin_unlock(&rdma_xprt->sc_rq_dto_lock); - atomic_inc(&rdma_stat_recv); + /* Unblock the transport for the next receive */ + svc_xprt_received(xprt); + if (!ctxt) + return 0; + percpu_counter_inc(&svcrdma_stat_recv); + ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device, + ctxt->rc_recv_sge.addr, ctxt->rc_byte_len, + DMA_FROM_DEVICE); svc_rdma_build_arg_xdr(rqstp, ctxt); /* Prevent svc_xprt_release from releasing pages in rq_pages @@ -784,49 +826,42 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_next_page = rqstp->rq_respages; - p = (__be32 *)rqstp->rq_arg.head[0].iov_base; - ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); + ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt); if (ret < 0) goto out_err; if (ret == 0) goto out_drop; - rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, p)) { - ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, - &rqstp->rq_arg); - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); - return ret; - } + if (svc_rdma_is_reverse_direction_reply(xprt, ctxt)) + goto out_backchannel; + svc_rdma_get_inv_rkey(rdma_xprt, ctxt); - p += rpcrdma_fixed_maxsz; - if (*p != xdr_zero) - goto out_readchunk; + if (!pcl_is_empty(&ctxt->rc_read_pcl) || + !pcl_is_empty(&ctxt->rc_call_pcl)) { + ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt); + if (ret < 0) + goto out_readfail; + } -complete: rqstp->rq_xprt_ctxt = ctxt; rqstp->rq_prot = IPPROTO_MAX; svc_xprt_copy_addrs(rqstp, xprt); return rqstp->rq_arg.len; -out_readchunk: - ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p); - if (ret < 0) - goto out_postfail; - return 0; - out_err: - svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_send_error(rdma_xprt, ctxt, ret); svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; -out_postfail: +out_readfail: if (ret == -EINVAL) - svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_send_error(rdma_xprt, ctxt, ret); svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; +out_backchannel: + svc_rdma_handle_bc_reply(rqstp, ctxt); out_drop: svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 48fe3b16b0d9..11cf7c646644 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -7,15 +7,13 @@ #include <rdma/rw.h> +#include <linux/sunrpc/xdr.h> #include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> -#include <linux/sunrpc/debug.h> #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc); static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); @@ -37,11 +35,12 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); * controlling svcxprt_rdma is destroyed. */ struct svc_rdma_rw_ctxt { + struct llist_node rw_node; struct list_head rw_list; struct rdma_rw_ctx rw_ctx; - int rw_nents; + unsigned int rw_nents; struct sg_table rw_sg_table; - struct scatterlist rw_first_sgl[0]; + struct scatterlist rw_first_sgl[]; }; static inline struct svc_rdma_rw_ctxt * @@ -55,41 +54,48 @@ static struct svc_rdma_rw_ctxt * svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_rw_ctxt_lock); - - ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); - if (ctxt) { - list_del(&ctxt->rw_list); - spin_unlock(&rdma->sc_rw_ctxt_lock); + node = llist_del_first(&rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); + if (node) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - spin_unlock(&rdma->sc_rw_ctxt_lock); ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), GFP_KERNEL); if (!ctxt) - goto out; + goto out_noctx; + INIT_LIST_HEAD(&ctxt->rw_list); } ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, ctxt->rw_sg_table.sgl, - SG_CHUNK_SIZE)) { - kfree(ctxt); - ctxt = NULL; - } -out: + SG_CHUNK_SIZE)) + goto out_free; return ctxt; + +out_free: + kfree(ctxt); +out_noctx: + trace_svcrdma_no_rwctx_err(rdma, sges); + return NULL; } -static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, - struct svc_rdma_rw_ctxt *ctxt) +static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt, + struct llist_head *list) { sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); + llist_add(&ctxt->rw_node, list); +} - spin_lock(&rdma->sc_rw_ctxt_lock); - list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); - spin_unlock(&rdma->sc_rw_ctxt_lock); +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt) +{ + __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts); } /** @@ -100,15 +106,44 @@ static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { - list_del(&ctxt->rw_list); + while ((node = llist_del_first(&rdma->sc_rw_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); kfree(ctxt); } } +/** + * svc_rdma_rw_ctx_init - Prepare a R/W context for I/O + * @rdma: controlling transport instance + * @ctxt: R/W context to prepare + * @offset: RDMA offset + * @handle: RDMA tag/handle + * @direction: I/O direction + * + * Returns on success, the number of WQEs that will be needed + * on the workqueue, or a negative errno. + */ +static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt, + u64 offset, u32 handle, + enum dma_data_direction direction) +{ + int ret; + + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, + ctxt->rw_sg_table.sgl, ctxt->rw_nents, + 0, offset, handle, direction); + if (unlikely(ret < 0)) { + svc_rdma_put_rw_ctxt(rdma, ctxt); + trace_svcrdma_dma_map_rw_err(rdma, ctxt->rw_nents, ret); + } + return ret; +} + /* A chunk context tracks all I/O for moving one Read or Write - * chunk. This is a a set of rdma_rw's that handle data movement + * chunk. This is a set of rdma_rw's that handle data movement * for all segments of one chunk. * * These are small, acquired with a single allocator call, and @@ -116,37 +151,62 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) * demand, and not cached. */ struct svc_rdma_chunk_ctxt { + struct rpc_rdma_cid cc_cid; struct ib_cqe cc_cqe; struct svcxprt_rdma *cc_rdma; struct list_head cc_rwctxts; + ktime_t cc_posttime; int cc_sqecount; + enum ib_wc_status cc_status; + struct completion cc_done; }; +static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_sq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc) { + svc_rdma_cc_cid_init(rdma, &cc->cc_cid); cc->cc_rdma = rdma; - svc_xprt_get(&rdma->sc_xprt); INIT_LIST_HEAD(&cc->cc_rwctxts); cc->cc_sqecount = 0; } +/* + * The consumed rw_ctx's are cleaned and placed on a local llist so + * that only one atomic llist operation is needed to put them all + * back on the free list. + */ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir) { struct svcxprt_rdma *rdma = cc->cc_rdma; + struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; + LLIST_HEAD(free); + first = last = NULL; while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, ctxt->rw_sg_table.sgl, ctxt->rw_nents, dir); - svc_rdma_put_rw_ctxt(rdma, ctxt); + __svc_rdma_put_rw_ctxt(rdma, ctxt, &free); + + ctxt->rw_node.next = first; + first = &ctxt->rw_node; + if (!last) + last = first; } - svc_xprt_put(&rdma->sc_xprt); + if (first) + llist_add_batch(first, last, &rdma->sc_rw_ctxts); } /* State for sending a Write or Reply chunk. @@ -154,14 +214,14 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, * - Stores arguments for the SGL constructor functions */ struct svc_rdma_write_info { + const struct svc_rdma_chunk *wi_chunk; + /* write state of this chunk */ unsigned int wi_seg_off; unsigned int wi_seg_no; - unsigned int wi_nsegs; - __be32 *wi_segs; /* SGL constructor arguments */ - struct xdr_buf *wi_xdr; + const struct xdr_buf *wi_xdr; unsigned char *wi_base; unsigned int wi_next_off; @@ -169,7 +229,8 @@ struct svc_rdma_write_info { }; static struct svc_rdma_write_info * -svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) +svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk) { struct svc_rdma_write_info *info; @@ -177,10 +238,9 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) if (!info) return info; + info->wi_chunk = chunk; info->wi_seg_off = 0; info->wi_seg_no = 0; - info->wi_nsegs = be32_to_cpup(++chunk); - info->wi_segs = ++chunk; svc_rdma_cc_init(rdma, &info->wi_cc); info->wi_cc.cc_cqe.done = svc_rdma_write_done; return info; @@ -208,13 +268,21 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_write_info *info = container_of(cc, struct svc_rdma_write_info, wi_cc); - trace_svcrdma_wc_write(wc); + switch (wc->status) { + case IB_WC_SUCCESS: + trace_svcrdma_wc_write(wc, &cc->cc_cid); + break; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); + break; + default: + trace_svcrdma_wc_write_err(wc, &cc->cc_cid); + } - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); if (unlikely(wc->status != IB_WC_SUCCESS)) - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_xprt_deferred_close(&rdma->sc_xprt); svc_rdma_write_info_free(info); } @@ -222,11 +290,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) /* State for pulling a Read chunk. */ struct svc_rdma_read_info { + struct svc_rqst *ri_rqst; struct svc_rdma_recv_ctxt *ri_readctxt; - unsigned int ri_position; unsigned int ri_pageno; unsigned int ri_pageoff; - unsigned int ri_chunklen; + unsigned int ri_totalbytes; struct svc_rdma_chunk_ctxt ri_cc; }; @@ -262,30 +330,25 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svcxprt_rdma *rdma = cc->cc_rdma; - struct svc_rdma_read_info *info = - container_of(cc, struct svc_rdma_read_info, ri_cc); - - trace_svcrdma_wc_read(wc); - - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); + struct svc_rdma_read_info *info; - if (unlikely(wc->status != IB_WC_SUCCESS)) { - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt); - } else { - spin_lock(&rdma->sc_rq_dto_lock); - list_add_tail(&info->ri_readctxt->rc_list, - &rdma->sc_read_complete_q); - /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */ - set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); - spin_unlock(&rdma->sc_rq_dto_lock); - - svc_xprt_enqueue(&rdma->sc_xprt); + switch (wc->status) { + case IB_WC_SUCCESS: + info = container_of(cc, struct svc_rdma_read_info, ri_cc); + trace_svcrdma_wc_read(wc, &cc->cc_cid, info->ri_totalbytes, + cc->cc_posttime); + break; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_read_flush(wc, &cc->cc_cid); + break; + default: + trace_svcrdma_wc_read_err(wc, &cc->cc_cid); } - svc_rdma_read_info_free(info); + svc_rdma_wake_send_waiters(cc->cc_rdma, cc->cc_sqecount); + cc->cc_status = wc->status; + complete(&cc->cc_done); + return; } /* This function sleeps when the transport's Send Queue is congested. @@ -298,7 +361,6 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) { struct svcxprt_rdma *rdma = cc->cc_rdma; - struct svc_xprt *xprt = &rdma->sc_xprt; struct ib_send_wr *first_wr; const struct ib_send_wr *bad_wr; struct list_head *tmp; @@ -322,14 +384,14 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) do { if (atomic_sub_return(cc->cc_sqecount, &rdma->sc_sq_avail) > 0) { + cc->cc_posttime = ktime_get(); ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); - trace_svcrdma_post_rw(&cc->cc_cqe, - cc->cc_sqecount, ret); if (ret) break; return 0; } + percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, @@ -337,7 +399,8 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) trace_svcrdma_sq_retry(rdma); } while (1); - set_bit(XPT_CLOSE, &xprt->xpt_flags); + trace_svcrdma_sq_post_err(rdma, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); /* If even one was posted, there will be a completion. */ if (bad_wr != first_wr) @@ -369,7 +432,7 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, struct svc_rdma_rw_ctxt *ctxt) { unsigned int sge_no, sge_bytes, page_off, page_no; - struct xdr_buf *xdr = info->wi_xdr; + const struct xdr_buf *xdr = info->wi_xdr; struct scatterlist *sg; struct page **page; @@ -407,43 +470,37 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, { struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; struct svcxprt_rdma *rdma = cc->cc_rdma; + const struct svc_rdma_segment *seg; struct svc_rdma_rw_ctxt *ctxt; - __be32 *seg; int ret; - seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; do { unsigned int write_len; - u32 seg_length, seg_handle; - u64 seg_offset; + u64 offset; - if (info->wi_seg_no >= info->wi_nsegs) + if (info->wi_seg_no >= info->wi_chunk->ch_segcount) goto out_overflow; - seg_handle = be32_to_cpup(seg); - seg_length = be32_to_cpup(seg + 1); - xdr_decode_hyper(seg + 2, &seg_offset); - seg_offset += info->wi_seg_off; - - write_len = min(remaining, seg_length - info->wi_seg_off); + seg = &info->wi_chunk->ch_segments[info->wi_seg_no]; + write_len = min(remaining, seg->rs_length - info->wi_seg_off); + if (!write_len) + goto out_overflow; ctxt = svc_rdma_get_rw_ctxt(rdma, (write_len >> PAGE_SHIFT) + 2); if (!ctxt) - goto out_noctx; + return -ENOMEM; constructor(info, write_len, ctxt); - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, ctxt->rw_sg_table.sgl, - ctxt->rw_nents, 0, seg_offset, - seg_handle, DMA_TO_DEVICE); + offset = seg->rs_offset + info->wi_seg_off; + ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle, + DMA_TO_DEVICE); if (ret < 0) - goto out_initerr; + return -EIO; + percpu_counter_inc(&svcrdma_stat_write); - trace_svcrdma_encode_wseg(seg_handle, write_len, seg_offset); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; - if (write_len == seg_length - info->wi_seg_off) { - seg += 4; + if (write_len == seg->rs_length - info->wi_seg_off) { info->wi_seg_no++; info->wi_seg_off = 0; } else { @@ -455,51 +512,96 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, return 0; out_overflow: - dprintk("svcrdma: inadequate space in Write chunk (%u)\n", - info->wi_nsegs); + trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no, + info->wi_chunk->ch_segcount); return -E2BIG; - -out_noctx: - dprintk("svcrdma: no R/W ctxs available\n"); - return -ENOMEM; - -out_initerr: - svc_rdma_put_rw_ctxt(rdma, ctxt); - trace_svcrdma_dma_map_rwctx(rdma, ret); - return -EIO; } -/* Send one of an xdr_buf's kvecs by itself. To send a Reply - * chunk, the whole RPC Reply is written back to the client. - * This function writes either the head or tail of the xdr_buf - * containing the Reply. +/** + * svc_rdma_iov_write - Construct RDMA Writes from an iov + * @info: pointer to write arguments + * @iov: kvec to write + * + * Returns: + * On success, returns zero + * %-E2BIG if the client-provided Write chunk is too small + * %-ENOMEM if a resource has been exhausted + * %-EIO if an rdma-rw error occurred */ -static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info, - struct kvec *vec) +static int svc_rdma_iov_write(struct svc_rdma_write_info *info, + const struct kvec *iov) { - info->wi_base = vec->iov_base; + info->wi_base = iov->iov_base; return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, - vec->iov_len); + iov->iov_len); } -/* Send an xdr_buf's page list by itself. A Write chunk is - * just the page list. a Reply chunk is the head, page list, - * and tail. This function is shared between the two types - * of chunk. +/** + * svc_rdma_pages_write - Construct RDMA Writes from pages + * @info: pointer to write arguments + * @xdr: xdr_buf with pages to write + * @offset: offset into the content of @xdr + * @length: number of bytes to write + * + * Returns: + * On success, returns zero + * %-E2BIG if the client-provided Write chunk is too small + * %-ENOMEM if a resource has been exhausted + * %-EIO if an rdma-rw error occurred */ -static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, - struct xdr_buf *xdr) +static int svc_rdma_pages_write(struct svc_rdma_write_info *info, + const struct xdr_buf *xdr, + unsigned int offset, + unsigned long length) { info->wi_xdr = xdr; - info->wi_next_off = 0; + info->wi_next_off = offset - xdr->head[0].iov_len; return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, - xdr->page_len); + length); +} + +/** + * svc_rdma_xb_write - Construct RDMA Writes to write an xdr_buf + * @xdr: xdr_buf to write + * @data: pointer to write arguments + * + * Returns: + * On success, returns zero + * %-E2BIG if the client-provided Write chunk is too small + * %-ENOMEM if a resource has been exhausted + * %-EIO if an rdma-rw error occurred + */ +static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) +{ + struct svc_rdma_write_info *info = data; + int ret; + + if (xdr->head[0].iov_len) { + ret = svc_rdma_iov_write(info, &xdr->head[0]); + if (ret < 0) + return ret; + } + + if (xdr->page_len) { + ret = svc_rdma_pages_write(info, xdr, xdr->head[0].iov_len, + xdr->page_len); + if (ret < 0) + return ret; + } + + if (xdr->tail[0].iov_len) { + ret = svc_rdma_iov_write(info, &xdr->tail[0]); + if (ret < 0) + return ret; + } + + return xdr->len; } /** * svc_rdma_send_write_chunk - Write all segments in a Write chunk * @rdma: controlling RDMA transport - * @wr_ch: Write chunk provided by client + * @chunk: Write chunk provided by the client * @xdr: xdr_buf containing the data payload * * Returns a non-negative number of bytes the chunk consumed, or @@ -509,29 +611,28 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, - struct xdr_buf *xdr) +int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; + struct svc_rdma_chunk_ctxt *cc; int ret; - if (!xdr->page_len) - return 0; - - info = svc_rdma_write_info_alloc(rdma, wr_ch); + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; + cc = &info->wi_cc; - ret = svc_rdma_send_xdr_pagelist(info, xdr); - if (ret < 0) + ret = svc_rdma_xb_write(xdr, info); + if (ret != xdr->len) goto out_err; - ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); + ret = svc_rdma_post_chunk_ctxt(cc); if (ret < 0) goto out_err; - - trace_svcrdma_encode_write(xdr->page_len); - return xdr->page_len; + return xdr->len; out_err: svc_rdma_write_info_free(info); @@ -541,8 +642,7 @@ out_err: /** * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk * @rdma: controlling RDMA transport - * @rp_ch: Reply chunk provided by client - * @writelist: true if client provided a Write list + * @rctxt: Write and Reply chunks from client * @xdr: xdr_buf containing an RPC Reply * * Returns a non-negative number of bytes the chunk consumed, or @@ -552,65 +652,68 @@ out_err: * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, - bool writelist, struct xdr_buf *xdr) +int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; - int consumed, ret; + struct svc_rdma_chunk_ctxt *cc; + struct svc_rdma_chunk *chunk; + int ret; - info = svc_rdma_write_info_alloc(rdma, rp_ch); + if (pcl_is_empty(&rctxt->rc_reply_pcl)) + return 0; + + chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; + cc = &info->wi_cc; - ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]); + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_write, info); if (ret < 0) goto out_err; - consumed = xdr->head[0].iov_len; - - /* Send the page list in the Reply chunk only if the - * client did not provide Write chunks. - */ - if (!writelist && xdr->page_len) { - ret = svc_rdma_send_xdr_pagelist(info, xdr); - if (ret < 0) - goto out_err; - consumed += xdr->page_len; - } - - if (xdr->tail[0].iov_len) { - ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]); - if (ret < 0) - goto out_err; - consumed += xdr->tail[0].iov_len; - } - ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); + ret = svc_rdma_post_chunk_ctxt(cc); if (ret < 0) goto out_err; - trace_svcrdma_encode_reply(consumed); - return consumed; + return xdr->len; out_err: svc_rdma_write_info_free(info); return ret; } +/** + * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment + * @info: context for ongoing I/O + * @segment: co-ordinates of remote memory to be read + * + * Returns: + * %0: the Read WR chain was constructed successfully + * %-EINVAL: there were not enough rq_pages to finish + * %-ENOMEM: allocating a local resources failed + * %-EIO: a DMA mapping error occurred + */ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, - struct svc_rqst *rqstp, - u32 rkey, u32 len, u64 offset) + const struct svc_rdma_segment *segment) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; + struct svc_rqst *rqstp = info->ri_rqst; + unsigned int sge_no, seg_len, len; struct svc_rdma_rw_ctxt *ctxt; - unsigned int sge_no, seg_len; struct scatterlist *sg; int ret; + len = segment->rs_length; sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); if (!ctxt) - goto out_noctx; + return -ENOMEM; ctxt->rw_nents = sge_no; sg = ctxt->rw_sg_table.sgl; @@ -618,8 +721,6 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, seg_len = min_t(unsigned int, len, PAGE_SIZE - info->ri_pageoff); - head->rc_arg.pages[info->ri_pageno] = - rqstp->rq_pages[info->ri_pageno]; if (!info->ri_pageoff) head->rc_page_count++; @@ -640,102 +741,198 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, goto out_overrun; } - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp, - cc->cc_rdma->sc_port_num, - ctxt->rw_sg_table.sgl, ctxt->rw_nents, - 0, offset, rkey, DMA_FROM_DEVICE); + ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset, + segment->rs_handle, DMA_FROM_DEVICE); if (ret < 0) - goto out_initerr; + return -EIO; + percpu_counter_inc(&svcrdma_stat_read); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; return 0; -out_noctx: - dprintk("svcrdma: no R/W ctxs available\n"); - return -ENOMEM; - out_overrun: - dprintk("svcrdma: request overruns rq_pages\n"); + trace_svcrdma_page_overrun_err(cc->cc_rdma, rqstp, info->ri_pageno); return -EINVAL; - -out_initerr: - trace_svcrdma_dma_map_rwctx(cc->cc_rdma, ret); - svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt); - return -EIO; } -/* Walk the segments in the Read chunk starting at @p and construct - * RDMA Read operations to pull the chunk to the server. +/** + * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk + * @info: context for ongoing I/O + * @chunk: Read chunk to pull + * + * Return values: + * %0: the Read WR chain was constructed successfully + * %-EINVAL: there were not enough resources to finish + * %-ENOMEM: allocating a local resources failed + * %-EIO: a DMA mapping error occurred */ -static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info, - __be32 *p) +static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info, + const struct svc_rdma_chunk *chunk) { - unsigned int i; + const struct svc_rdma_segment *segment; int ret; ret = -EINVAL; - info->ri_chunklen = 0; - while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) { - u32 rs_handle, rs_length; - u64 rs_offset; - - rs_handle = be32_to_cpup(p++); - rs_length = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &rs_offset); - - ret = svc_rdma_build_read_segment(info, rqstp, - rs_handle, rs_length, - rs_offset); + pcl_for_each_segment(segment, chunk) { + ret = svc_rdma_build_read_segment(info, segment); if (ret < 0) break; + info->ri_totalbytes += segment->rs_length; + } + return ret; +} + +/** + * svc_rdma_copy_inline_range - Copy part of the inline content into pages + * @info: context for RDMA Reads + * @offset: offset into the Receive buffer of region to copy + * @remaining: length of region to copy + * + * Take a page at a time from rqstp->rq_pages and copy the inline + * content from the Receive buffer into that page. Update + * info->ri_pageno and info->ri_pageoff so that the next RDMA Read + * result will land contiguously with the copied content. + * + * Return values: + * %0: Inline content was successfully copied + * %-EINVAL: offset or length was incorrect + */ +static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, + unsigned int offset, + unsigned int remaining) +{ + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + unsigned char *dst, *src = head->rc_recv_buf; + struct svc_rqst *rqstp = info->ri_rqst; + unsigned int page_no, numpages; - trace_svcrdma_encode_rseg(rs_handle, rs_length, rs_offset); - info->ri_chunklen += rs_length; + numpages = PAGE_ALIGN(info->ri_pageoff + remaining) >> PAGE_SHIFT; + for (page_no = 0; page_no < numpages; page_no++) { + unsigned int page_len; + + page_len = min_t(unsigned int, remaining, + PAGE_SIZE - info->ri_pageoff); + + if (!info->ri_pageoff) + head->rc_page_count++; + + dst = page_address(rqstp->rq_pages[info->ri_pageno]); + memcpy(dst + info->ri_pageno, src + offset, page_len); + + info->ri_totalbytes += page_len; + info->ri_pageoff += page_len; + if (info->ri_pageoff == PAGE_SIZE) { + info->ri_pageno++; + info->ri_pageoff = 0; + } + remaining -= page_len; + offset += page_len; } - /* Pages under I/O have been copied to head->rc_pages. - * Prevent their premature release by svc_xprt_release() . - */ - for (i = 0; i < info->ri_readctxt->rc_page_count; i++) - rqstp->rq_pages[i] = NULL; + return -EINVAL; +} - return ret; +/** + * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks + * @info: context for RDMA Reads + * + * The chunk data lands in rqstp->rq_arg as a series of contiguous pages, + * like an incoming TCP call. + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). + */ +static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info) +{ + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; + struct xdr_buf *buf = &info->ri_rqst->rq_arg; + struct svc_rdma_chunk *chunk, *next; + unsigned int start, length; + int ret; + + start = 0; + chunk = pcl_first_chunk(pcl); + length = chunk->ch_position; + ret = svc_rdma_copy_inline_range(info, start, length); + if (ret < 0) + return ret; + + pcl_for_each_chunk(chunk, pcl) { + ret = svc_rdma_build_read_chunk(info, chunk); + if (ret < 0) + return ret; + + next = pcl_next_chunk(pcl, chunk); + if (!next) + break; + + start += length; + length = next->ch_position - info->ri_totalbytes; + ret = svc_rdma_copy_inline_range(info, start, length); + if (ret < 0) + return ret; + } + + start += length; + length = head->rc_byte_len - start; + ret = svc_rdma_copy_inline_range(info, start, length); + if (ret < 0) + return ret; + + buf->len += info->ri_totalbytes; + buf->buflen += info->ri_totalbytes; + + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); + buf->pages = &info->ri_rqst->rq_pages[1]; + buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; + return 0; } -/* Construct RDMA Reads to pull over a normal Read chunk. The chunk - * data lands in the page list of head->rc_arg.pages. +/** + * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks + * @info: context for RDMA Reads + * + * The chunk data lands in the page list of rqstp->rq_arg.pages. * - * Currently NFSD does not look at the head->rc_arg.tail[0] iovec. + * Currently NFSD does not look at the rqstp->rq_arg.tail[0] kvec. * Therefore, XDR round-up of the Read chunk and trailing * inline content must both be added at the end of the pagelist. + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info, - __be32 *p) +static int svc_rdma_read_data_item(struct svc_rdma_read_info *info) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + struct xdr_buf *buf = &info->ri_rqst->rq_arg; + struct svc_rdma_chunk *chunk; + unsigned int length; int ret; - ret = svc_rdma_build_read_chunk(rqstp, info, p); + chunk = pcl_first_chunk(&head->rc_read_pcl); + ret = svc_rdma_build_read_chunk(info, chunk); if (ret < 0) goto out; - trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position); - - head->rc_hdr_count = 0; - /* Split the Receive buffer between the head and tail * buffers at Read chunk's position. XDR roundup of the * chunk is not included in either the pagelist or in * the tail. */ - head->rc_arg.tail[0].iov_base = - head->rc_arg.head[0].iov_base + info->ri_position; - head->rc_arg.tail[0].iov_len = - head->rc_arg.head[0].iov_len - info->ri_position; - head->rc_arg.head[0].iov_len = info->ri_position; + buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position; + buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position; + buf->head[0].iov_len = chunk->ch_position; /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). * @@ -746,109 +943,221 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, * Currently these chunks always start at page offset 0, * thus the rounded-up length never crosses a page boundary. */ - info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2; - - head->rc_arg.page_len = info->ri_chunklen; - head->rc_arg.len += info->ri_chunklen; - head->rc_arg.buflen += info->ri_chunklen; + buf->pages = &info->ri_rqst->rq_pages[0]; + length = xdr_align_size(chunk->ch_length); + buf->page_len = length; + buf->len += length; + buf->buflen += length; out: return ret; } -/* Construct RDMA Reads to pull over a Position Zero Read chunk. - * The start of the data lands in the first page just after - * the Transport header, and the rest lands in the page list of - * head->rc_arg.pages. +/** + * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk + * @info: context for RDMA Reads + * @chunk: parsed Call chunk to pull + * @offset: offset of region to pull + * @length: length of region to pull * - * Assumptions: - * - A PZRC has an XDR-aligned length (no implicit round-up). - * - There can be no trailing inline content (IOW, we assume - * a PZRC is never sent in an RDMA_MSG message, though it's - * allowed by spec). + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: there were not enough resources to finish + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info, - __be32 *p) +static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, + const struct svc_rdma_chunk *chunk, + unsigned int offset, unsigned int length) +{ + const struct svc_rdma_segment *segment; + int ret; + + ret = -EINVAL; + pcl_for_each_segment(segment, chunk) { + struct svc_rdma_segment dummy; + + if (offset > segment->rs_length) { + offset -= segment->rs_length; + continue; + } + + dummy.rs_handle = segment->rs_handle; + dummy.rs_length = min_t(u32, length, segment->rs_length) - offset; + dummy.rs_offset = segment->rs_offset + offset; + + ret = svc_rdma_build_read_segment(info, &dummy); + if (ret < 0) + break; + + info->ri_totalbytes += dummy.rs_length; + length -= dummy.rs_length; + offset = 0; + } + return ret; +} + +/** + * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message + * @info: context for RDMA Reads + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: there were not enough resources to finish + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). + */ +static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + const struct svc_rdma_chunk *call_chunk = + pcl_first_chunk(&head->rc_call_pcl); + const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; + struct svc_rdma_chunk *chunk, *next; + unsigned int start, length; int ret; - ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (pcl_is_empty(pcl)) + return svc_rdma_build_read_chunk(info, call_chunk); + + start = 0; + chunk = pcl_first_chunk(pcl); + length = chunk->ch_position; + ret = svc_rdma_read_chunk_range(info, call_chunk, start, length); if (ret < 0) - goto out; + return ret; - trace_svcrdma_encode_pzr(info->ri_chunklen); + pcl_for_each_chunk(chunk, pcl) { + ret = svc_rdma_build_read_chunk(info, chunk); + if (ret < 0) + return ret; - head->rc_arg.len += info->ri_chunklen; - head->rc_arg.buflen += info->ri_chunklen; + next = pcl_next_chunk(pcl, chunk); + if (!next) + break; - head->rc_hdr_count = 1; - head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]); - head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE, - info->ri_chunklen); + start += length; + length = next->ch_position - info->ri_totalbytes; + ret = svc_rdma_read_chunk_range(info, call_chunk, + start, length); + if (ret < 0) + return ret; + } - head->rc_arg.page_len = info->ri_chunklen - - head->rc_arg.head[0].iov_len; + start += length; + length = call_chunk->ch_length - start; + return svc_rdma_read_chunk_range(info, call_chunk, start, length); +} + +/** + * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message + * @info: context for RDMA Reads + * + * The start of the data lands in the first page just after the + * Transport header, and the rest lands in rqstp->rq_arg.pages. + * + * Assumptions: + * - A PZRC is never sent in an RDMA_MSG message, though it's + * allowed by spec. + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). + */ +static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info) +{ + struct xdr_buf *buf = &info->ri_rqst->rq_arg; + int ret; + + ret = svc_rdma_read_call_chunk(info); + if (ret < 0) + goto out; + + buf->len += info->ri_totalbytes; + buf->buflen += info->ri_totalbytes; + + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); + buf->pages = &info->ri_rqst->rq_pages[1]; + buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; out: return ret; } /** - * svc_rdma_recv_read_chunk - Pull a Read chunk from the client + * svc_rdma_process_read_list - Pull list of Read chunks from the client * @rdma: controlling RDMA transport * @rqstp: set of pages to use as Read sink buffers * @head: pages under I/O collect here - * @p: pointer to start of Read chunk * - * Returns: - * %0 if all needed RDMA Reads were posted successfully, - * %-EINVAL if client provided too many segments, - * %-ENOMEM if rdma_rw context pool was exhausted, - * %-ENOTCONN if posting failed (connection is lost), - * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * The RPC/RDMA protocol assumes that the upper layer's XDR decoders + * pull each Read chunk as they decode an incoming RPC message. * - * Assumptions: - * - All Read segments in @p have the same Position value. + * On Linux, however, the server needs to have a fully-constructed RPC + * message in rqstp->rq_arg when there is a positive return code from + * ->xpo_recvfrom. So the Read list is safety-checked immediately when + * it is received, then here the whole Read list is pulled all at once. + * The ingress RPC message is fully reconstructed once all associated + * RDMA Reads have completed. + * + * Return values: + * %1: all needed RDMA Reads were posted successfully, + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, - struct svc_rdma_recv_ctxt *head, __be32 *p) +int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { struct svc_rdma_read_info *info; + struct svc_rdma_chunk_ctxt *cc; int ret; - /* The request (with page list) is constructed in - * head->rc_arg. Pages involved with RDMA Read I/O are - * transferred there. - */ - head->rc_arg.head[0] = rqstp->rq_arg.head[0]; - head->rc_arg.tail[0] = rqstp->rq_arg.tail[0]; - head->rc_arg.pages = head->rc_pages; - head->rc_arg.page_base = 0; - head->rc_arg.page_len = 0; - head->rc_arg.len = rqstp->rq_arg.len; - head->rc_arg.buflen = rqstp->rq_arg.buflen; - info = svc_rdma_read_info_alloc(rdma); if (!info) return -ENOMEM; + cc = &info->ri_cc; + info->ri_rqst = rqstp; info->ri_readctxt = head; info->ri_pageno = 0; info->ri_pageoff = 0; - - info->ri_position = be32_to_cpup(p + 1); - if (info->ri_position) - ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); - else - ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); + info->ri_totalbytes = 0; + + if (pcl_is_empty(&head->rc_call_pcl)) { + if (head->rc_read_pcl.cl_count == 1) + ret = svc_rdma_read_data_item(info); + else + ret = svc_rdma_read_multiple_chunks(info); + } else + ret = svc_rdma_read_special(info); if (ret < 0) goto out_err; - ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); + trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount); + init_completion(&cc->cc_done); + ret = svc_rdma_post_chunk_ctxt(cc); if (ret < 0) goto out_err; - return 0; + + ret = 1; + wait_for_completion(&cc->cc_done); + if (cc->cc_status != IB_WC_SUCCESS) + ret = -EIO; + + /* rq_respages starts after the last arg page */ + rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */ + head->rc_page_count = 0; out_err: svc_rdma_read_info_free(info); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index f3f108090aa4..22a871e6fe4d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -106,21 +106,18 @@ #include <rdma/rdma_cm.h> #include <linux/sunrpc/debug.h> -#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); -static inline struct svc_rdma_send_ctxt * -svc_rdma_next_send_ctxt(struct list_head *list) +static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) { - return list_first_entry_or_null(list, struct svc_rdma_send_ctxt, - sc_list); + cid->ci_queue_id = rdma->sc_sq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); } static struct svc_rdma_send_ctxt * @@ -145,12 +142,17 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) goto fail2; + svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); + ctxt->sc_send_wr.next = NULL; ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; + init_completion(&ctxt->sc_done); ctxt->sc_cqe.done = svc_rdma_wc_send; ctxt->sc_xprt_buf = buffer; + xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, + rdma->sc_max_req_size); ctxt->sc_sges[0].addr = addr; for (i = 0; i < rdma->sc_max_send_sges; i++) @@ -173,9 +175,10 @@ fail0: void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) { - list_del(&ctxt->sc_list); + while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); ib_dma_unmap_single(rdma->sc_pd->device, ctxt->sc_sges[0].addr, rdma->sc_max_req_size, @@ -195,18 +198,22 @@ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_send_lock); - ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts); - if (!ctxt) + node = llist_del_first(&rdma->sc_send_ctxts); + if (!node) goto out_empty; - list_del(&ctxt->sc_list); + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); spin_unlock(&rdma->sc_send_lock); out: + rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0); + xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, + ctxt->sc_xprt_buf, NULL); + ctxt->sc_send_wr.num_sge = 0; ctxt->sc_cur_sge_no = 0; - ctxt->sc_page_count = 0; return ctxt; out_empty: @@ -221,8 +228,6 @@ out_empty: * svc_rdma_send_ctxt_put - Return send_ctxt to free list * @rdma: controlling svcxprt_rdma * @ctxt: object to return to the free list - * - * Pages left in sc_pages are DMA unmapped and released. */ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) @@ -243,12 +248,21 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, ctxt->sc_sges[i].length); } - for (i = 0; i < ctxt->sc_page_count; ++i) - put_page(ctxt->sc_pages[i]); + llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts); +} - spin_lock(&rdma->sc_send_lock); - list_add(&ctxt->sc_list, &rdma->sc_send_ctxts); - spin_unlock(&rdma->sc_send_lock); +/** + * svc_rdma_wake_send_waiters - manage Send Queue accounting + * @rdma: controlling transport + * @avail: Number of additional SQEs that are now available + * + */ +void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail) +{ + atomic_add(avail, &rdma->sc_sq_avail); + smp_mb__after_atomic(); + if (unlikely(waitqueue_active(&rdma->sc_send_wait))) + wake_up(&rdma->sc_send_wait); } /** @@ -263,42 +277,51 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { struct svcxprt_rdma *rdma = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_send_ctxt *ctxt; + struct svc_rdma_send_ctxt *ctxt = + container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - trace_svcrdma_wc_send(wc); + svc_rdma_wake_send_waiters(rdma, 1); + complete(&ctxt->sc_done); - atomic_inc(&rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); + if (unlikely(wc->status != IB_WC_SUCCESS)) + goto flushed; - ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - svc_rdma_send_ctxt_put(rdma, ctxt); + trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + return; - if (unlikely(wc->status != IB_WC_SUCCESS)) { - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_xprt_enqueue(&rdma->sc_xprt); - } - - svc_xprt_put(&rdma->sc_xprt); +flushed: + if (wc->status != IB_WC_WR_FLUSH_ERR) + trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid); + else + trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid); + svc_xprt_deferred_close(&rdma->sc_xprt); } /** * svc_rdma_send - Post a single Send WR * @rdma: transport on which to post the WR - * @wr: prepared Send WR to post + * @ctxt: send ctxt with a Send WR ready to post * - * Returns zero the Send WR was posted successfully. Otherwise, a + * Returns zero if the Send WR was posted successfully. Otherwise, a * negative errno is returned. */ -int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) +int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) { + struct ib_send_wr *wr = &ctxt->sc_send_wr; int ret; - might_sleep(); + reinit_completion(&ctxt->sc_done); + + /* Sync the transport header buffer */ + ib_dma_sync_single_for_device(rdma->sc_pd->device, + wr->sg_list[0].addr, + wr->sg_list[0].length, + DMA_TO_DEVICE); /* If the SQ is full, wait until an SQ entry is available */ while (1) { if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { - atomic_inc(&rdma_stat_sq_starve); + percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma); atomic_inc(&rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, @@ -309,423 +332,471 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) continue; } - svc_xprt_get(&rdma->sc_xprt); + trace_svcrdma_post_send(ctxt); ret = ib_post_send(rdma->sc_qp, wr, NULL); - trace_svcrdma_post_send(wr, ret); - if (ret) { - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_xprt_put(&rdma->sc_xprt); - wake_up(&rdma->sc_send_wait); - } - break; + if (ret) + break; + return 0; } + + trace_svcrdma_sq_post_err(rdma, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + wake_up(&rdma->sc_send_wait); return ret; } -static u32 xdr_padsize(u32 len) +/** + * svc_rdma_encode_read_list - Encode RPC Reply's Read chunk list + * @sctxt: Send context for the RPC Reply + * + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Reply Read list + * %-EMSGSIZE on XDR buffer overflow + */ +static ssize_t svc_rdma_encode_read_list(struct svc_rdma_send_ctxt *sctxt) { - return (len & 3) ? (4 - (len & 3)) : 0; + /* RPC-over-RDMA version 1 replies never have a Read list. */ + return xdr_stream_encode_item_absent(&sctxt->sc_stream); } -/* Returns length of transport header, in bytes. +/** + * svc_rdma_encode_write_segment - Encode one Write segment + * @sctxt: Send context for the RPC Reply + * @chunk: Write chunk to push + * @remaining: remaining bytes of the payload left in the Write chunk + * @segno: which segment in the chunk + * + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Write segment, and updates @remaining + * %-EMSGSIZE on XDR buffer overflow */ -static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp) +static ssize_t svc_rdma_encode_write_segment(struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk, + u32 *remaining, unsigned int segno) { - unsigned int nsegs; + const struct svc_rdma_segment *segment = &chunk->ch_segments[segno]; + const size_t len = rpcrdma_segment_maxsz * sizeof(__be32); + u32 length; __be32 *p; - p = rdma_resp; - - /* RPC-over-RDMA V1 replies never have a Read list. */ - p += rpcrdma_fixed_maxsz + 1; - - /* Skip Write list. */ - while (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - /* Skip Reply chunk. */ - if (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - return (unsigned long)p - (unsigned long)rdma_resp; + p = xdr_reserve_space(&sctxt->sc_stream, len); + if (!p) + return -EMSGSIZE; + + length = min_t(u32, *remaining, segment->rs_length); + *remaining -= length; + xdr_encode_rdma_segment(p, segment->rs_handle, length, + segment->rs_offset); + trace_svcrdma_encode_wseg(sctxt, segno, segment->rs_handle, length, + segment->rs_offset); + return len; } -/* One Write chunk is copied from Call transport header to Reply - * transport header. Each segment's length field is updated to - * reflect number of bytes consumed in the segment. +/** + * svc_rdma_encode_write_chunk - Encode one Write chunk + * @sctxt: Send context for the RPC Reply + * @chunk: Write chunk to push * - * Returns number of segments in this chunk. + * Copy a Write chunk from the Call transport header to the + * Reply transport header. Update each segment's length field + * to reflect the number of bytes written in that segment. + * + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Write chunk + * %-EMSGSIZE on XDR buffer overflow */ -static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src, - unsigned int remaining) +static ssize_t svc_rdma_encode_write_chunk(struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk) { - unsigned int i, nsegs; - u32 seg_len; - - /* Write list discriminator */ - *dst++ = *src++; - - /* number of segments in this chunk */ - nsegs = be32_to_cpup(src); - *dst++ = *src++; - - for (i = nsegs; i; i--) { - /* segment's RDMA handle */ - *dst++ = *src++; - - /* bytes returned in this segment */ - seg_len = be32_to_cpu(*src); - if (remaining >= seg_len) { - /* entire segment was consumed */ - *dst = *src; - remaining -= seg_len; - } else { - /* segment only partly filled */ - *dst = cpu_to_be32(remaining); - remaining = 0; - } - dst++; src++; + u32 remaining = chunk->ch_payload_length; + unsigned int segno; + ssize_t len, ret; - /* segment's RDMA offset */ - *dst++ = *src++; - *dst++ = *src++; - } + len = 0; + ret = xdr_stream_encode_item_present(&sctxt->sc_stream); + if (ret < 0) + return ret; + len += ret; - return nsegs; -} + ret = xdr_stream_encode_u32(&sctxt->sc_stream, chunk->ch_segcount); + if (ret < 0) + return ret; + len += ret; -/* The client provided a Write list in the Call message. Fill in - * the segments in the first Write chunk in the Reply's transport - * header with the number of bytes consumed in each segment. - * Remaining chunks are returned unused. - * - * Assumptions: - * - Client has provided only one Write chunk - */ -static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch, - unsigned int consumed) -{ - unsigned int nsegs; - __be32 *p, *q; - - /* RPC-over-RDMA V1 replies never have a Read list. */ - p = rdma_resp + rpcrdma_fixed_maxsz + 1; - - q = wr_ch; - while (*q != xdr_zero) { - nsegs = xdr_encode_write_chunk(p, q, consumed); - q += 2 + nsegs * rpcrdma_segment_maxsz; - p += 2 + nsegs * rpcrdma_segment_maxsz; - consumed = 0; + for (segno = 0; segno < chunk->ch_segcount; segno++) { + ret = svc_rdma_encode_write_segment(sctxt, chunk, &remaining, segno); + if (ret < 0) + return ret; + len += ret; } - /* Terminate Write list */ - *p++ = xdr_zero; - - /* Reply chunk discriminator; may be replaced later */ - *p = xdr_zero; + return len; } -/* The client provided a Reply chunk in the Call message. Fill in - * the segments in the Reply chunk in the Reply message with the - * number of bytes consumed in each segment. +/** + * svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list + * @rctxt: Reply context with information about the RPC Call + * @sctxt: Send context for the RPC Reply * - * Assumptions: - * - Reply can always fit in the provided Reply chunk + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Reply's Write list + * %-EMSGSIZE on XDR buffer overflow */ -static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch, - unsigned int consumed) +static ssize_t svc_rdma_encode_write_list(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt) { - __be32 *p; + struct svc_rdma_chunk *chunk; + ssize_t len, ret; - /* Find the Reply chunk in the Reply's xprt header. - * RPC-over-RDMA V1 replies never have a Read list. - */ - p = rdma_resp + rpcrdma_fixed_maxsz + 1; + len = 0; + pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { + ret = svc_rdma_encode_write_chunk(sctxt, chunk); + if (ret < 0) + return ret; + len += ret; + } - /* Skip past Write list */ - while (*p++ != xdr_zero) - p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + /* Terminate the Write list */ + ret = xdr_stream_encode_item_absent(&sctxt->sc_stream); + if (ret < 0) + return ret; - xdr_encode_write_chunk(p, rp_ch, consumed); + return len + ret; } -/* Parse the RPC Call's transport header. +/** + * svc_rdma_encode_reply_chunk - Encode RPC Reply's Reply chunk + * @rctxt: Reply context with information about the RPC Call + * @sctxt: Send context for the RPC Reply + * @length: size in bytes of the payload in the Reply chunk + * + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Reply's Reply chunk + * %-EMSGSIZE on XDR buffer overflow + * %-E2BIG if the RPC message is larger than the Reply chunk */ -static void svc_rdma_get_write_arrays(__be32 *rdma_argp, - __be32 **write, __be32 **reply) +static ssize_t +svc_rdma_encode_reply_chunk(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + unsigned int length) { - __be32 *p; - - p = rdma_argp + rpcrdma_fixed_maxsz; + struct svc_rdma_chunk *chunk; - /* Read list */ - while (*p++ != xdr_zero) - p += 5; + if (pcl_is_empty(&rctxt->rc_reply_pcl)) + return xdr_stream_encode_item_absent(&sctxt->sc_stream); - /* Write list */ - if (*p != xdr_zero) { - *write = p; - while (*p++ != xdr_zero) - p += 1 + be32_to_cpu(*p) * 4; - } else { - *write = NULL; - p++; - } + chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); + if (length > chunk->ch_length) + return -E2BIG; - /* Reply chunk */ - if (*p != xdr_zero) - *reply = p; - else - *reply = NULL; + chunk->ch_payload_length = length; + return svc_rdma_encode_write_chunk(sctxt, chunk); } -static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct page *page, - unsigned long offset, - unsigned int len) +struct svc_rdma_map_data { + struct svcxprt_rdma *md_rdma; + struct svc_rdma_send_ctxt *md_ctxt; +}; + +/** + * svc_rdma_page_dma_map - DMA map one page + * @data: pointer to arguments + * @page: struct page to DMA map + * @offset: offset into the page + * @len: number of bytes to map + * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if the page cannot be DMA mapped + */ +static int svc_rdma_page_dma_map(void *data, struct page *page, + unsigned long offset, unsigned int len) { + struct svc_rdma_map_data *args = data; + struct svcxprt_rdma *rdma = args->md_rdma; + struct svc_rdma_send_ctxt *ctxt = args->md_ctxt; struct ib_device *dev = rdma->sc_cm_id->device; dma_addr_t dma_addr; + ++ctxt->sc_cur_sge_no; + dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); - trace_svcrdma_dma_map_page(rdma, dma_addr, len); if (ib_dma_mapping_error(dev, dma_addr)) goto out_maperr; + trace_svcrdma_dma_map_page(rdma, dma_addr, len); ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr; ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len; ctxt->sc_send_wr.num_sge++; return 0; out_maperr: + trace_svcrdma_dma_map_err(rdma, dma_addr, len); return -EIO; } -/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() +/** + * svc_rdma_iov_dma_map - DMA map an iovec + * @data: pointer to arguments + * @iov: kvec to DMA map + * + * ib_dma_map_page() is used here because svc_rdma_dma_unmap() * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively. + * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if the iovec cannot be DMA mapped */ -static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - unsigned char *base, - unsigned int len) +static int svc_rdma_iov_dma_map(void *data, const struct kvec *iov) { - return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base), - offset_in_page(base), len); + if (!iov->iov_len) + return 0; + return svc_rdma_page_dma_map(data, virt_to_page(iov->iov_base), + offset_in_page(iov->iov_base), + iov->iov_len); } /** - * svc_rdma_sync_reply_hdr - DMA sync the transport header buffer - * @rdma: controlling transport - * @ctxt: send_ctxt for the Send WR - * @len: length of transport header + * svc_rdma_xb_dma_map - DMA map all segments of an xdr_buf + * @xdr: xdr_buf containing portion of an RPC message to transmit + * @data: pointer to arguments + * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if DMA mapping failed * + * On failure, any DMA mappings that have been already done must be + * unmapped by the caller. */ -void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - unsigned int len) +static int svc_rdma_xb_dma_map(const struct xdr_buf *xdr, void *data) { - ctxt->sc_sges[0].length = len; - ctxt->sc_send_wr.num_sge++; - ib_dma_sync_single_for_device(rdma->sc_pd->device, - ctxt->sc_sges[0].addr, len, - DMA_TO_DEVICE); -} + unsigned int len, remaining; + unsigned long pageoff; + struct page **ppages; + int ret; -/* If the xdr_buf has more elements than the device can - * transmit in a single RDMA Send, then the reply will - * have to be copied into a bounce buffer. - */ -static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, - struct xdr_buf *xdr, - __be32 *wr_lst) -{ - int elements; - - /* xdr->head */ - elements = 1; - - /* xdr->pages */ - if (!wr_lst) { - unsigned int remaining; - unsigned long pageoff; - - pageoff = xdr->page_base & ~PAGE_MASK; - remaining = xdr->page_len; - while (remaining) { - ++elements; - remaining -= min_t(u32, PAGE_SIZE - pageoff, - remaining); - pageoff = 0; - } + ret = svc_rdma_iov_dma_map(data, &xdr->head[0]); + if (ret < 0) + return ret; + + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + pageoff = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - pageoff, remaining); + + ret = svc_rdma_page_dma_map(data, *ppages++, pageoff, len); + if (ret < 0) + return ret; + + remaining -= len; + pageoff = 0; } - /* xdr->tail */ - if (xdr->tail[0].iov_len) - ++elements; + ret = svc_rdma_iov_dma_map(data, &xdr->tail[0]); + if (ret < 0) + return ret; - /* assume 1 SGE is needed for the transport header */ - return elements >= rdma->sc_max_send_sges; + return xdr->len; } -/* The device is not capable of sending the reply directly. - * Assemble the elements of @xdr into the transport header - * buffer. +struct svc_rdma_pullup_data { + u8 *pd_dest; + unsigned int pd_length; + unsigned int pd_num_sges; +}; + +/** + * svc_rdma_xb_count_sges - Count how many SGEs will be needed + * @xdr: xdr_buf containing portion of an RPC message to transmit + * @data: pointer to arguments + * + * Returns: + * Number of SGEs needed to Send the contents of @xdr inline */ -static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct xdr_buf *xdr, __be32 *wr_lst) +static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr, + void *data) { - unsigned char *dst, *tailbase; - unsigned int taillen; - - dst = ctxt->sc_xprt_buf; - dst += ctxt->sc_sges[0].length; + struct svc_rdma_pullup_data *args = data; + unsigned int remaining; + unsigned long offset; - memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len); - dst += xdr->head[0].iov_len; + if (xdr->head[0].iov_len) + ++args->pd_num_sges; - tailbase = xdr->tail[0].iov_base; - taillen = xdr->tail[0].iov_len; - if (wr_lst) { - u32 xdrpad; - - xdrpad = xdr_padsize(xdr->page_len); - if (taillen && xdrpad) { - tailbase += xdrpad; - taillen -= xdrpad; - } - } else { - unsigned int len, remaining; - unsigned long pageoff; - struct page **ppages; - - ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - pageoff = xdr->page_base & ~PAGE_MASK; - remaining = xdr->page_len; - while (remaining) { - len = min_t(u32, PAGE_SIZE - pageoff, remaining); - - memcpy(dst, page_address(*ppages), len); - remaining -= len; - dst += len; - pageoff = 0; - } + offset = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + ++args->pd_num_sges; + remaining -= min_t(u32, PAGE_SIZE - offset, remaining); + offset = 0; } - if (taillen) - memcpy(dst, tailbase, taillen); - - ctxt->sc_sges[0].length += xdr->len; - ib_dma_sync_single_for_device(rdma->sc_pd->device, - ctxt->sc_sges[0].addr, - ctxt->sc_sges[0].length, - DMA_TO_DEVICE); + if (xdr->tail[0].iov_len) + ++args->pd_num_sges; + args->pd_length += xdr->len; return 0; } -/* svc_rdma_map_reply_msg - Map the buffer holding RPC message +/** + * svc_rdma_pull_up_needed - Determine whether to use pull-up * @rdma: controlling transport - * @ctxt: send_ctxt for the Send WR - * @xdr: prepared xdr_buf containing RPC message - * @wr_lst: pointer to Call header's Write list, or NULL - * - * Load the xdr_buf into the ctxt's sge array, and DMA map each - * element as it is added. + * @sctxt: send_ctxt for the Send WR + * @rctxt: Write and Reply chunks provided by client + * @xdr: xdr_buf containing RPC message to transmit * - * Returns zero on success, or a negative errno on failure. + * Returns: + * %true if pull-up must be used + * %false otherwise */ -int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct xdr_buf *xdr, __be32 *wr_lst) +static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma, + const struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) { - unsigned int len, remaining; - unsigned long page_off; - struct page **ppages; - unsigned char *base; - u32 xdr_pad; + /* Resources needed for the transport header */ + struct svc_rdma_pullup_data args = { + .pd_length = sctxt->sc_hdrbuf.len, + .pd_num_sges = 1, + }; int ret; - if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst)) - return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst); - - ++ctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_buf(rdma, ctxt, - xdr->head[0].iov_base, - xdr->head[0].iov_len); + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_count_sges, &args); if (ret < 0) - return ret; + return false; - /* If a Write chunk is present, the xdr_buf's page list - * is not included inline. However the Upper Layer may - * have added XDR padding in the tail buffer, and that - * should not be included inline. - */ - if (wr_lst) { - base = xdr->tail[0].iov_base; - len = xdr->tail[0].iov_len; - xdr_pad = xdr_padsize(xdr->page_len); - - if (len && xdr_pad) { - base += xdr_pad; - len -= xdr_pad; - } + if (args.pd_length < RPCRDMA_PULLUP_THRESH) + return true; + return args.pd_num_sges >= rdma->sc_max_send_sges; +} - goto tail; +/** + * svc_rdma_xb_linearize - Copy region of xdr_buf to flat buffer + * @xdr: xdr_buf containing portion of an RPC message to copy + * @data: pointer to arguments + * + * Returns: + * Always zero. + */ +static int svc_rdma_xb_linearize(const struct xdr_buf *xdr, + void *data) +{ + struct svc_rdma_pullup_data *args = data; + unsigned int len, remaining; + unsigned long pageoff; + struct page **ppages; + + if (xdr->head[0].iov_len) { + memcpy(args->pd_dest, xdr->head[0].iov_base, xdr->head[0].iov_len); + args->pd_dest += xdr->head[0].iov_len; } ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - page_off = xdr->page_base & ~PAGE_MASK; + pageoff = offset_in_page(xdr->page_base); remaining = xdr->page_len; while (remaining) { - len = min_t(u32, PAGE_SIZE - page_off, remaining); - - ++ctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++, - page_off, len); - if (ret < 0) - return ret; - + len = min_t(u32, PAGE_SIZE - pageoff, remaining); + memcpy(args->pd_dest, page_address(*ppages) + pageoff, len); remaining -= len; - page_off = 0; + args->pd_dest += len; + pageoff = 0; + ppages++; } - base = xdr->tail[0].iov_base; - len = xdr->tail[0].iov_len; -tail: - if (len) { - ++ctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len); - if (ret < 0) - return ret; + if (xdr->tail[0].iov_len) { + memcpy(args->pd_dest, xdr->tail[0].iov_base, xdr->tail[0].iov_len); + args->pd_dest += xdr->tail[0].iov_len; } + args->pd_length += xdr->len; return 0; } -/* The svc_rqst and all resources it owns are released as soon as - * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt - * so they are released by the Send completion handler. +/** + * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer + * @rdma: controlling transport + * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared + * @rctxt: Write and Reply chunks provided by client + * @xdr: prepared xdr_buf containing RPC message + * + * The device is not capable of sending the reply directly. + * Assemble the elements of @xdr into the transport header buffer. + * + * Assumptions: + * pull_up_needed has determined that @xdr will fit in the buffer. + * + * Returns: + * %0 if pull-up was successful + * %-EMSGSIZE if a buffer manipulation problem occurred */ -static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, - struct svc_rdma_send_ctxt *ctxt) +static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) { - int i, pages = rqstp->rq_next_page - rqstp->rq_respages; + struct svc_rdma_pullup_data args = { + .pd_dest = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len, + }; + int ret; - ctxt->sc_page_count += pages; - for (i = 0; i < pages; i++) { - ctxt->sc_pages[i] = rqstp->rq_respages[i]; - rqstp->rq_respages[i] = NULL; - } + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_linearize, &args); + if (ret < 0) + return ret; + + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len + args.pd_length; + trace_svcrdma_send_pullup(sctxt, args.pd_length); + return 0; +} + +/* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message + * @rdma: controlling transport + * @sctxt: send_ctxt for the Send WR + * @rctxt: Write and Reply chunks provided by client + * @xdr: prepared xdr_buf containing RPC message + * + * Returns: + * %0 if DMA mapping was successful. + * %-EMSGSIZE if a buffer manipulation problem occurred + * %-EIO if DMA mapping failed + * + * The Send WR's num_sge field is set in all cases. + */ +int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) +{ + struct svc_rdma_map_data args = { + .md_rdma = rdma, + .md_ctxt = sctxt, + }; - /* Prevent svc_xprt_release from releasing pages in rq_pages */ - rqstp->rq_next_page = rqstp->rq_respages; + /* Set up the (persistently-mapped) transport header SGE. */ + sctxt->sc_send_wr.num_sge = 1; + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; + + /* If there is a Reply chunk, nothing follows the transport + * header, and we're done here. + */ + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) + return 0; + + /* For pull-up, svc_rdma_send() will sync the transport header. + * No additional DMA mapping is necessary. + */ + if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr)) + return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr); + + return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_dma_map, &args); } /* Prepare the portion of the RPC Reply that will be transmitted @@ -748,20 +819,14 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, */ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - struct svc_rdma_recv_ctxt *rctxt, - struct svc_rqst *rqstp, - __be32 *wr_lst, __be32 *rp_ch) + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rqst *rqstp) { int ret; - if (!rp_ch) { - ret = svc_rdma_map_reply_msg(rdma, sctxt, - &rqstp->rq_res, wr_lst); - if (ret < 0) - return ret; - } - - svc_rdma_save_io_pages(rqstp, sctxt); + ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqstp->rq_res); + if (ret < 0) + return ret; if (rctxt->rc_inv_rkey) { sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; @@ -769,42 +834,84 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, } else { sctxt->sc_send_wr.opcode = IB_WR_SEND; } - dprintk("svcrdma: posting Send WR with %u sge(s)\n", - sctxt->sc_send_wr.num_sge); - return svc_rdma_send(rdma, &sctxt->sc_send_wr); + + ret = svc_rdma_send(rdma, sctxt); + if (ret < 0) + return ret; + + ret = wait_for_completion_killable(&sctxt->sc_done); + svc_rdma_send_ctxt_put(rdma, sctxt); + return ret; } -/* Given the client-provided Write and Reply chunks, the server was not - * able to form a complete reply. Return an RDMA_ERROR message so the - * client can retire this RPC transaction. As above, the Send completion - * routine releases payload pages that were part of a previous RDMA Write. +/** + * svc_rdma_send_error_msg - Send an RPC/RDMA v1 error response + * @rdma: controlling transport context + * @sctxt: Send context for the response + * @rctxt: Receive context for incoming bad message + * @status: negative errno indicating error that occurred * - * Remote Invalidation is skipped for simplicity. + * Given the client-provided Read, Write, and Reply chunks, the + * server was not able to parse the Call or form a complete Reply. + * Return an RDMA_ERROR message so the client can retire the RPC + * transaction. + * + * The caller does not have to release @sctxt. It is released by + * Send completion, or by this function on error. */ -static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct svc_rqst *rqstp) +void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_recv_ctxt *rctxt, + int status) { + __be32 *rdma_argp = rctxt->rc_recv_buf; __be32 *p; - int ret; - p = ctxt->sc_xprt_buf; - trace_svcrdma_err_chunk(*p); - p += 3; - *p++ = rdma_error; - *p = err_chunk; - svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_ERR); + rpcrdma_set_xdrlen(&sctxt->sc_hdrbuf, 0); + xdr_init_encode(&sctxt->sc_stream, &sctxt->sc_hdrbuf, + sctxt->sc_xprt_buf, NULL); - svc_rdma_save_io_pages(rqstp, ctxt); + p = xdr_reserve_space(&sctxt->sc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); + if (!p) + goto put_ctxt; - ctxt->sc_send_wr.opcode = IB_WR_SEND; - ret = svc_rdma_send(rdma, &ctxt->sc_send_wr); - if (ret) { - svc_rdma_send_ctxt_put(rdma, ctxt); - return ret; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = rdma->sc_fc_credits; + *p = rdma_error; + + switch (status) { + case -EPROTONOSUPPORT: + p = xdr_reserve_space(&sctxt->sc_stream, 3 * sizeof(*p)); + if (!p) + goto put_ctxt; + + *p++ = err_vers; + *p++ = rpcrdma_version; + *p = rpcrdma_version; + trace_svcrdma_err_vers(*rdma_argp); + break; + default: + p = xdr_reserve_space(&sctxt->sc_stream, sizeof(*p)); + if (!p) + goto put_ctxt; + + *p = err_chunk; + trace_svcrdma_err_chunk(*rdma_argp); } - return 0; + /* Remote Invalidation is skipped for simplicity. */ + sctxt->sc_send_wr.num_sge = 1; + sctxt->sc_send_wr.opcode = IB_WR_SEND; + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; + if (svc_rdma_send(rdma, sctxt)) + goto put_ctxt; + + wait_for_completion_killable(&sctxt->sc_done); + +put_ctxt: + svc_rdma_send_ctxt_put(rdma, sctxt); } /** @@ -825,78 +932,113 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; - __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch; - struct xdr_buf *xdr = &rqstp->rq_res; + __be32 *rdma_argp = rctxt->rc_recv_buf; struct svc_rdma_send_ctxt *sctxt; + unsigned int rc_size; + __be32 *p; int ret; - rdma_argp = rctxt->rc_recv_buf; - svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch); + ret = -ENOTCONN; + if (svc_xprt_is_dead(xprt)) + goto drop_connection; - /* Create the RDMA response header. xprt->xpt_mutex, - * acquired in svc_send(), serializes RPC replies. The - * code path below that inserts the credit grant value - * into each transport header runs only inside this - * critical section. - */ ret = -ENOMEM; sctxt = svc_rdma_send_ctxt_get(rdma); if (!sctxt) - goto err0; - rdma_resp = sctxt->sc_xprt_buf; + goto drop_connection; + + ret = -EMSGSIZE; + p = xdr_reserve_space(&sctxt->sc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); + if (!p) + goto put_ctxt; + + ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); + if (ret < 0) + goto reply_chunk; + rc_size = ret; - p = rdma_resp; *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); *p++ = rdma->sc_fc_credits; - *p++ = rp_ch ? rdma_nomsg : rdma_msg; - - /* Start with empty chunks */ - *p++ = xdr_zero; - *p++ = xdr_zero; - *p = xdr_zero; + *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? rdma_msg : rdma_nomsg; - if (wr_lst) { - /* XXX: Presume the client sent only one Write chunk */ - ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr); - if (ret < 0) - goto err2; - svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret); - } - if (rp_ch) { - ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr); - if (ret < 0) - goto err2; - svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret); - } + ret = svc_rdma_encode_read_list(sctxt); + if (ret < 0) + goto put_ctxt; + ret = svc_rdma_encode_write_list(rctxt, sctxt); + if (ret < 0) + goto put_ctxt; + ret = svc_rdma_encode_reply_chunk(rctxt, sctxt, rc_size); + if (ret < 0) + goto put_ctxt; - svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp)); - ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp, - wr_lst, rp_ch); + ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp); if (ret < 0) - goto err1; - ret = 0; + goto put_ctxt; -out: - rqstp->rq_xprt_ctxt = NULL; - svc_rdma_recv_ctxt_put(rdma, rctxt); - return ret; + /* Prevent svc_xprt_release() from releasing the page backing + * rq_res.head[0].iov_base. It's no longer being accessed by + * the I/O device. */ + rqstp->rq_respages++; + return 0; - err2: +reply_chunk: if (ret != -E2BIG && ret != -EINVAL) - goto err1; + goto put_ctxt; - ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp); - if (ret < 0) - goto err1; - ret = 0; - goto out; + svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret); + return 0; - err1: +put_ctxt: svc_rdma_send_ctxt_put(rdma, sctxt); - err0: - trace_svcrdma_send_failed(rqstp, ret); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - ret = -ENOTCONN; - goto out; +drop_connection: + trace_svcrdma_send_err(rqstp, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + return -ENOTCONN; +} + +/** + * svc_rdma_result_payload - special processing for a result payload + * @rqstp: svc_rqst to operate on + * @offset: payload's byte offset in @xdr + * @length: size of payload, in bytes + * + * Return values: + * %0 if successful or nothing needed to be done + * %-EMSGSIZE on XDR buffer overflow + * %-E2BIG if the payload was larger than the Write chunk + * %-EINVAL if client provided too many segments + * %-ENOMEM if rdma_rw context pool was exhausted + * %-ENOTCONN if posting failed (connection is lost) + * %-EIO if rdma_rw initialization failed (DMA mapping, etc) + */ +int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) +{ + struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; + struct svc_rdma_chunk *chunk; + struct svcxprt_rdma *rdma; + struct xdr_buf subbuf; + int ret; + + chunk = rctxt->rc_cur_result_payload; + if (!length || !chunk) + return 0; + rctxt->rc_cur_result_payload = + pcl_next_chunk(&rctxt->rc_write_pcl, chunk); + if (length > chunk->ch_length) + return -E2BIG; + + chunk->ch_position = offset; + chunk->ch_payload_length = length; + + if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length)) + return -EMSGSIZE; + + rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); + ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf); + if (ret < 0) + return ret; + return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 145a3615c319..199fa012f18a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -55,7 +55,6 @@ #include <linux/sunrpc/addr.h> #include <linux/sunrpc/debug.h> -#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/svc_rdma.h> @@ -71,7 +70,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct sockaddr *sa, int salen, int flags); static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); -static void svc_rdma_release_rqst(struct svc_rqst *); static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt *xprt); @@ -82,6 +80,7 @@ static const struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, .xpo_recvfrom = svc_rdma_recvfrom, .xpo_sendto = svc_rdma_sendto, + .xpo_result_payload = svc_rdma_result_payload, .xpo_release_rqst = svc_rdma_release_rqst, .xpo_detach = svc_rdma_detach, .xpo_free = svc_rdma_free, @@ -120,8 +119,7 @@ static void qp_event_handler(struct ib_event *event, void *context) case IB_EVENT_QP_ACCESS_ERR: case IB_EVENT_DEVICE_FATAL: default: - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); + svc_xprt_deferred_close(xprt); break; } } @@ -138,10 +136,9 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); - INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); - INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); + init_llist_head(&cma_xprt->sc_send_ctxts); init_llist_head(&cma_xprt->sc_recv_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); + init_llist_head(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); @@ -211,7 +208,12 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, newxprt->sc_ord = param->initiator_depth; sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; - svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + newxprt->sc_xprt.xpt_remotelen = svc_addr_len(sa); + memcpy(&newxprt->sc_xprt.xpt_remote, sa, + newxprt->sc_xprt.xpt_remotelen); + snprintf(newxprt->sc_xprt.xpt_remotebuf, + sizeof(newxprt->sc_xprt.xpt_remotebuf) - 1, "%pISc", sa); + /* The remote port is arbitrary and not under the control of the * client ULP. Set it to a fixed value so that the DRC continues * to be effective after a reconnect. @@ -233,72 +235,58 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, svc_xprt_enqueue(&listen_xprt->sc_xprt); } -/* - * Handles events generated on the listening endpoint. These events will be - * either be incoming connect requests or adapter removal events. +/** + * svc_rdma_listen_handler - Handle CM events generated on a listening endpoint + * @cma_id: the server's listener rdma_cm_id + * @event: details of the event + * + * Return values: + * %0: Do not destroy @cma_id + * %1: Destroy @cma_id (never returned here) + * + * NB: There is never a DEVICE_REMOVAL event for INADDR_ANY listeners. */ -static int rdma_listen_handler(struct rdma_cm_id *cma_id, - struct rdma_cm_event *event) +static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) { - struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr; - - trace_svcrdma_cm_event(event, sap); - switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: - dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " - "event = %s (%d)\n", cma_id, cma_id->context, - rdma_event_msg(event->event), event->event); handle_connect_req(cma_id, &event->param.conn); break; default: - /* NB: No device removal upcall for INADDR_ANY listeners */ - dprintk("svcrdma: Unexpected event on listening endpoint %p, " - "event = %s (%d)\n", cma_id, - rdma_event_msg(event->event), event->event); break; } - return 0; } -static int rdma_cma_handler(struct rdma_cm_id *cma_id, - struct rdma_cm_event *event) +/** + * svc_rdma_cma_handler - Handle CM events on client connections + * @cma_id: the server's listener rdma_cm_id + * @event: details of the event + * + * Return values: + * %0: Do not destroy @cma_id + * %1: Destroy @cma_id (never returned here) + */ +static int svc_rdma_cma_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) { - struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.dst_addr; struct svcxprt_rdma *rdma = cma_id->context; struct svc_xprt *xprt = &rdma->sc_xprt; - trace_svcrdma_cm_event(event, sap); - switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: - /* Accept complete */ - svc_xprt_get(xprt); - dprintk("svcrdma: Connection completed on DTO xprt=%p, " - "cm_id=%p\n", xprt, cma_id); clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); + + /* Handle any requests that were received while + * CONN_PENDING was set. */ svc_xprt_enqueue(xprt); break; case RDMA_CM_EVENT_DISCONNECTED: - dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", - xprt, cma_id); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); - break; case RDMA_CM_EVENT_DEVICE_REMOVAL: - dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " - "event = %s (%d)\n", cma_id, xprt, - rdma_event_msg(event->event), event->event); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); + svc_xprt_deferred_close(xprt); break; default: - dprintk("svcrdma: Unexpected event on DTO endpoint %p, " - "event = %s (%d)\n", cma_id, - rdma_event_msg(event->event), event->event); break; } return 0; @@ -316,22 +304,18 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct svcxprt_rdma *cma_xprt; int ret; - dprintk("svcrdma: Creating RDMA listener\n"); - if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) { - dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family); + if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6) return ERR_PTR(-EAFNOSUPPORT); - } cma_xprt = svc_rdma_create_xprt(serv, net); if (!cma_xprt) return ERR_PTR(-ENOMEM); set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); - listen_id = rdma_create_id(net, rdma_listen_handler, cma_xprt, + listen_id = rdma_create_id(net, svc_rdma_listen_handler, cma_xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(listen_id)) { ret = PTR_ERR(listen_id); - dprintk("svcrdma: rdma_create_id failed = %d\n", ret); goto err0; } @@ -340,23 +324,17 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, */ #if IS_ENABLED(CONFIG_IPV6) ret = rdma_set_afonly(listen_id, 1); - if (ret) { - dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret); + if (ret) goto err1; - } #endif ret = rdma_bind_addr(listen_id, sa); - if (ret) { - dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); + if (ret) goto err1; - } cma_xprt->sc_cm_id = listen_id; ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); - if (ret) { - dprintk("svcrdma: rdma_listen failed = %d\n", ret); + if (ret) goto err1; - } /* * We need to use the address from the cm_id in case the @@ -412,9 +390,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (!newxprt) return NULL; - dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", - newxprt, newxprt->sc_cm_id); - dev = newxprt->sc_cm_id->device; newxprt->sc_port_num = newxprt->sc_cm_id->port_num; @@ -429,11 +404,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = svcrdma_max_requests; newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; - rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; + newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH; + rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + + newxprt->sc_recv_batch; if (rq_depth > dev->attrs.max_qp_wr) { pr_warn("svcrdma: reducing receive depth to %d\n", dev->attrs.max_qp_wr); rq_depth = dev->attrs.max_qp_wr; + newxprt->sc_recv_batch = 1; newxprt->sc_max_requests = rq_depth - 2; newxprt->sc_max_bc_requests = 2; } @@ -450,21 +428,17 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_pd = ib_alloc_pd(dev, 0); if (IS_ERR(newxprt->sc_pd)) { - dprintk("svcrdma: error creating PD for connect request\n"); + trace_svcrdma_pd_err(newxprt, PTR_ERR(newxprt->sc_pd)); goto errout; } newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth, IB_POLL_WORKQUEUE); - if (IS_ERR(newxprt->sc_sq_cq)) { - dprintk("svcrdma: error creating SQ CQ for connect request\n"); + if (IS_ERR(newxprt->sc_sq_cq)) goto errout; - } newxprt->sc_rq_cq = ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE); - if (IS_ERR(newxprt->sc_rq_cq)) { - dprintk("svcrdma: error creating RQ CQ for connect request\n"); + if (IS_ERR(newxprt->sc_rq_cq)) goto errout; - } memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; @@ -488,7 +462,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); if (ret) { - dprintk("svcrdma: failed to create QP, ret=%d\n", ret); + trace_svcrdma_qp_err(newxprt, ret); goto errout; } newxprt->sc_qp = newxprt->sc_cm_id->qp; @@ -496,15 +470,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) newxprt->sc_snd_w_inv = false; if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) && - !rdma_ib_or_roce(dev, newxprt->sc_port_num)) + !rdma_ib_or_roce(dev, newxprt->sc_port_num)) { + trace_svcrdma_fabric_err(newxprt, -EINVAL); goto errout; + } if (!svc_rdma_post_recvs(newxprt)) goto errout; - /* Swap out the handler */ - newxprt->sc_cm_id->event_handler = rdma_cma_handler; - /* Construct RDMA-CM private message */ pmsg.cp_magic = rpcrdma_cmp_magic; pmsg.cp_version = RPCRDMA_CMP_VERSION; @@ -519,15 +492,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) conn_param.initiator_depth = min_t(int, newxprt->sc_ord, dev->attrs.max_qp_init_rd_atom); if (!conn_param.initiator_depth) { - dprintk("svcrdma: invalid ORD setting\n"); ret = -EINVAL; + trace_svcrdma_initdepth_err(newxprt, ret); goto errout; } conn_param.private_data = &pmsg; conn_param.private_data_len = sizeof(pmsg); + rdma_lock_handler(newxprt->sc_cm_id); + newxprt->sc_cm_id->event_handler = svc_rdma_cma_handler; ret = rdma_accept(newxprt->sc_cm_id, &conn_param); - if (ret) + rdma_unlock_handler(newxprt->sc_cm_id); + if (ret) { + trace_svcrdma_accept_err(newxprt, ret); goto errout; + } #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) dprintk("svcrdma: new connection %p accepted:\n", newxprt); @@ -542,12 +520,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk(" ord : %d\n", conn_param.initiator_depth); #endif - trace_svcrdma_xprt_accept(&newxprt->sc_xprt); return &newxprt->sc_xprt; errout: - dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); - trace_svcrdma_xprt_fail(&newxprt->sc_xprt); /* Take a reference in case the DTO handler runs */ svc_xprt_get(&newxprt->sc_xprt); if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) @@ -558,28 +533,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) return NULL; } -static void svc_rdma_release_rqst(struct svc_rqst *rqstp) -{ -} - -/* - * When connected, an svc_xprt has at least two references: - * - * - A reference held by the cm_id between the ESTABLISHED and - * DISCONNECTED events. If the remote peer disconnected first, this - * reference could be gone. - * - * - A reference held by the svc_recv code that called this function - * as part of close processing. - * - * At a minimum one references should still be held. - */ static void svc_rdma_detach(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - /* Disconnect and flush posted WQE */ rdma_disconnect(rdma->sc_cm_id); } @@ -587,21 +545,13 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - struct svc_xprt *xprt = &rdma->sc_xprt; - - trace_svcrdma_xprt_free(xprt); + /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); svc_rdma_flush_recv_queues(rdma); - /* Final put of backchannel client transport */ - if (xprt->xpt_bc_xprt) { - xprt_put(xprt->xpt_bc_xprt); - xprt->xpt_bc_xprt = NULL; - } - svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_send_ctxts_destroy(rdma); svc_rdma_recv_ctxts_destroy(rdma); @@ -652,7 +602,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) static void svc_rdma_secure_port(struct svc_rqst *rqstp) { - set_bit(RQ_SECURE, &rqstp->rq_flags); + __set_bit(RQ_SECURE, &rqstp->rq_flags); } static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 3cfeba68ee9a..10bb2b929c6d 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -60,19 +60,16 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - /* * tunables */ -unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; +static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; int xprt_rdma_pad_optimize; +static struct xprt_class xprt_rdma; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -238,26 +235,35 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; + unsigned int pflags = current->flags; int rc; - rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (atomic_read(&xprt->swapper)) + current->flags |= PF_MEMALLOC; + rc = rpcrdma_xprt_connect(r_xprt); xprt_clear_connecting(xprt); - if (r_xprt->rx_ep.rep_connected > 0) { + if (!rc) { + xprt->connect_cookie++; xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; xprt_set_connected(xprt); rc = -EAGAIN; - } + } else + rpcrdma_xprt_disconnect(r_xprt); + xprt_unlock_connect(xprt, r_xprt); xprt_wake_pending_tasks(xprt, rc); + current_restore_flags(pflags, PF_MEMALLOC); } /** * xprt_rdma_inject_disconnect - inject a connection fault * @xprt: transport context * - * If @xprt is connected, disconnect it to simulate spurious connection - * loss. + * If @xprt is connected, disconnect it to simulate spurious + * connection loss. Caller must hold @xprt's send lock to + * ensure that data structures and hardware resources are + * stable during the rdma_disconnect() call. */ static void xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) @@ -265,7 +271,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_op_inject_dsc(r_xprt); - rdma_disconnect(r_xprt->rx_ia.ri_id); + rdma_disconnect(r_xprt->rx_ep->re_id); } /** @@ -280,13 +286,10 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - trace_xprtrdma_op_destroy(r_xprt); - cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - rpcrdma_ep_destroy(r_xprt); + rpcrdma_xprt_disconnect(r_xprt); rpcrdma_buffer_destroy(&r_xprt->rx_buf); - rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); xprt_free(xprt); @@ -316,10 +319,15 @@ xprt_setup_rdma(struct xprt_create *args) if (args->addrlen > sizeof(xprt->addr)) return ERR_PTR(-EBADF); + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-EIO); + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, xprt_rdma_slot_table_entries); - if (!xprt) + if (!xprt) { + module_put(THIS_MODULE); return ERR_PTR(-ENOMEM); + } xprt->timeout = &xprt_rdma_default_timeout; xprt->connect_timeout = xprt->timeout->to_initval; @@ -339,6 +347,7 @@ xprt_setup_rdma(struct xprt_create *args) /* Ensure xprt->addr holds valid server TCP (not RDMA) * address, for any side protocols which peek at it */ xprt->prot = IPPROTO_TCP; + xprt->xprt_class = &xprt_rdma; xprt->addrlen = args->addrlen; memcpy(&xprt->addr, sap, xprt->addrlen); @@ -347,43 +356,20 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_format_addresses(xprt, sap); new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt); - if (rc) - goto out1; - - rc = rpcrdma_ep_create(new_xprt); - if (rc) - goto out2; - rc = rpcrdma_buffer_create(new_xprt); - if (rc) - goto out3; - - if (!try_module_get(THIS_MODULE)) - goto out4; + if (rc) { + xprt_rdma_free_addresses(xprt); + xprt_free(xprt); + module_put(THIS_MODULE); + return ERR_PTR(rc); + } INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); + xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; - dprintk("RPC: %s: %s:%s\n", __func__, - xprt->address_strings[RPC_DISPLAY_ADDR], - xprt->address_strings[RPC_DISPLAY_PORT]); - trace_xprtrdma_create(new_xprt); return xprt; - -out4: - rpcrdma_buffer_destroy(&new_xprt->rx_buf); - rc = -ENODEV; -out3: - rpcrdma_ep_destroy(new_xprt); -out2: - rpcrdma_ia_close(&new_xprt->rx_ia); -out1: - trace_xprtrdma_op_destroy(new_xprt); - xprt_rdma_free_addresses(xprt); - xprt_free(xprt); - return ERR_PTR(rc); } /** @@ -398,26 +384,9 @@ out1: void xprt_rdma_close(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - might_sleep(); - trace_xprtrdma_op_close(r_xprt); + rpcrdma_xprt_disconnect(r_xprt); - /* Prevent marshaling and sending of new requests */ - xprt_clear_connected(xprt); - - if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) { - rpcrdma_ia_remove(ia); - goto out; - } - - if (ep->rep_connected == -ENODEV) - return; - rpcrdma_ep_disconnect(ep, ia); - -out: xprt->reestablish_timeout = 0; ++xprt->connect_cookie; xprt_disconnect_done(xprt); @@ -445,9 +414,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); snprintf(buf, sizeof(buf), "%4hx", port); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); - - trace_xprtrdma_op_setport(container_of(xprt, struct rpcrdma_xprt, - rx_xprt)); } /** @@ -517,16 +483,18 @@ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned long delay; + WARN_ON_ONCE(!xprt_lock_connect(xprt, task, r_xprt)); + delay = 0; - if (r_xprt->rx_ep.rep_connected != 0) { + if (ep && ep->re_connect_status != 0) { delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } trace_xprtrdma_op_connect(r_xprt, delay); - queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker, - delay); + queue_delayed_work(system_long_wq, &r_xprt->rx_connect_worker, delay); } /** @@ -552,9 +520,8 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) return; out_sleep: - set_bit(XPRT_CONGESTED, &xprt->state); - rpc_sleep_on(&xprt->backlog, task, NULL); - task->tk_status = -EAGAIN; + task->tk_status = -ENOMEM; + xprt_add_backlog(xprt, task); } /** @@ -569,10 +536,11 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst) struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, rx_xprt); - memset(rqst, 0, sizeof(*rqst)); - rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); - if (unlikely(!rpc_wake_up_next(&xprt->backlog))) - clear_bit(XPRT_CONGESTED, &xprt->state); + rpcrdma_reply_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); + if (!xprt_wake_up_backlog(xprt, rqst)) { + memset(rqst, 0, sizeof(*rqst)); + rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); + } } static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt, @@ -602,11 +570,7 @@ xprt_rdma_allocate(struct rpc_task *task) struct rpc_rqst *rqst = task->tk_rqstp; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - gfp_t flags; - - flags = RPCRDMA_DEF_GFP; - if (RPC_IS_SWAPPER(task)) - flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; + gfp_t flags = rpc_task_gfp_mask(); if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize, flags)) @@ -617,11 +581,9 @@ xprt_rdma_allocate(struct rpc_task *task) rqst->rq_buffer = rdmab_data(req->rl_sendbuf); rqst->rq_rbuffer = rdmab_data(req->rl_recvbuf); - trace_xprtrdma_op_allocate(task, req); return 0; out_fail: - trace_xprtrdma_op_allocate(task, NULL); return -ENOMEM; } @@ -635,13 +597,12 @@ static void xprt_rdma_free(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - trace_xprtrdma_op_free(task, req); - - if (!list_empty(&req->rl_registered)) - frwr_unmap_sync(r_xprt, req); + if (unlikely(!list_empty(&req->rl_registered))) { + trace_xprtrdma_mrs_zap(task); + frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req); + } /* XXX: If the RPC is completing because of a signal and * not because a reply was received, we ought to ensure @@ -694,7 +655,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst) goto drop_connection; rqst->rq_xtime = ktime_get(); - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + if (frwr_send(r_xprt, req)) goto drop_connection; rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; @@ -806,6 +767,7 @@ static struct xprt_class xprt_rdma = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_RDMA, .setup = xprt_setup_rdma, + .netid = { "rdma", "rdma6", "" }, }; void xprt_rdma_cleanup(void) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 353f61ac8d51..44b87e4274b4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -63,17 +63,6 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -/* - * Globals/Macros - */ - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - -/* - * internal functions - */ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt); static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, @@ -84,9 +73,10 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_ep_get(struct rpcrdma_ep *ep); +static int rpcrdma_ep_put(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * -rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, - gfp_t flags); +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction); static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); @@ -96,72 +86,86 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); */ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rdma_cm_id *id = ep->re_id; + + /* Wait for rpcrdma_post_recvs() to leave its critical + * section. + */ + if (atomic_inc_return(&ep->re_receiving) > 1) + wait_for_completion(&ep->re_done); /* Flush Receives, then wait for deferred Reply work * to complete. */ - ib_drain_rq(ia->ri_id->qp); + ib_drain_rq(id->qp); /* Deferred Reply processing might have scheduled * local invalidations. */ - ib_drain_sq(ia->ri_id->qp); + ib_drain_sq(id->qp); + + rpcrdma_ep_put(ep); +} + +/* Ensure xprt_force_disconnect() is invoked exactly once when a + * connection is closed or lost. (The important thing is it needs + * to be invoked "at least" once). + */ +void rpcrdma_force_disconnect(struct rpcrdma_ep *ep) +{ + if (atomic_add_unless(&ep->re_force_disconnect, 1, 1)) + xprt_force_disconnect(ep->re_xprt); } /** - * rpcrdma_qp_event_handler - Handle one QP event (error notification) - * @event: details of the event - * @context: ep that owns QP where event occurred + * rpcrdma_flush_disconnect - Disconnect on flushed completion + * @r_xprt: transport to disconnect + * @wc: work completion entry * - * Called from the RDMA provider (device driver) possibly in an interrupt - * context. + * Must be called in process context. */ -static void -rpcrdma_qp_event_handler(struct ib_event *event, void *context) +void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc) { - struct rpcrdma_ep *ep = context; - struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, - rx_ep); - - trace_xprtrdma_qp_event(r_xprt, event); + if (wc->status != IB_WC_SUCCESS) + rpcrdma_force_disconnect(r_xprt->rx_ep); } /** * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: completion queue - * @wc: completed WR + * @wc: WCE for a completed Send WR * */ -static void -rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) +static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_sendctx *sc = container_of(cqe, struct rpcrdma_sendctx, sc_cqe); + struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_send(sc, wc); - rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc); + trace_xprtrdma_wc_send(wc, &sc->sc_cid); + rpcrdma_sendctx_put_locked(r_xprt, sc); + rpcrdma_flush_disconnect(r_xprt, wc); } /** * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed Receive WR * */ -static void -rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) +static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, rr_cqe); - struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_receive(wc); - --r_xprt->rx_ep.rep_receive_count; + trace_xprtrdma_wc_receive(wc, &rep->rr_cid); + --r_xprt->rx_ep->re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -178,35 +182,33 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) return; out_flushed: - rpcrdma_rep_destroy(rep); + rpcrdma_flush_disconnect(r_xprt, wc); + rpcrdma_rep_put(&r_xprt->rx_buf, rep); } -static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, +static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep, struct rdma_conn_param *param) { const struct rpcrdma_connect_private *pmsg = param->private_data; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ - r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; rsize = RPCRDMA_V1_DEF_INLINE_SIZE; wsize = RPCRDMA_V1_DEF_INLINE_SIZE; if (pmsg && pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { - r_xprt->rx_ia.ri_implicit_roundup = true; rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } - if (rsize < ep->rep_inline_recv) - ep->rep_inline_recv = rsize; - if (wsize < ep->rep_inline_send) - ep->rep_inline_send = wsize; + if (rsize < ep->re_inline_recv) + ep->re_inline_recv = rsize; + if (wsize < ep->re_inline_send) + ep->re_inline_send = wsize; - rpcrdma_set_max_header_sizes(r_xprt); + rpcrdma_set_max_header_sizes(ep); } /** @@ -220,116 +222,100 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct rpcrdma_xprt *r_xprt = id->context; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr; + struct rpcrdma_ep *ep = id->context; might_sleep(); - trace_xprtrdma_cm_event(r_xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: - ia->ri_async_rc = 0; - complete(&ia->ri_done); + ep->re_async_rc = 0; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_ADDR_ERROR: - ia->ri_async_rc = -EPROTO; - complete(&ia->ri_done); + ep->re_async_rc = -EPROTO; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_ROUTE_ERROR: - ia->ri_async_rc = -ENETUNREACH; - complete(&ia->ri_done); + ep->re_async_rc = -ENETUNREACH; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: removing device %s for %s:%s\n", - ia->ri_id->device->name, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); -#endif - init_completion(&ia->ri_remove_done); - set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); - ep->rep_connected = -ENODEV; - xprt_force_disconnect(xprt); - wait_for_completion(&ia->ri_remove_done); - - ia->ri_id = NULL; - /* Return 1 to ensure the core destroys the id. */ - return 1; + pr_info("rpcrdma: removing device %s for %pISpc\n", + ep->re_id->device->name, sap); + fallthrough; + case RDMA_CM_EVENT_ADDR_CHANGE: + ep->re_connect_status = -ENODEV; + goto disconnected; case RDMA_CM_EVENT_ESTABLISHED: - ++xprt->connect_cookie; - ep->rep_connected = 1; - rpcrdma_update_cm_private(r_xprt, &event->param.conn); - trace_xprtrdma_inline_thresh(r_xprt); - wake_up_all(&ep->rep_connect_wait); + rpcrdma_ep_get(ep); + ep->re_connect_status = 1; + rpcrdma_update_cm_private(ep, &event->param.conn); + trace_xprtrdma_inline_thresh(ep); + wake_up_all(&ep->re_connect_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: - ep->rep_connected = -ENOTCONN; - goto disconnected; + ep->re_connect_status = -ENOTCONN; + goto wake_connect_worker; case RDMA_CM_EVENT_UNREACHABLE: - ep->rep_connected = -ENETUNREACH; - goto disconnected; + ep->re_connect_status = -ENETUNREACH; + goto wake_connect_worker; case RDMA_CM_EVENT_REJECTED: - dprintk("rpcrdma: connection to %s:%s rejected: %s\n", - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - rdma_reject_msg(id, event->status)); - ep->rep_connected = -ECONNREFUSED; + ep->re_connect_status = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) - ep->rep_connected = -EAGAIN; - goto disconnected; + ep->re_connect_status = -ENOTCONN; +wake_connect_worker: + wake_up_all(&ep->re_connect_wait); + return 0; case RDMA_CM_EVENT_DISCONNECTED: - ep->rep_connected = -ECONNABORTED; + ep->re_connect_status = -ECONNABORTED; disconnected: - xprt_force_disconnect(xprt); - wake_up_all(&ep->rep_connect_wait); - break; + rpcrdma_force_disconnect(ep); + return rpcrdma_ep_put(ep); default: break; } - dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - ia->ri_id->device->name, rdma_event_msg(event->event)); return 0; } -static struct rdma_cm_id * -rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) +static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_ep *ep) { unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rdma_cm_id *id; int rc; - init_completion(&ia->ri_done); + init_completion(&ep->re_done); - id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, - xprt, RDMA_PS_TCP, IB_QPT_RC); + id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) return id; - ia->ri_async_rc = -ETIMEDOUT; - rc = rdma_resolve_addr(id, NULL, - (struct sockaddr *)&xprt->rx_xprt.addr, + ep->re_async_rc = -ETIMEDOUT; + rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr, RDMA_RESOLVE_TIMEOUT); if (rc) goto out; - rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); if (rc < 0) goto out; - rc = ia->ri_async_rc; + rc = ep->re_async_rc; if (rc) goto out; - ia->ri_async_rc = -ETIMEDOUT; + ep->re_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); if (rc) goto out; - rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); if (rc < 0) goto out; - rc = ia->ri_async_rc; + rc = ep->re_async_rc; if (rc) goto out; @@ -340,423 +326,248 @@ out: return ERR_PTR(rc); } -/* - * Exported functions. - */ - -/** - * rpcrdma_ia_open - Open and initialize an Interface Adapter. - * @xprt: transport with IA to (re)initialize - * - * Returns 0 on success, negative errno if an appropriate - * Interface Adapter could not be found and opened. - */ -int -rpcrdma_ia_open(struct rpcrdma_xprt *xprt) +static void rpcrdma_ep_destroy(struct kref *kref) { - struct rpcrdma_ia *ia = &xprt->rx_ia; - int rc; + struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref); - ia->ri_id = rpcrdma_create_id(xprt, ia); - if (IS_ERR(ia->ri_id)) { - rc = PTR_ERR(ia->ri_id); - goto out_err; + if (ep->re_id->qp) { + rdma_destroy_qp(ep->re_id); + ep->re_id->qp = NULL; } - ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0); - if (IS_ERR(ia->ri_pd)) { - rc = PTR_ERR(ia->ri_pd); - pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); - goto out_err; - } + if (ep->re_attr.recv_cq) + ib_free_cq(ep->re_attr.recv_cq); + ep->re_attr.recv_cq = NULL; + if (ep->re_attr.send_cq) + ib_free_cq(ep->re_attr.send_cq); + ep->re_attr.send_cq = NULL; - return 0; + if (ep->re_pd) + ib_dealloc_pd(ep->re_pd); + ep->re_pd = NULL; -out_err: - rpcrdma_ia_close(ia); - return rc; + kfree(ep); + module_put(THIS_MODULE); } -/** - * rpcrdma_ia_remove - Handle device driver unload - * @ia: interface adapter being removed - * - * Divest transport H/W resources associated with this adapter, - * but allow it to be restored later. - * - * Caller must hold the transport send lock. - */ -void -rpcrdma_ia_remove(struct rpcrdma_ia *ia) +static noinline void rpcrdma_ep_get(struct rpcrdma_ep *ep) { - struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, - rx_ia); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - - /* This is similar to rpcrdma_ep_destroy, but: - * - Don't cancel the connect worker. - * - Don't call rpcrdma_ep_disconnect, which waits - * for another conn upcall, which will deadlock. - * - rdma_disconnect is unneeded, the underlying - * connection is already gone. - */ - if (ia->ri_id->qp) { - rpcrdma_xprt_drain(r_xprt); - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - ib_free_cq(ep->rep_attr.recv_cq); - ep->rep_attr.recv_cq = NULL; - ib_free_cq(ep->rep_attr.send_cq); - ep->rep_attr.send_cq = NULL; - - /* The ULP is responsible for ensuring all DMA - * mappings and MRs are gone. - */ - rpcrdma_reps_unmap(r_xprt); - rpcrdma_reqs_reset(r_xprt); - rpcrdma_mrs_destroy(r_xprt); - rpcrdma_sendctxs_destroy(r_xprt); - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; - - /* Allow waiters to continue */ - complete(&ia->ri_remove_done); - - trace_xprtrdma_remove(r_xprt); + kref_get(&ep->re_kref); } -/** - * rpcrdma_ia_close - Clean up/close an IA. - * @ia: interface adapter to close - * +/* Returns: + * %0 if @ep still has a positive kref count, or + * %1 if @ep was destroyed successfully. */ -void -rpcrdma_ia_close(struct rpcrdma_ia *ia) +static noinline int rpcrdma_ep_put(struct rpcrdma_ep *ep) { - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { - if (ia->ri_id->qp) - rdma_destroy_qp(ia->ri_id); - rdma_destroy_id(ia->ri_id); - } - ia->ri_id = NULL; - - /* If the pd is still busy, xprtrdma missed freeing a resource */ - if (ia->ri_pd && !IS_ERR(ia->ri_pd)) - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; + return kref_put(&ep->re_kref, rpcrdma_ep_destroy); } -/** - * rpcrdma_ep_create - Create unconnected endpoint - * @r_xprt: transport to instantiate - * - * Returns zero on success, or a negative errno. - */ -int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) +static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; - struct ib_cq *sendcq, *recvcq; + struct rpcrdma_connect_private *pmsg; + struct ib_device *device; + struct rdma_cm_id *id; + struct rpcrdma_ep *ep; int rc; - ep->rep_max_requests = r_xprt->rx_xprt.max_reqs; - ep->rep_inline_send = xprt_rdma_max_inline_write; - ep->rep_inline_recv = xprt_rdma_max_inline_read; + ep = kzalloc(sizeof(*ep), XPRTRDMA_GFP_FLAGS); + if (!ep) + return -ENOTCONN; + ep->re_xprt = &r_xprt->rx_xprt; + kref_init(&ep->re_kref); - rc = frwr_query_device(r_xprt, ia->ri_id->device); - if (rc) - return rc; - r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests); - - ep->rep_attr.event_handler = rpcrdma_qp_event_handler; - ep->rep_attr.qp_context = ep; - ep->rep_attr.srq = NULL; - ep->rep_attr.cap.max_inline_data = 0; - ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; - ep->rep_attr.qp_type = IB_QPT_RC; - ep->rep_attr.port_num = ~0; - - dprintk("RPC: %s: requested max: dtos: send %d recv %d; " - "iovs: send %d recv %d\n", - __func__, - ep->rep_attr.cap.max_send_wr, - ep->rep_attr.cap.max_recv_wr, - ep->rep_attr.cap.max_send_sge, - ep->rep_attr.cap.max_recv_sge); - - ep->rep_send_batch = ep->rep_max_requests >> 3; - ep->rep_send_count = ep->rep_send_batch; - init_waitqueue_head(&ep->rep_connect_wait); - ep->rep_receive_count = 0; - - sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt, - ep->rep_attr.cap.max_send_wr + 1, - IB_POLL_WORKQUEUE); - if (IS_ERR(sendcq)) { - rc = PTR_ERR(sendcq); - goto out1; + id = rpcrdma_create_id(r_xprt, ep); + if (IS_ERR(id)) { + kfree(ep); + return PTR_ERR(id); } + __module_get(THIS_MODULE); + device = id->device; + ep->re_id = id; + reinit_completion(&ep->re_done); + + ep->re_max_requests = r_xprt->rx_xprt.max_reqs; + ep->re_inline_send = xprt_rdma_max_inline_write; + ep->re_inline_recv = xprt_rdma_max_inline_read; + rc = frwr_query_device(ep, device); + if (rc) + goto out_destroy; - recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL, - ep->rep_attr.cap.max_recv_wr + 1, - IB_POLL_WORKQUEUE); - if (IS_ERR(recvcq)) { - rc = PTR_ERR(recvcq); - goto out2; + r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests); + + ep->re_attr.srq = NULL; + ep->re_attr.cap.max_inline_data = 0; + ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + ep->re_attr.qp_type = IB_QPT_RC; + ep->re_attr.port_num = ~0; + + ep->re_send_batch = ep->re_max_requests >> 3; + ep->re_send_count = ep->re_send_batch; + init_waitqueue_head(&ep->re_connect_wait); + + ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt, + ep->re_attr.cap.max_send_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->re_attr.send_cq)) { + rc = PTR_ERR(ep->re_attr.send_cq); + ep->re_attr.send_cq = NULL; + goto out_destroy; } - ep->rep_attr.send_cq = sendcq; - ep->rep_attr.recv_cq = recvcq; + ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt, + ep->re_attr.cap.max_recv_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->re_attr.recv_cq)) { + rc = PTR_ERR(ep->re_attr.recv_cq); + ep->re_attr.recv_cq = NULL; + goto out_destroy; + } + ep->re_receive_count = 0; /* Initialize cma parameters */ - memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); + memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma)); /* Prepare RDMA-CM private message */ + pmsg = &ep->re_cm_private; pmsg->cp_magic = rpcrdma_cmp_magic; pmsg->cp_version = RPCRDMA_CMP_VERSION; pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; - pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send); - pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv); - ep->rep_remote_cma.private_data = pmsg; - ep->rep_remote_cma.private_data_len = sizeof(*pmsg); + pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv); + ep->re_remote_cma.private_data = pmsg; + ep->re_remote_cma.private_data_len = sizeof(*pmsg); /* Client offers RDMA Read but does not initiate */ - ep->rep_remote_cma.initiator_depth = 0; - ep->rep_remote_cma.responder_resources = - min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom); + ep->re_remote_cma.initiator_depth = 0; + ep->re_remote_cma.responder_resources = + min_t(int, U8_MAX, device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. RPC layer handles re-establishing * transport connection and retransmission. */ - ep->rep_remote_cma.retry_count = 6; + ep->re_remote_cma.retry_count = 6; /* RPC-over-RDMA handles its own flow control. In addition, * make all RNR NAKs visible so we know that RPC-over-RDMA * flow control is working correctly (no NAKs should be seen). */ - ep->rep_remote_cma.flow_control = 0; - ep->rep_remote_cma.rnr_retry_count = 0; - - return 0; - -out2: - ib_free_cq(sendcq); -out1: - return rc; -} - -/** - * rpcrdma_ep_destroy - Disconnect and destroy endpoint. - * @r_xprt: transport instance to shut down - * - */ -void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - if (ia->ri_id && ia->ri_id->qp) { - rpcrdma_ep_disconnect(ep, ia); - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - - if (ep->rep_attr.recv_cq) - ib_free_cq(ep->rep_attr.recv_cq); - if (ep->rep_attr.send_cq) - ib_free_cq(ep->rep_attr.send_cq); -} - -/* Re-establish a connection after a device removal event. - * Unlike a normal reconnection, a fresh PD and a new set - * of MRs and buffers is needed. - */ -static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, - struct ib_qp_init_attr *qp_init_attr) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - int rc, err; - - trace_xprtrdma_reinsert(r_xprt); - - rc = -EHOSTUNREACH; - if (rpcrdma_ia_open(r_xprt)) - goto out1; - - rc = -ENOMEM; - err = rpcrdma_ep_create(r_xprt); - if (err) { - pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); - goto out2; - } - memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr)); - - rc = -ENETUNREACH; - err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr); - if (err) { - pr_err("rpcrdma: rdma_create_qp returned %d\n", err); - goto out3; - } - return 0; - -out3: - rpcrdma_ep_destroy(r_xprt); -out2: - rpcrdma_ia_close(ia); -out1: - return rc; -} - -static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, - struct ib_qp_init_attr *qp_init_attr) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rdma_cm_id *id, *old; - int err, rc; - - rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia); + ep->re_remote_cma.flow_control = 0; + ep->re_remote_cma.rnr_retry_count = 0; - rc = -EHOSTUNREACH; - id = rpcrdma_create_id(r_xprt, ia); - if (IS_ERR(id)) - goto out; - - /* As long as the new ID points to the same device as the - * old ID, we can reuse the transport's existing PD and all - * previously allocated MRs. Also, the same device means - * the transport's previous DMA mappings are still valid. - * - * This is a sanity check only. There should be no way these - * point to two different devices here. - */ - old = id; - rc = -ENETUNREACH; - if (ia->ri_id->device != id->device) { - pr_err("rpcrdma: can't reconnect on different device!\n"); + ep->re_pd = ib_alloc_pd(device, 0); + if (IS_ERR(ep->re_pd)) { + rc = PTR_ERR(ep->re_pd); + ep->re_pd = NULL; goto out_destroy; } - err = rdma_create_qp(id, ia->ri_pd, qp_init_attr); - if (err) + rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr); + if (rc) goto out_destroy; - /* Atomically replace the transport's ID and QP. */ - rc = 0; - old = ia->ri_id; - ia->ri_id = id; - rdma_destroy_qp(old); + r_xprt->rx_ep = ep; + return 0; out_destroy: - rdma_destroy_id(old); -out: + rpcrdma_ep_put(ep); + rdma_destroy_id(id); return rc; } -/* - * Connect unconnected endpoint. +/** + * rpcrdma_xprt_connect - Connect an unconnected transport + * @r_xprt: controlling transport instance + * + * Returns 0 on success or a negative errno. */ -int -rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, - rx_ia); struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct ib_qp_init_attr qp_init_attr; + struct rpcrdma_ep *ep; int rc; -retry: - memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr)); - switch (ep->rep_connected) { - case 0: - rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr); - if (rc) { - rc = -ENETUNREACH; - goto out_noupdate; - } - break; - case -ENODEV: - rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr); - if (rc) - goto out_noupdate; - break; - default: - rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr); - if (rc) - goto out; - } + rc = rpcrdma_ep_create(r_xprt); + if (rc) + return rc; + ep = r_xprt->rx_ep; - ep->rep_connected = 0; xprt_clear_connected(xprt); - rpcrdma_reset_cwnd(r_xprt); - rpcrdma_post_recvs(r_xprt, true); - rc = rpcrdma_sendctxs_create(r_xprt); - if (rc) - goto out; + /* Bump the ep's reference count while there are + * outstanding Receives. + */ + rpcrdma_ep_get(ep); + rpcrdma_post_recvs(r_xprt, 1, true); - rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); + rc = rdma_connect(ep->re_id, &ep->re_remote_cma); if (rc) goto out; if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; - wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); - if (ep->rep_connected <= 0) { - if (ep->rep_connected == -EAGAIN) - goto retry; - rc = ep->rep_connected; + wait_event_interruptible(ep->re_connect_wait, + ep->re_connect_status != 0); + if (ep->re_connect_status <= 0) { + rc = ep->re_connect_status; + goto out; + } + + rc = rpcrdma_sendctxs_create(r_xprt); + if (rc) { + rc = -ENOTCONN; goto out; } rc = rpcrdma_reqs_setup(r_xprt); if (rc) { - rpcrdma_ep_disconnect(ep, ia); + rc = -ENOTCONN; goto out; } rpcrdma_mrs_create(r_xprt); + frwr_wp_create(r_xprt); out: - if (rc) - ep->rep_connected = rc; - -out_noupdate: trace_xprtrdma_connect(r_xprt, rc); return rc; } /** - * rpcrdma_ep_disconnect - Disconnect underlying transport - * @ep: endpoint to disconnect - * @ia: associated interface adapter + * rpcrdma_xprt_disconnect - Disconnect underlying transport + * @r_xprt: controlling transport instance * * Caller serializes. Either the transport send lock is held, * or we're being called to destroy the transport. + * + * On return, @r_xprt is completely divested of all hardware + * resources and prepared for the next ->connect operation. */ -void -rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, - rx_ep); + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rdma_cm_id *id; int rc; - /* returns without wait if ID is not connected */ - rc = rdma_disconnect(ia->ri_id); - if (!rc) - wait_event_interruptible(ep->rep_connect_wait, - ep->rep_connected != 1); - else - ep->rep_connected = rc; + if (!ep) + return; + + id = ep->re_id; + rc = rdma_disconnect(id); trace_xprtrdma_disconnect(r_xprt, rc); rpcrdma_xprt_drain(r_xprt); + rpcrdma_reps_unmap(r_xprt); rpcrdma_reqs_reset(r_xprt); rpcrdma_mrs_destroy(r_xprt); rpcrdma_sendctxs_destroy(r_xprt); + + if (rpcrdma_ep_put(ep)) + rdma_destroy_id(id); + + r_xprt->rx_ep = NULL; } /* Fixed-size circular FIFO queue. This implementation is wait-free and @@ -793,12 +604,15 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) { struct rpcrdma_sendctx *sc; - sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge), - GFP_KERNEL); + sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge), + XPRTRDMA_GFP_FLAGS); if (!sc) return NULL; sc->sc_cqe.done = rpcrdma_wc_send; + sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id; + sc->sc_cid.ci_completion_id = + atomic_inc_return(&ep->re_completion_ids); return sc; } @@ -813,14 +627,14 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * the ->send_request call to fail temporarily before too many * Sends are posted. */ - i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS; - buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); + i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS; + buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), XPRTRDMA_GFP_FLAGS); if (!buf->rb_sc_ctxs) return -ENOMEM; buf->rb_sc_last = i - 1; for (i = 0; i <= buf->rb_sc_last; i++) { - sc = rpcrdma_sendctx_create(&r_xprt->rx_ep); + sc = rpcrdma_sendctx_create(r_xprt->rx_ep); if (!sc) return -ENOMEM; @@ -924,25 +738,26 @@ static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct ib_device *device = ep->re_id->device; unsigned int count; - for (count = 0; count < ia->ri_max_rdma_segs; count++) { + /* Try to allocate enough to perform one full-sized I/O */ + for (count = 0; count < ep->re_max_rdma_segs; count++) { struct rpcrdma_mr *mr; int rc; - mr = kzalloc(sizeof(*mr), GFP_NOFS); + mr = kzalloc_node(sizeof(*mr), XPRTRDMA_GFP_FLAGS, + ibdev_to_node(device)); if (!mr) break; - rc = frwr_init_mr(ia, mr); + rc = frwr_mr_init(r_xprt, mr); if (rc) { kfree(mr); break; } - mr->mr_xprt = r_xprt; - spin_lock(&buf->rb_lock); rpcrdma_mr_push(mr, &buf->rb_mrs); list_add(&mr->mr_all, &buf->rb_all_mrs); @@ -973,43 +788,38 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; - /* If there is no underlying device, it's no use to - * wake the refresh worker. + /* If there is no underlying connection, it's no use + * to wake the refresh worker. */ - if (ep->rep_connected != -ENODEV) { - /* The work is scheduled on a WQ_MEM_RECLAIM - * workqueue in order to prevent MR allocation - * from recursing into NFS during direct reclaim. - */ - queue_work(xprtiod_workqueue, &buf->rb_refresh_worker); - } + if (ep->re_connect_status != 1) + return; + queue_work(system_highpri_wq, &buf->rb_refresh_worker); } /** * rpcrdma_req_create - Allocate an rpcrdma_req object * @r_xprt: controlling r_xprt * @size: initial size, in bytes, of send and receive buffers - * @flags: GFP flags passed to memory allocators * * Returns an allocated and fully initialized rpcrdma_req or NULL. */ -struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, - gfp_t flags) +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, + size_t size) { struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_req *req; - req = kzalloc(sizeof(*req), flags); + req = kzalloc(sizeof(*req), XPRTRDMA_GFP_FLAGS); if (req == NULL) goto out1; - req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags); + req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE); if (!req->rl_sendbuf) goto out2; - req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags); + req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE); if (!req->rl_recvbuf) goto out3; @@ -1042,10 +852,10 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Compute maximum header buffer size in bytes */ maxhdrsize = rpcrdma_fixed_maxsz + 3 + - r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz; + r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz; maxhdrsize *= sizeof(__be32); rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), - DMA_TO_DEVICE, GFP_KERNEL); + DMA_TO_DEVICE); if (!rb) goto out; @@ -1091,6 +901,8 @@ static void rpcrdma_req_reset(struct rpcrdma_req *req) rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); + + frwr_reset(req); } /* ASSUMPTION: the rb_allreqs list is stable for the duration, @@ -1107,27 +919,28 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt) rpcrdma_req_reset(req); } -/* No locking needed here. This function is called only by the - * Receive completion handler. - */ static noinline struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) { + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - rep = kzalloc(sizeof(*rep), GFP_KERNEL); + rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS); if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv, - DMA_FROM_DEVICE, GFP_KERNEL); + rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv, + DMA_FROM_DEVICE); if (!rep->rr_rdmabuf) goto out_free; if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) goto out_free_regbuf; + rep->rr_cid.ci_completion_id = + atomic_inc_return(&r_xprt->rx_ep->re_completion_ids); + xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; @@ -1137,7 +950,10 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; rep->rr_temp = temp; - list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps); + + spin_lock(&buf->rb_lock); + list_add(&rep->rr_all, &buf->rb_all_reps); + spin_unlock(&buf->rb_lock); return rep; out_free_regbuf: @@ -1148,16 +964,23 @@ out: return NULL; } -/* No locking needed here. This function is invoked only by the - * Receive completion handler, or during transport shutdown. - */ -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +static void rpcrdma_rep_free(struct rpcrdma_rep *rep) { - list_del(&rep->rr_all); rpcrdma_regbuf_free(rep->rr_rdmabuf); kfree(rep); } +static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +{ + struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf; + + spin_lock(&buf->rb_lock); + list_del(&rep->rr_all); + spin_unlock(&buf->rb_lock); + + rpcrdma_rep_free(rep); +} + static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) { struct llist_node *node; @@ -1169,12 +992,21 @@ static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) return llist_entry(node, struct rpcrdma_rep, rr_node); } -static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, - struct rpcrdma_rep *rep) +/** + * rpcrdma_rep_put - Release rpcrdma_rep back to free list + * @buf: buffer pool + * @rep: rep to release + * + */ +void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep) { llist_add(&rep->rr_node, &buf->rb_free_reps); } +/* Caller must ensure the QP is quiescent (RQ is drained) before + * invoking this function, to guarantee rb_all_reps is not + * changing. + */ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; @@ -1182,7 +1014,7 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) list_for_each_entry(rep, &buf->rb_all_reps, rr_all) { rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); - rep->rr_temp = true; + rep->rr_temp = true; /* Mark this rep for destruction */ } } @@ -1190,8 +1022,18 @@ static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_rep *rep; - while ((rep = rpcrdma_rep_get_locked(buf)) != NULL) - rpcrdma_rep_destroy(rep); + spin_lock(&buf->rb_lock); + while ((rep = list_first_entry_or_null(&buf->rb_all_reps, + struct rpcrdma_rep, + rr_all)) != NULL) { + list_del(&rep->rr_all); + spin_unlock(&buf->rb_lock); + + rpcrdma_rep_free(rep); + + spin_lock(&buf->rb_lock); + } + spin_unlock(&buf->rb_lock); } /** @@ -1219,8 +1061,8 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) for (i = 0; i < r_xprt->rx_xprt.max_reqs; i++) { struct rpcrdma_req *req; - req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2, - GFP_KERNEL); + req = rpcrdma_req_create(r_xprt, + RPCRDMA_V1_DEF_INLINE_SIZE * 2); if (!req) goto out; list_add(&req->rl_list, &buf->rb_send_bufs); @@ -1254,7 +1096,7 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req) list_del(&mr->mr_all); spin_unlock(&buf->rb_lock); - frwr_release_mr(mr); + frwr_mr_release(mr); } rpcrdma_regbuf_free(req->rl_recvbuf); @@ -1285,7 +1127,7 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt) list_del(&mr->mr_all); spin_unlock(&buf->rb_lock); - frwr_release_mr(mr); + frwr_mr_release(mr); spin_lock(&buf->rb_lock); } @@ -1335,22 +1177,17 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) } /** - * rpcrdma_mr_put - DMA unmap an MR and release it - * @mr: MR to release + * rpcrdma_reply_put - Put reply buffers back into pool + * @buffers: buffer pool + * @req: object to return * */ -void rpcrdma_mr_put(struct rpcrdma_mr *mr) +void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) { - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; + if (req->rl_reply) { + rpcrdma_rep_put(buffers, req->rl_reply); + req->rl_reply = NULL; } - - rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); } /** @@ -1381,26 +1218,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) */ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) { - if (req->rl_reply) - rpcrdma_rep_put(buffers, req->rl_reply); - req->rl_reply = NULL; + rpcrdma_reply_put(buffers, req); spin_lock(&buffers->rb_lock); list_add(&req->rl_list, &buffers->rb_send_bufs); spin_unlock(&buffers->rb_lock); } -/** - * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list - * @rep: rep to release - * - * Used after error conditions. - */ -void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) -{ - rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep); -} - /* Returns a pointer to a rpcrdma_regbuf object, or NULL. * * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for @@ -1408,15 +1232,14 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * or Replies they may be registered externally via frwr_map. */ static struct rpcrdma_regbuf * -rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, - gfp_t flags) +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) { struct rpcrdma_regbuf *rb; - rb = kmalloc(sizeof(*rb), flags); + rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS); if (!rb) return NULL; - rb->rg_data = kmalloc(size, flags); + rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS); if (!rb->rg_data) { kfree(rb); return NULL; @@ -1463,7 +1286,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_regbuf *rb) { - struct ib_device *device = r_xprt->rx_ia.ri_id->device; + struct ib_device *device = r_xprt->rx_ep->re_id->device; if (rb->rg_direction == DMA_NONE) return false; @@ -1476,7 +1299,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, } rb->rg_device = device; - rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey; + rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey; return true; } @@ -1502,61 +1325,32 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) } /** - * rpcrdma_ep_post - Post WRs to a transport's Send Queue - * @ia: transport's device information - * @ep: transport's RDMA endpoint information - * @req: rpcrdma_req containing the Send WR to post - * - * Returns 0 if the post was successful, otherwise -ENOTCONN - * is returned. - */ -int -rpcrdma_ep_post(struct rpcrdma_ia *ia, - struct rpcrdma_ep *ep, - struct rpcrdma_req *req) -{ - struct ib_send_wr *send_wr = &req->rl_wr; - int rc; - - if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) { - send_wr->send_flags |= IB_SEND_SIGNALED; - ep->rep_send_count = ep->rep_send_batch; - } else { - send_wr->send_flags &= ~IB_SEND_SIGNALED; - --ep->rep_send_count; - } - - rc = frwr_send(ia, req); - trace_xprtrdma_post_send(req, rc); - if (rc) - return -ENOTCONN; - return 0; -} - -/** * rpcrdma_post_recvs - Refill the Receive Queue * @r_xprt: controlling transport instance - * @temp: mark Receive buffers to be deleted after use + * @needed: current credit grant + * @temp: mark Receive buffers to be deleted after one use * */ -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_recv_wr *wr, *bad_wr; struct rpcrdma_rep *rep; - int needed, count, rc; + int count, rc; rc = 0; count = 0; - needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); - if (likely(ep->rep_receive_count > needed)) + if (likely(ep->re_receive_count > needed)) goto out; - needed -= ep->rep_receive_count; + needed -= ep->re_receive_count; if (!temp) needed += RPCRDMA_MAX_RECV_BATCH; + if (atomic_inc_return(&ep->re_receiving) > 1) + goto out; + /* fast path: all needed reps can be found on the free list */ wr = NULL; while (needed) { @@ -1570,6 +1364,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!rep) break; + rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; trace_xprtrdma_post_recv(rep); rep->rr_recv_wr.next = wr; wr = &rep->rr_recv_wr; @@ -1579,20 +1374,24 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!wr) goto out; - rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, + rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); -out: - trace_xprtrdma_post_recvs(r_xprt, count, rc); if (rc) { + trace_xprtrdma_post_recvs_err(r_xprt, rc); for (wr = bad_wr; wr;) { struct rpcrdma_rep *rep; rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr); wr = wr->next; - rpcrdma_recv_buffer_put(rep); + rpcrdma_rep_put(buf, rep); --count; } } - ep->rep_receive_count += count; + if (atomic_dec_return(&ep->re_receiving) > 0) + complete(&ep->re_done); + +out: + trace_xprtrdma_post_recvs(r_xprt, count); + ep->re_receive_count += count; return; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 37d5080c250b..5e5ff6784ef5 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -53,6 +53,7 @@ #include <rdma/ib_verbs.h> /* RDMA verbs api */ #include <linux/sunrpc/clnt.h> /* rpc_xprt */ +#include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ @@ -65,48 +66,45 @@ #define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) /* - * Interface Adapter -- one per transport instance + * RDMA Endpoint -- connection endpoint details */ -struct rpcrdma_ia { - struct rdma_cm_id *ri_id; - struct ib_pd *ri_pd; - int ri_async_rc; - unsigned int ri_max_rdma_segs; - unsigned int ri_max_frwr_depth; - bool ri_implicit_roundup; - enum ib_mr_type ri_mrtype; - unsigned long ri_flags; - struct completion ri_done; - struct completion ri_remove_done; -}; - -enum { - RPCRDMA_IAF_REMOVING = 0, -}; - -/* - * RDMA Endpoint -- one per transport instance - */ - +struct rpcrdma_mr; struct rpcrdma_ep { - unsigned int rep_send_count; - unsigned int rep_send_batch; - unsigned int rep_max_inline_send; - unsigned int rep_max_inline_recv; - int rep_connected; - struct ib_qp_init_attr rep_attr; - wait_queue_head_t rep_connect_wait; - struct rpcrdma_connect_private rep_cm_private; - struct rdma_conn_param rep_remote_cma; - unsigned int rep_max_requests; /* depends on device */ - unsigned int rep_inline_send; /* negotiated */ - unsigned int rep_inline_recv; /* negotiated */ - int rep_receive_count; + struct kref re_kref; + struct rdma_cm_id *re_id; + struct ib_pd *re_pd; + unsigned int re_max_rdma_segs; + unsigned int re_max_fr_depth; + struct rpcrdma_mr *re_write_pad_mr; + enum ib_mr_type re_mrtype; + struct completion re_done; + unsigned int re_send_count; + unsigned int re_send_batch; + unsigned int re_max_inline_send; + unsigned int re_max_inline_recv; + int re_async_rc; + int re_connect_status; + atomic_t re_receiving; + atomic_t re_force_disconnect; + struct ib_qp_init_attr re_attr; + wait_queue_head_t re_connect_wait; + struct rpc_xprt *re_xprt; + struct rpcrdma_connect_private + re_cm_private; + struct rdma_conn_param re_remote_cma; + int re_receive_count; + unsigned int re_max_requests; /* depends on device */ + unsigned int re_inline_send; /* negotiated */ + unsigned int re_inline_recv; /* negotiated */ + + atomic_t re_completion_ids; + + char re_write_pad[XDR_UNIT]; }; -/* Pre-allocate extra Work Requests for handling backward receives - * and sends. This is a fixed value because the Work Queues are - * allocated when the forward channel is set up, long before the +/* Pre-allocate extra Work Requests for handling reverse-direction + * Receives and Sends. This is a fixed value because the Work Queues + * are allocated when the forward channel is set up, long before the * backchannel is provisioned. This value is two times * NFS4_DEF_CB_SLOT_TABLE_SIZE. */ @@ -151,7 +149,11 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb) return rb->rg_data; } -#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) +/* Do not use emergency memory reserves, and fail quickly if memory + * cannot be allocated easily. These flags may be used wherever there + * is robust logic to handle a failure to allocate. + */ +#define XPRTRDMA_GFP_FLAGS (__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) /* To ensure a transport can always make forward progress, * the number of RDMA segments allowed in header chunk lists @@ -189,6 +191,8 @@ enum { struct rpcrdma_rep { struct ib_cqe rr_cqe; + struct rpc_rdma_cid rr_cid; + __be32 rr_xid; __be32 rr_vers; __be32 rr_proc; @@ -220,6 +224,7 @@ enum { struct rpcrdma_req; struct rpcrdma_sendctx { struct ib_cqe sc_cqe; + struct rpc_rdma_cid sc_cid; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; @@ -231,29 +236,28 @@ struct rpcrdma_sendctx { * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). */ -struct rpcrdma_frwr { - struct ib_mr *fr_mr; - struct ib_cqe fr_cqe; - struct completion fr_linv_done; - union { - struct ib_reg_wr fr_regwr; - struct ib_send_wr fr_invwr; - }; -}; - struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; struct rpcrdma_req *mr_req; + + struct ib_mr *mr_ibmr; + struct ib_device *mr_device; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; - struct rpcrdma_frwr frwr; + struct ib_cqe mr_cqe; + struct completion mr_linv_done; + union { + struct ib_reg_wr mr_regwr; + struct ib_send_wr mr_invwr; + }; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; u32 mr_length; u64 mr_offset; struct list_head mr_all; + struct rpc_rdma_cid mr_cid; }; /* @@ -284,10 +288,11 @@ enum { RPCRDMA_MAX_IOV_SEGS, }; -struct rpcrdma_mr_seg { /* chunk descriptors */ - u32 mr_len; /* length of chunk or segment */ - struct page *mr_page; /* owning page, if any */ - char *mr_offset; /* kva if no page, else offset */ +/* Arguments for DMA mapping and registration */ +struct rpcrdma_mr_seg { + u32 mr_len; /* length of segment */ + struct page *mr_page; /* underlying struct page */ + u64 mr_offset; /* IN: page offset, OUT: iova */ }; /* The Send SGE array is provisioned to send a maximum size @@ -422,8 +427,7 @@ struct rpcrdma_stats { */ struct rpcrdma_xprt { struct rpc_xprt rx_xprt; - struct rpcrdma_ia rx_ia; - struct rpcrdma_ep rx_ep; + struct rpcrdma_ep *rx_ep; struct rpcrdma_buffer rx_buf; struct delayed_work rx_connect_worker; struct rpc_timeout rx_timeout; @@ -455,29 +459,20 @@ extern int xprt_rdma_pad_optimize; extern unsigned int xprt_rdma_memreg_strategy; /* - * Interface Adapter calls - xprtrdma/verbs.c - */ -int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); -void rpcrdma_ia_remove(struct rpcrdma_ia *ia); -void rpcrdma_ia_close(struct rpcrdma_ia *); - -/* * Endpoint calls - xprtrdma/verbs.c */ -int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt); -void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt); -int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_force_disconnect(struct rpcrdma_ep *ep); +void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc); +int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); +void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); -int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, - struct rpcrdma_req *); -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); /* * Buffer calls - xprtrdma/verbs.c */ -struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, - gfp_t flags); +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, + size_t size); int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); @@ -485,13 +480,13 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); -void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req); -void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); +void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep); +void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req); bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags); @@ -536,18 +531,18 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ void frwr_reset(struct rpcrdma_req *req); -int frwr_query_device(struct rpcrdma_xprt *r_xprt, - const struct ib_device *device); -int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); -void frwr_release_mr(struct rpcrdma_mr *mr); +int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device); +int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr); +void frwr_mr_release(struct rpcrdma_mr *mr); struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr); -int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req); +int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); +int frwr_wp_create(struct rpcrdma_xprt *r_xprt); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c @@ -569,9 +564,10 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype rtype); void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); -void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); +void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep); void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); +void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep); void rpcrdma_reply_handler(struct rpcrdma_rep *rep); static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) |