From cace564f8b6260e806f5e28d7f192fd0e0c603ed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Sep 2016 10:52:50 -0400 Subject: svcrdma: Tail iovec leaves an orphaned DMA mapping The ctxt's count field is overloaded to mean the number of pages in the ctxt->page array and the number of SGEs in the ctxt->sge array. Typically these two numbers are the same. However, when an inline RPC reply is constructed from an xdr_buf with a tail iovec, the head and tail often occupy the same page, but each are DMA mapped independently. In that case, ->count equals the number of pages, but it does not equal the number of SGEs. There's one more SGE, for the tail iovec. Hence there is one more DMA mapping than there are pages in the ctxt->page array. This isn't a real problem until the server's iommu is enabled. Then each RPC reply that has content in that iovec orphans a DMA mapping that consists of real resources. krb5i and krb5p always populate that tail iovec. After a couple million sent krb5i/p RPC replies, the NFS server starts behaving erratically. Reboot is needed to clear the problem. Fixes: 9d11b51ce7c1 ("svcrdma: Fix send_reply() scatter/gather set-up") Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 22 ++++------------------ net/sunrpc/xprtrdma/svc_rdma_transport.c | 18 ++++++++++++------ 4 files changed, 18 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a2a7519b0f23..cd0c5581498c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -129,7 +129,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, ret = -EIO; goto out_unmap; } - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); memset(&send_wr, 0, sizeof(send_wr)); ctxt->cqe.done = svc_rdma_wc_send; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 2c25606f2561..ad1df979b3f0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -159,7 +159,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, ctxt->sge[pno].addr); if (ret) goto err; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[pno].length = len; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 54d533300620..e5b49e62420c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -280,7 +280,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge[sge_no].addr)) goto err; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count++; sge_off = 0; @@ -489,7 +489,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[0].length, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) goto err; - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); ctxt->direction = DMA_TO_DEVICE; @@ -505,7 +505,7 @@ static int send_reply(struct svcxprt_rdma *rdma, if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[sge_no].addr)) goto err; - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } @@ -523,23 +523,9 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; ctxt->count++; rqstp->rq_respages[page_no] = NULL; - /* - * If there are more pages than SGE, terminate SGE - * list so that svc_rdma_unmap_dma doesn't attempt to - * unmap garbage. - */ - if (page_no+1 >= sge_no) - ctxt->sge[page_no+1].length = 0; } rqstp->rq_next_page = rqstp->rq_respages + 1; - /* The loop above bumps sc_dma_used for each sge. The - * xdr_buf.tail gets a separate sge, but resides in the - * same page as xdr_buf.head. Don't count it twice. - */ - if (sge_no > ctxt->count) - atomic_dec(&rdma->sc_dma_used); - if (sge_no > rdma->sc_max_sge) { pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err; @@ -692,7 +678,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, svc_rdma_put_context(ctxt, 1); return; } - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); /* Prepare SEND WR */ memset(&err_wr, 0, sizeof(err_wr)); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index dd9440137834..924271c9ef3e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -198,6 +198,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) out: ctxt->count = 0; + ctxt->mapped_sges = 0; ctxt->frmr = NULL; return ctxt; @@ -221,22 +222,27 @@ out_empty: void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) { struct svcxprt_rdma *xprt = ctxt->xprt; - int i; - for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { + struct ib_device *device = xprt->sc_cm_id->device; + u32 lkey = xprt->sc_pd->local_dma_lkey; + unsigned int i, count; + + for (count = 0, i = 0; i < ctxt->mapped_sges; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) { - atomic_dec(&xprt->sc_dma_used); - ib_dma_unmap_page(xprt->sc_cm_id->device, + if (ctxt->sge[i].lkey == lkey) { + count++; + ib_dma_unmap_page(device, ctxt->sge[i].addr, ctxt->sge[i].length, ctxt->direction); } } + ctxt->mapped_sges = 0; + atomic_sub(count, &xprt->sc_dma_used); } void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) @@ -600,7 +606,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) DMA_FROM_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) goto err_put_ctxt; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; -- cgit v1.2.3-59-g8ed1b From 9995237bba702281e0e8e677edd5bb225f4f6c30 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Sep 2016 10:52:59 -0400 Subject: svcrdma: Skip put_page() when send_reply() fails Message from syslogd@klimt at Aug 18 17:00:37 ... kernel:page:ffffea0020639b00 count:0 mapcount:0 mapping: (null) index:0x0 Aug 18 17:00:37 klimt kernel: flags: 0x2fffff80000000() Aug 18 17:00:37 klimt kernel: page dumped because: VM_BUG_ON_PAGE(page_ref_count(page) == 0) Aug 18 17:00:37 klimt kernel: kernel BUG at /home/cel/src/linux/linux-2.6/include/linux/mm.h:445! Aug 18 17:00:37 klimt kernel: RIP: 0010:[] svc_rdma_sendto+0x641/0x820 [rpcrdma] send_reply() assigns its page argument as the first page of ctxt. On error, send_reply() already invokes svc_rdma_put_context(ctxt, 1); which does a put_page() on that very page. No need to do that again as svc_rdma_sendto exits. Fixes: 3e1eeb980822 ("svcrdma: Close connection when a send error occurs") Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index e5b49e62420c..3b95b19fcf72 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -621,7 +621,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, inline_bytes); if (ret < 0) - goto err1; + goto err0; svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: send_reply returns %d\n", ret); -- cgit v1.2.3-59-g8ed1b From cc9d83408b52265ddab2874cf19d1611da4ca7ee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Sep 2016 10:53:15 -0400 Subject: svcrdma: Server-side support for rpcrdma_connect_private Prepare to receive an RDMA-CM private message when handling a new connection attempt, and send a similar message as part of connection acceptance. Both sides can communicate their various implementation limits. Implementations that don't support this sideband protocol ignore it. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 34 ++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 924271c9ef3e..f51e98a25263 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -648,6 +648,21 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags) return ret; } +static void +svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt, + struct rdma_conn_param *param) +{ + const struct rpcrdma_connect_private *pmsg = param->private_data; + + if (pmsg && + pmsg->cp_magic == rpcrdma_cmp_magic && + pmsg->cp_version == RPCRDMA_CMP_VERSION) { + dprintk("svcrdma: client send_size %u, recv_size %u\n", + rpcrdma_decode_buffer_size(pmsg->cp_send_size), + rpcrdma_decode_buffer_size(pmsg->cp_recv_size)); + } +} + /* * This function handles the CONNECT_REQUEST event on a listening * endpoint. It is passed the cma_id for the _new_ connection. The context in @@ -659,7 +674,8 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags) * will call the recvfrom method on the listen xprt which will accept the new * connection. */ -static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) +static void handle_connect_req(struct rdma_cm_id *new_cma_id, + struct rdma_conn_param *param) { struct svcxprt_rdma *listen_xprt = new_cma_id->context; struct svcxprt_rdma *newxprt; @@ -675,9 +691,10 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) new_cma_id->context = newxprt; dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", newxprt, newxprt->sc_cm_id, listen_xprt); + svc_rdma_parse_connect_private(newxprt, param); /* Save client advertised inbound read limit for use later in accept. */ - newxprt->sc_ord = client_ird; + newxprt->sc_ord = param->initiator_depth; /* Set the local and remote addresses in the transport */ sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; @@ -712,8 +729,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " "event = %s (%d)\n", cma_id, cma_id->context, rdma_event_msg(event->event), event->event); - handle_connect_req(cma_id, - event->param.conn.initiator_depth); + handle_connect_req(cma_id, &event->param.conn); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -947,6 +963,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct svcxprt_rdma *listen_rdma; struct svcxprt_rdma *newxprt = NULL; struct rdma_conn_param conn_param; + struct rpcrdma_connect_private pmsg; struct ib_qp_init_attr qp_attr; struct ib_device *dev; unsigned int i; @@ -1100,11 +1117,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Swap out the handler */ newxprt->sc_cm_id->event_handler = rdma_cma_handler; + /* Construct RDMA-CM private message */ + pmsg.cp_magic = rpcrdma_cmp_magic; + pmsg.cp_version = RPCRDMA_CMP_VERSION; + pmsg.cp_flags = 0; + pmsg.cp_send_size = pmsg.cp_recv_size = + rpcrdma_encode_buffer_size(newxprt->sc_max_req_size); + /* Accept Connection */ set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 0; conn_param.initiator_depth = newxprt->sc_ord; + conn_param.private_data = &pmsg; + conn_param.private_data_len = sizeof(pmsg); ret = rdma_accept(newxprt->sc_cm_id, &conn_param); if (ret) { dprintk("svcrdma: failed to accept new connection, ret=%d\n", -- cgit v1.2.3-59-g8ed1b From 25d55296dd3eac23adb2ae46b67b65bf73b22fb2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Sep 2016 10:53:23 -0400 Subject: svcrdma: support Remote Invalidation Support Remote Invalidation. A private message is exchanged with the client upon RDMA transport connect that indicates whether Send With Invalidation may be used by the server to send RPC replies. The invalidate_rkey is arbitrarily chosen from among rkeys present in the RPC-over-RDMA header's chunk lists. Send With Invalidate improves performance only when clients can recognize, while processing an RPC reply, that an rkey has already been invalidated. That has been submitted as a separate change. In the future, the RPC-over-RDMA protocol might support Remote Invalidation properly. The protocol needs to enable signaling between peers to indicate when Remote Invalidation can be used for each individual RPC. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_sendto.c | 58 ++++++++++++++++++++++++++++++-- net/sunrpc/xprtrdma/svc_rdma_transport.c | 12 +++++-- 3 files changed, 65 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 3584bc8864c4..cc3ae16eac68 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -137,6 +137,7 @@ struct svcxprt_rdma { int sc_ord; /* RDMA read limit */ int sc_max_sge; int sc_max_sge_rd; /* max sge for read target */ + bool sc_snd_w_inv; /* OK to use Send With Invalidate */ atomic_t sc_sq_count; /* Number of SQ WR on queue */ unsigned int sc_sq_depth; /* Depth of SQ */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 3b95b19fcf72..f5a91edcd233 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -225,6 +225,48 @@ svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp, return rp_ary; } +/* RPC-over-RDMA Version One private extension: Remote Invalidation. + * Responder's choice: requester signals it can handle Send With + * Invalidate, and responder chooses one rkey to invalidate. + * + * Find a candidate rkey to invalidate when sending a reply. Picks the + * first rkey it finds in the chunks lists. + * + * Returns zero if RPC's chunk lists are empty. + */ +static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, + struct rpcrdma_write_array *wr_ary, + struct rpcrdma_write_array *rp_ary) +{ + struct rpcrdma_read_chunk *rd_ary; + struct rpcrdma_segment *arg_ch; + u32 inv_rkey; + + inv_rkey = 0; + + rd_ary = svc_rdma_get_read_chunk(rdma_argp); + if (rd_ary) { + inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle); + goto out; + } + + if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) { + arg_ch = &wr_ary->wc_array[0].wc_target; + inv_rkey = be32_to_cpu(arg_ch->rs_handle); + goto out; + } + + if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) { + arg_ch = &rp_ary->wc_array[0].wc_target; + inv_rkey = be32_to_cpu(arg_ch->rs_handle); + goto out; + } + +out: + dprintk("svcrdma: Send With Invalidate rkey=%08x\n", inv_rkey); + return inv_rkey; +} + /* Assumptions: * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ @@ -464,7 +506,8 @@ static int send_reply(struct svcxprt_rdma *rdma, struct page *page, struct rpcrdma_msg *rdma_resp, struct svc_rdma_req_map *vec, - int byte_count) + int byte_count, + u32 inv_rkey) { struct svc_rdma_op_ctxt *ctxt; struct ib_send_wr send_wr; @@ -535,7 +578,11 @@ static int send_reply(struct svcxprt_rdma *rdma, send_wr.wr_cqe = &ctxt->cqe; send_wr.sg_list = ctxt->sge; send_wr.num_sge = sge_no; - send_wr.opcode = IB_WR_SEND; + if (inv_rkey) { + send_wr.opcode = IB_WR_SEND_WITH_INV; + send_wr.ex.invalidate_rkey = inv_rkey; + } else + send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; ret = svc_rdma_send(rdma, &send_wr); @@ -567,6 +614,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) int inline_bytes; struct page *res_page; struct svc_rdma_req_map *vec; + u32 inv_rkey; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); @@ -577,6 +625,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) wr_ary = svc_rdma_get_write_array(rdma_argp); rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary); + inv_rkey = 0; + if (rdma->sc_snd_w_inv) + inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary); + /* Build an req vec for the XDR */ vec = svc_rdma_get_req_map(rdma); ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL); @@ -619,7 +671,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) goto err1; ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, - inline_bytes); + inline_bytes, inv_rkey); if (ret < 0) goto err0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f51e98a25263..b2464fc4c455 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -657,9 +657,14 @@ svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt, if (pmsg && pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { - dprintk("svcrdma: client send_size %u, recv_size %u\n", + newxprt->sc_snd_w_inv = pmsg->cp_flags & + RPCRDMA_CMP_F_SND_W_INV_OK; + + dprintk("svcrdma: client send_size %u, recv_size %u " + "remote inv %ssupported\n", rpcrdma_decode_buffer_size(pmsg->cp_send_size), - rpcrdma_decode_buffer_size(pmsg->cp_recv_size)); + rpcrdma_decode_buffer_size(pmsg->cp_recv_size), + newxprt->sc_snd_w_inv ? "" : "un"); } } @@ -1093,7 +1098,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dev->attrs.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; newxprt->sc_reader = rdma_read_chunk_frmr; - } + } else + newxprt->sc_snd_w_inv = false; /* * Determine if a DMA MR is required and if so, what privs are required -- cgit v1.2.3-59-g8ed1b