From 90a9b1473df72a8b356e7ad6c9b9c9608927b103 Mon Sep 17 00:00:00 2001 From: James Ettle Date: Sun, 28 Jan 2018 20:34:16 +0000 Subject: sunrpc: Fix unaligned access on sparc64 Fix unaligned access in gss_{get,verify}_mic_v2() on sparc64 Signed-off-by: James Ettle Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/gss_krb5_seal.c | 5 ++++- net/sunrpc/auth_gss/gss_krb5_unseal.c | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 1d74d653e6c0..94a2b3f082a8 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -177,6 +177,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, u64 seq_send; u8 *cksumkey; unsigned int cksum_usage; + __be64 seq_send_be64; dprintk("RPC: %s\n", __func__); @@ -187,7 +188,9 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, spin_lock(&krb5_seq_lock); seq_send = ctx->seq_send64++; spin_unlock(&krb5_seq_lock); - *((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send); + + seq_send_be64 = cpu_to_be64(seq_send); + memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8); if (ctx->initiate) { cksumkey = ctx->initiator_sign; diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index dcf9515d9aef..b601a73cc9db 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -155,10 +155,12 @@ gss_verify_mic_v2(struct krb5_ctx *ctx, u8 flags; int i; unsigned int cksum_usage; + __be16 be16_ptr; dprintk("RPC: %s\n", __func__); - if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_MIC) + memcpy(&be16_ptr, (char *) ptr, 2); + if (be16_to_cpu(be16_ptr) != KG2_TOK_MIC) return GSS_S_DEFECTIVE_TOKEN; flags = ptr[2]; -- cgit v1.2.3-59-g8ed1b From 3b68e6ee3cbd4a474bcc7d2ac26812f86cdf333d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 14 Feb 2018 12:15:06 +1100 Subject: SUNRPC: cache: ignore timestamp written to 'flush' file. The interface for flushing the sunrpc auth cache was poorly designed and has caused problems a number of times. The design is that you write a timestamp, and all entries created before that time are discarded. The most obvious problem is that this is not what people actually want. They want to just flush the whole cache. The 1-second granularity can be a problem, as can the use of wall-clock time. A current problem is that code will write the current time to this file - expecting it to clear everything - and if the seconds number ticks over before this timestamp is checked, the test "then >= now" fails, and a full flush isn't forced. So lets just drop the subtleties and always flush the whole cache. The worst this could do is impose an extra cost refilling it, but that would require someone to be using non-standard tools. We still report an error if the string written is not a number, but we cause any valid number to flush the whole cache. Reported-by: "Wang, Alan 1. (NSB - CN/Hangzhou)" Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- net/sunrpc/cache.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 8a7e1c774f9c..26582e27be6a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1450,8 +1450,8 @@ static ssize_t write_flush(struct file *file, const char __user *buf, struct cache_detail *cd) { char tbuf[20]; - char *bp, *ep; - time_t then, now; + char *ep; + time_t now; if (*ppos || count > sizeof(tbuf)-1) return -EINVAL; @@ -1461,24 +1461,24 @@ static ssize_t write_flush(struct file *file, const char __user *buf, simple_strtoul(tbuf, &ep, 0); if (*ep && *ep != '\n') return -EINVAL; + /* Note that while we check that 'buf' holds a valid number, + * we always ignore the value and just flush everything. + * Making use of the number leads to races. + */ - bp = tbuf; - then = get_expiry(&bp); now = seconds_since_boot(); - cd->nextcheck = now; - /* Can only set flush_time to 1 second beyond "now", or - * possibly 1 second beyond flushtime. This is because - * flush_time never goes backwards so it mustn't get too far - * ahead of time. + /* Always flush everything, so behave like cache_purge() + * Do this by advancing flush_time to the current time, + * or by one second if it has already reached the current time. + * Newly added cache entries will always have ->last_refresh greater + * that ->flush_time, so they don't get flushed prematurely. */ - if (then >= now) { - /* Want to flush everything, so behave like cache_purge() */ - if (cd->flush_time >= now) - now = cd->flush_time + 1; - then = now; - } - cd->flush_time = then; + if (cd->flush_time >= now) + now = cd->flush_time + 1; + + cd->flush_time = now; + cd->nextcheck = now; cache_flush(); *ppos += count; -- cgit v1.2.3-59-g8ed1b From 0c4398ff8b581b68ee02c5194654134acc31f4a7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Mar 2018 17:05:09 -0400 Subject: svcrdma: Use pr_err to report Receive errors Clean up: Other completion handlers use pr_err, not pr_warn. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 9ad12a215b51..135ae175ca8e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -330,9 +330,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) flushed: if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_warn("svcrdma: receive: %s (%u/0x%x)\n", - ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); + pr_err("svcrdma: Recv: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); svc_rdma_put_context(ctxt, 1); -- cgit v1.2.3-59-g8ed1b From 97cc3264508f33783ba21573204d7e0bf5b197e7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Mar 2018 17:05:15 -0400 Subject: svcrdma: Consult max_qp_init_rd_atom when accepting connections The target needs to return the lesser of the client's Inbound RDMA Read Queue Depth (IRD), provided in the connection parameters, and the local device's Outbound RDMA Read Queue Depth (ORD). The latter limit is max_qp_init_rd_atom, not max_qp_rd_atom. The svcrdma_ord value caps the ORD value for iWARP transports, which do not exchange ORD/IRD values at connection time. Since no other Linux kernel RDMA-enabled storage target sees fit to provide this cap, I'm removing it here too. initiator_depth is a u8, so ensure the computed ORD value does not overflow that field. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 3 --- net/sunrpc/xprtrdma/svc_rdma.c | 4 ++-- net/sunrpc/xprtrdma/svc_rdma_transport.c | 22 +++++++++------------- 3 files changed, 11 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 4b731b046bcd..7337e1221590 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -132,9 +132,6 @@ struct svcxprt_rdma { #define RDMAXPRT_CONN_PENDING 3 #define RPCRDMA_LISTEN_BACKLOG 10 -/* The default ORD value is based on two outstanding full-size writes with a - * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */ -#define RPCRDMA_ORD (64/4) #define RPCRDMA_MAX_REQUESTS 32 /* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index a4a8f6989ee7..dd8a431dc2ae 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -51,9 +51,9 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT /* RPC/RDMA parameters */ -unsigned int svcrdma_ord = RPCRDMA_ORD; +unsigned int svcrdma_ord = 16; /* historical default */ static unsigned int min_ord = 1; -static unsigned int max_ord = 4096; +static unsigned int max_ord = 255; unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; static unsigned int min_max_requests = 4; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 135ae175ca8e..7b2f4d3a2543 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -762,13 +762,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (!svc_rdma_prealloc_ctxts(newxprt)) goto errout; - /* - * Limit ORD based on client limit, local device limit, and - * configured svcrdma limit. - */ - newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord); - newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); - newxprt->sc_pd = ib_alloc_pd(dev, 0); if (IS_ERR(newxprt->sc_pd)) { dprintk("svcrdma: error creating PD for connect request\n"); @@ -843,15 +836,18 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 0; - conn_param.initiator_depth = newxprt->sc_ord; + conn_param.initiator_depth = min_t(int, newxprt->sc_ord, + dev->attrs.max_qp_init_rd_atom); + if (!conn_param.initiator_depth) { + dprintk("svcrdma: invalid ORD setting\n"); + ret = -EINVAL; + goto errout; + } conn_param.private_data = &pmsg; conn_param.private_data_len = sizeof(pmsg); ret = rdma_accept(newxprt->sc_cm_id, &conn_param); - if (ret) { - dprintk("svcrdma: failed to accept new connection, ret=%d\n", - ret); + if (ret) goto errout; - } dprintk("svcrdma: new connection %p accepted:\n", newxprt); sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; @@ -862,7 +858,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); dprintk(" rdma_rw_ctxs : %d\n", ctxts); dprintk(" max_requests : %d\n", newxprt->sc_max_requests); - dprintk(" ord : %d\n", newxprt->sc_ord); + dprintk(" ord : %d\n", conn_param.initiator_depth); return &newxprt->sc_xprt; -- cgit v1.2.3-59-g8ed1b From 6f29d07ca4b999c36d42f9cf8d1dddf9ddca3250 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 20 Mar 2018 17:05:20 -0400 Subject: svcrdma: Clean up rdma_build_arg_xdr Clean up: The value of the byte_count parameter is already passed to rdma_build_arg_xdr as part of the svc_rdma_op_ctxt structure. Further, without the parameter called "byte_count" there is no need to have the abbreviated "bc" automatic variable. "bc" can now be called something more intuitive. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 19e9c6b33042..3d45015dca97 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -110,15 +110,16 @@ * the RDMA_RECV completion. The SGL should contain full pages up until the * last one. */ -static void rdma_build_arg_xdr(struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *ctxt, - u32 byte_count) +static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *ctxt) { struct page *page; - u32 bc; int sge_no; + u32 len; - /* Swap the page in the SGE with the page in argpages */ + /* The reply path assumes the Call's transport header resides + * in rqstp->rq_pages[0]. + */ page = ctxt->pages[0]; put_page(rqstp->rq_pages[0]); rqstp->rq_pages[0] = page; @@ -126,35 +127,35 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, /* Set up the XDR head */ rqstp->rq_arg.head[0].iov_base = page_address(page); rqstp->rq_arg.head[0].iov_len = - min_t(size_t, byte_count, ctxt->sge[0].length); - rqstp->rq_arg.len = byte_count; - rqstp->rq_arg.buflen = byte_count; + min_t(size_t, ctxt->byte_len, ctxt->sge[0].length); + rqstp->rq_arg.len = ctxt->byte_len; + rqstp->rq_arg.buflen = ctxt->byte_len; /* Compute bytes past head in the SGL */ - bc = byte_count - rqstp->rq_arg.head[0].iov_len; + len = ctxt->byte_len - rqstp->rq_arg.head[0].iov_len; /* If data remains, store it in the pagelist */ - rqstp->rq_arg.page_len = bc; + rqstp->rq_arg.page_len = len; rqstp->rq_arg.page_base = 0; sge_no = 1; - while (bc && sge_no < ctxt->count) { + while (len && sge_no < ctxt->count) { page = ctxt->pages[sge_no]; put_page(rqstp->rq_pages[sge_no]); rqstp->rq_pages[sge_no] = page; - bc -= min_t(u32, bc, ctxt->sge[sge_no].length); + len -= min_t(u32, len, ctxt->sge[sge_no].length); sge_no++; } rqstp->rq_respages = &rqstp->rq_pages[sge_no]; rqstp->rq_next_page = rqstp->rq_respages + 1; /* If not all pages were used from the SGL, free the remaining ones */ - bc = sge_no; + len = sge_no; while (sge_no < ctxt->count) { page = ctxt->pages[sge_no++]; put_page(page); } - ctxt->count = bc; + ctxt->count = len; /* Set up tail */ rqstp->rq_arg.tail[0].iov_base = NULL; @@ -534,10 +535,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) ctxt, rdma_xprt, rqstp); atomic_inc(&rdma_stat_recv); - /* Build up the XDR from the receive buffers. */ - rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len); + svc_rdma_build_arg_xdr(rqstp, ctxt); - /* Decode the RDMA header. */ p = (__be32 *)rqstp->rq_arg.head[0].iov_base; ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); if (ret < 0) -- cgit v1.2.3-59-g8ed1b From 63a1b1569372860fdef9e25edfc2320766b2f4c2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Mar 2018 10:49:22 -0400 Subject: sunrpc: Remove unneeded pointer dereference Clean up: Noticed during code inspection that there is already a local automatic variable "xprt" so dereferencing rqst->rq_xprt again is unnecessary. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index f9307bd6644b..6dca0f513c95 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -492,7 +492,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; - rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); + xprt->xpt_ops->xpo_release_rqst(rqstp); kfree(rqstp->rq_deferred); rqstp->rq_deferred = NULL; @@ -889,7 +889,7 @@ int svc_send(struct svc_rqst *rqstp) goto out; /* release the receive skb before sending the reply */ - rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); + xprt->xpt_ops->xpo_release_rqst(rqstp); /* calculate over-all length */ xb = &rqstp->rq_res; -- cgit v1.2.3-59-g8ed1b From 989f881ebf77d70e883dd0fbcfa04a058d97f771 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Mar 2018 10:49:38 -0400 Subject: svc: Simplify ->xpo_secure_port Clean up: Instead of returning a value that is used to set or clear a bit, just make ->xpo_secure_port mangle that bit, and return void. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 2 +- net/sunrpc/svc_xprt.c | 5 +---- net/sunrpc/svcsock.c | 7 +++++-- net/sunrpc/xprtrdma/svc_rdma_transport.c | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 1caf7bc83306..19475acb68ea 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -25,7 +25,7 @@ struct svc_xprt_ops { void (*xpo_release_rqst)(struct svc_rqst *); void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); - int (*xpo_secure_port)(struct svc_rqst *); + void (*xpo_secure_port)(struct svc_rqst *rqstp); void (*xpo_kill_temp_xprt)(struct svc_xprt *); }; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 6dca0f513c95..4e3b4c596bae 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -844,10 +844,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) clear_bit(XPT_OLD, &xprt->xpt_flags); - if (xprt->xpt_ops->xpo_secure_port(rqstp)) - set_bit(RQ_SECURE, &rqstp->rq_flags); - else - clear_bit(RQ_SECURE, &rqstp->rq_flags); + xprt->xpt_ops->xpo_secure_port(rqstp); rqstp->rq_chandle.defer = svc_defer; rqstp->rq_xid = svc_getu32(&rqstp->rq_arg.head[0]); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 943f2a745cd5..9b6703588e35 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -391,9 +391,12 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, release_sock(sock->sk); } -static int svc_sock_secure_port(struct svc_rqst *rqstp) +static void svc_sock_secure_port(struct svc_rqst *rqstp) { - return svc_port_is_privileged(svc_addr(rqstp)); + if (svc_port_is_privileged(svc_addr(rqstp))) + set_bit(RQ_SECURE, &rqstp->rq_flags); + else + clear_bit(RQ_SECURE, &rqstp->rq_flags); } /* diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 7b2f4d3a2543..17da06d6b8e5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -69,7 +69,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *); static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt *xprt); -static int svc_rdma_secure_port(struct svc_rqst *); +static void svc_rdma_secure_port(struct svc_rqst *); static void svc_rdma_kill_temp_xprt(struct svc_xprt *); static const struct svc_xprt_ops svc_rdma_ops = { @@ -988,9 +988,9 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) return 1; } -static int svc_rdma_secure_port(struct svc_rqst *rqstp) +static void svc_rdma_secure_port(struct svc_rqst *rqstp) { - return 1; + set_bit(RQ_SECURE, &rqstp->rq_flags); } static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) -- cgit v1.2.3-59-g8ed1b From caa3e106dc623eb41542e6221abecf9956e8a0e6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Mar 2018 10:50:07 -0400 Subject: sunrpc: Move trace_svc_xprt_dequeue() Reduce the amount of noise generated by trace_svc_xprt_dequeue by moving it to the end of svc_get_next_xprt. This generates exactly one trace event when a ready xprt is found, rather than spurious events when there is no work to do. The empty events contain no information that can't be obtained simply by tracing function calls to svc_xprt_dequeue. A small additional benefit is simplification of the svc_xprt_event trace class, which no longer has to handle the case when the @xprt parameter is NULL. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/trace/events/sunrpc.h | 16 +++++----------- net/sunrpc/svc_xprt.c | 5 +---- 2 files changed, 6 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index a23144471b6b..9bba3070f873 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -634,24 +634,18 @@ DECLARE_EVENT_CLASS(svc_xprt_event, TP_STRUCT__entry( __field(struct svc_xprt *, xprt) __field(unsigned long, flags) - __dynamic_array(unsigned char, addr, xprt != NULL ? - xprt->xpt_remotelen : 0) + __dynamic_array(unsigned char, addr, xprt->xpt_remotelen) ), TP_fast_assign( __entry->xprt = xprt; - if (xprt) { - memcpy(__get_dynamic_array(addr), - &xprt->xpt_remote, - xprt->xpt_remotelen); - __entry->flags = xprt->xpt_flags; - } else - __entry->flags = 0; + __entry->flags = xprt->xpt_flags; + memcpy(__get_dynamic_array(addr), + &xprt->xpt_remote, xprt->xpt_remotelen); ), TP_printk("xprt=0x%p addr=%pIScp flags=%s", __entry->xprt, - __get_dynamic_array_len(addr) != 0 ? - (struct sockaddr *)__get_dynamic_array(addr) : NULL, + (struct sockaddr *)__get_dynamic_array(addr), show_svc_xprt_flags(__entry->flags)) ); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 4e3b4c596bae..71f47187b4ec 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -454,13 +454,9 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) struct svc_xprt, xpt_ready); list_del_init(&xprt->xpt_ready); svc_xprt_get(xprt); - - dprintk("svc: transport %p dequeued, inuse=%d\n", - xprt, kref_read(&xprt->xpt_ref)); } spin_unlock_bh(&pool->sp_lock); out: - trace_svc_xprt_dequeue(xprt); return xprt; } @@ -734,6 +730,7 @@ out_found: rqstp->rq_chandle.thread_wait = 5*HZ; else rqstp->rq_chandle.thread_wait = 1*HZ; + trace_svc_xprt_dequeue(rqstp->rq_xprt); return rqstp->rq_xprt; } -- cgit v1.2.3-59-g8ed1b From 7dbb53baed3c3969dea43e3cee261a75adde123c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Mar 2018 10:50:27 -0400 Subject: sunrpc: Simplify do_enqueue tracing There are three cases where svc_xprt_do_enqueue() returns without waking an nfsd thread: 1. There is no work to do 2. The transport is already busy 3. There are no available nfsd threads Only 3. is truly interesting. Move the trace point so it records that there was work to do and either an nfsd thread was awoken, or a free one could not found. As an additional clean up, remove a redundant comment and a couple of dprintk call sites. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 71f47187b4ec..5fe150c78d0a 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -382,25 +382,21 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) int cpu; if (!svc_xprt_has_something_to_do(xprt)) - goto out; + return; /* Mark transport as busy. It will remain in this state until * the provider calls svc_xprt_received. We update XPT_BUSY * atomically because it also guards against trying to enqueue * the transport twice. */ - if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { - /* Don't enqueue transport while already enqueued */ - dprintk("svc: transport %p busy, not enqueued\n", xprt); - goto out; - } + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) + return; cpu = get_cpu(); pool = svc_pool_for_cpu(xprt->xpt_server, cpu); atomic_long_inc(&pool->sp_stats.packets); - dprintk("svc: transport %p put into queue\n", xprt); spin_lock_bh(&pool->sp_lock); list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); pool->sp_stats.sockets_queued++; @@ -420,7 +416,6 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) out_unlock: rcu_read_unlock(); put_cpu(); -out: trace_svc_xprt_do_enqueue(xprt, rqstp); } EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); -- cgit v1.2.3-59-g8ed1b From 41f306d0c287e0cc04054135f9f4ceb003ad6795 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Mar 2018 10:50:42 -0400 Subject: sunrpc: Simplify trace_svc_recv There doesn't seem to be a lot of value in calling trace_svc_recv in the failing case. 1. There are two very common cases: one is the transport is not ready, and the other is shutdown. Neither is terribly interesting. 2. The trace record for the failing case contains nothing but the status code. Therefore the trace point call site in the error exit is removed. Since the trace point is now recording a length instead of a status, rename the status field and remove the case that records a zero XID. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/trace/events/sunrpc.h | 14 +++++++------- net/sunrpc/svc_xprt.c | 1 - 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 9bba3070f873..5849bfb3ece2 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -485,28 +485,28 @@ TRACE_EVENT(xs_tcp_data_recv, { (1UL << RQ_BUSY), "RQ_BUSY"}) TRACE_EVENT(svc_recv, - TP_PROTO(struct svc_rqst *rqst, int status), + TP_PROTO(struct svc_rqst *rqst, int len), - TP_ARGS(rqst, status), + TP_ARGS(rqst, len), TP_STRUCT__entry( __field(u32, xid) - __field(int, status) + __field(int, len) __field(unsigned long, flags) __dynamic_array(unsigned char, addr, rqst->rq_addrlen) ), TP_fast_assign( - __entry->xid = status > 0 ? be32_to_cpu(rqst->rq_xid) : 0; - __entry->status = status; + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->len = len; __entry->flags = rqst->rq_flags; memcpy(__get_dynamic_array(addr), &rqst->rq_addr, rqst->rq_addrlen); ), - TP_printk("addr=%pIScp xid=0x%08x status=%d flags=%s", + TP_printk("addr=%pIScp xid=0x%08x len=%d flags=%s", (struct sockaddr *)__get_dynamic_array(addr), - __entry->xid, __entry->status, + __entry->xid, __entry->len, show_rqstp_flags(__entry->flags)) ); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5fe150c78d0a..47384d0b1673 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -848,7 +848,6 @@ out_release: rqstp->rq_res.len = 0; svc_xprt_release(rqstp); out: - trace_svc_recv(rqstp, err); return err; } EXPORT_SYMBOL_GPL(svc_recv); -- cgit v1.2.3-59-g8ed1b From ece200ddd54b9ce840cfee554fb812560c545c7d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Mar 2018 10:51:00 -0400 Subject: sunrpc: Save remote presentation address in svc_xprt for trace events TP_printk defines a format string that is passed to user space for converting raw trace event records to something human-readable. My user space's printf (Oracle Linux 7), however, does not have a %pI format specifier. The result is that what is supposed to be an IP address in the output of "trace-cmd report" is just a string that says the field couldn't be displayed. To fix this, adopt the same approach as the client: maintain a pre- formated presentation address for occasions when %pI is not available. The location of the trace_svc_send trace point is adjusted so that rqst->rq_xprt is not NULL when the trace event is recorded. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 4 ++ include/trace/events/sunrpc.h | 89 ++++++++++++-------------------- net/sunrpc/svc_xprt.c | 3 +- net/sunrpc/svcsock.c | 1 + net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 +- 5 files changed, 43 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 19475acb68ea..c3d72066d4b1 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -83,6 +83,7 @@ struct svc_xprt { size_t xpt_locallen; /* length of address */ struct sockaddr_storage xpt_remote; /* remote peer's address */ size_t xpt_remotelen; /* length of address */ + char xpt_remotebuf[INET6_ADDRSTRLEN + 10]; struct rpc_wait_queue xpt_bc_pending; /* backchannel wait queue */ struct list_head xpt_users; /* callbacks on free */ @@ -152,7 +153,10 @@ static inline void svc_xprt_set_remote(struct svc_xprt *xprt, { memcpy(&xprt->xpt_remote, sa, salen); xprt->xpt_remotelen = salen; + snprintf(xprt->xpt_remotebuf, sizeof(xprt->xpt_remotebuf) - 1, + "%pISpc", sa); } + static inline unsigned short svc_addr_port(const struct sockaddr *sa) { const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5849bfb3ece2..1ec8c4c45766 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -493,20 +493,18 @@ TRACE_EVENT(svc_recv, __field(u32, xid) __field(int, len) __field(unsigned long, flags) - __dynamic_array(unsigned char, addr, rqst->rq_addrlen) + __string(addr, rqst->rq_xprt->xpt_remotebuf) ), TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->len = len; __entry->flags = rqst->rq_flags; - memcpy(__get_dynamic_array(addr), - &rqst->rq_addr, rqst->rq_addrlen); + __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); ), - TP_printk("addr=%pIScp xid=0x%08x len=%d flags=%s", - (struct sockaddr *)__get_dynamic_array(addr), - __entry->xid, __entry->len, + TP_printk("addr=%s xid=0x%08x len=%d flags=%s", + __get_str(addr), __entry->xid, __entry->len, show_rqstp_flags(__entry->flags)) ); @@ -519,20 +517,18 @@ DECLARE_EVENT_CLASS(svc_rqst_event, TP_STRUCT__entry( __field(u32, xid) __field(unsigned long, flags) - __dynamic_array(unsigned char, addr, rqst->rq_addrlen) + __string(addr, rqst->rq_xprt->xpt_remotebuf) ), TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->flags = rqst->rq_flags; - memcpy(__get_dynamic_array(addr), - &rqst->rq_addr, rqst->rq_addrlen); + __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); ), - TP_printk("addr=%pIScp rq_xid=0x%08x flags=%s", - (struct sockaddr *)__get_dynamic_array(addr), - __entry->xid, - show_rqstp_flags(__entry->flags)) + TP_printk("addr=%s xid=0x%08x flags=%s", + __get_str(addr), __entry->xid, + show_rqstp_flags(__entry->flags)) ); DEFINE_EVENT(svc_rqst_event, svc_defer, @@ -553,21 +549,19 @@ DECLARE_EVENT_CLASS(svc_rqst_status, __field(u32, xid) __field(int, status) __field(unsigned long, flags) - __dynamic_array(unsigned char, addr, rqst->rq_addrlen) + __string(addr, rqst->rq_xprt->xpt_remotebuf) ), TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->status = status; __entry->flags = rqst->rq_flags; - memcpy(__get_dynamic_array(addr), - &rqst->rq_addr, rqst->rq_addrlen); + __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); ), - TP_printk("addr=%pIScp rq_xid=0x%08x status=%d flags=%s", - (struct sockaddr *)__get_dynamic_array(addr), - __entry->xid, - __entry->status, show_rqstp_flags(__entry->flags)) + TP_printk("addr=%s xid=0x%08x status=%d flags=%s", + __get_str(addr), __entry->xid, + __entry->status, show_rqstp_flags(__entry->flags)) ); DEFINE_EVENT(svc_rqst_status, svc_process, @@ -604,26 +598,19 @@ TRACE_EVENT(svc_xprt_do_enqueue, __field(struct svc_xprt *, xprt) __field(int, pid) __field(unsigned long, flags) - __dynamic_array(unsigned char, addr, xprt != NULL ? - xprt->xpt_remotelen : 0) + __string(addr, xprt->xpt_remotebuf) ), TP_fast_assign( __entry->xprt = xprt; __entry->pid = rqst? rqst->rq_task->pid : 0; - if (xprt) { - memcpy(__get_dynamic_array(addr), - &xprt->xpt_remote, - xprt->xpt_remotelen); - __entry->flags = xprt->xpt_flags; - } else - __entry->flags = 0; + __entry->flags = xprt->xpt_flags; + __assign_str(addr, xprt->xpt_remotebuf); ), - TP_printk("xprt=0x%p addr=%pIScp pid=%d flags=%s", __entry->xprt, - __get_dynamic_array_len(addr) != 0 ? - (struct sockaddr *)__get_dynamic_array(addr) : NULL, - __entry->pid, show_svc_xprt_flags(__entry->flags)) + TP_printk("xprt=%p addr=%s pid=%d flags=%s", + __entry->xprt, __get_str(addr), + __entry->pid, show_svc_xprt_flags(__entry->flags)) ); DECLARE_EVENT_CLASS(svc_xprt_event, @@ -634,19 +621,18 @@ DECLARE_EVENT_CLASS(svc_xprt_event, TP_STRUCT__entry( __field(struct svc_xprt *, xprt) __field(unsigned long, flags) - __dynamic_array(unsigned char, addr, xprt->xpt_remotelen) + __string(addr, xprt->xpt_remotebuf) ), TP_fast_assign( __entry->xprt = xprt; __entry->flags = xprt->xpt_flags; - memcpy(__get_dynamic_array(addr), - &xprt->xpt_remote, xprt->xpt_remotelen); + __assign_str(addr, xprt->xpt_remotebuf); ), - TP_printk("xprt=0x%p addr=%pIScp flags=%s", __entry->xprt, - (struct sockaddr *)__get_dynamic_array(addr), - show_svc_xprt_flags(__entry->flags)) + TP_printk("xprt=%p addr=%s flags=%s", + __entry->xprt, __get_str(addr), + show_svc_xprt_flags(__entry->flags)) ); DEFINE_EVENT(svc_xprt_event, svc_xprt_dequeue, @@ -682,25 +668,18 @@ TRACE_EVENT(svc_handle_xprt, __field(struct svc_xprt *, xprt) __field(int, len) __field(unsigned long, flags) - __dynamic_array(unsigned char, addr, xprt != NULL ? - xprt->xpt_remotelen : 0) + __string(addr, xprt->xpt_remotebuf) ), TP_fast_assign( __entry->xprt = xprt; __entry->len = len; - if (xprt) { - memcpy(__get_dynamic_array(addr), - &xprt->xpt_remote, - xprt->xpt_remotelen); - __entry->flags = xprt->xpt_flags; - } else - __entry->flags = 0; + __entry->flags = xprt->xpt_flags; + __assign_str(addr, xprt->xpt_remotebuf); ), - TP_printk("xprt=0x%p addr=%pIScp len=%d flags=%s", __entry->xprt, - __get_dynamic_array_len(addr) != 0 ? - (struct sockaddr *)__get_dynamic_array(addr) : NULL, + TP_printk("xprt=%p addr=%s len=%d flags=%s", + __entry->xprt, __get_str(addr), __entry->len, show_svc_xprt_flags(__entry->flags)) ); @@ -712,18 +691,16 @@ DECLARE_EVENT_CLASS(svc_deferred_event, TP_STRUCT__entry( __field(u32, xid) - __dynamic_array(unsigned char, addr, dr->addrlen) + __string(addr, dr->xprt->xpt_remotebuf) ), TP_fast_assign( __entry->xid = be32_to_cpu(*(__be32 *)(dr->args + (dr->xprt_hlen>>2))); - memcpy(__get_dynamic_array(addr), &dr->addr, dr->addrlen); + __assign_str(addr, dr->xprt->xpt_remotebuf); ), - TP_printk("addr=%pIScp xid=0x%08x", - (struct sockaddr *)__get_dynamic_array(addr), - __entry->xid) + TP_printk("addr=%s xid=0x%08x", __get_str(addr), __entry->xid) ); DEFINE_EVENT(svc_deferred_event, svc_drop_deferred, diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 47384d0b1673..f745754a55ea 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -173,6 +173,7 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl, set_bit(XPT_BUSY, &xprt->xpt_flags); rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending"); xprt->xpt_net = get_net(net); + strcpy(xprt->xpt_remotebuf, "uninitialized"); } EXPORT_SYMBOL_GPL(svc_xprt_init); @@ -894,12 +895,12 @@ int svc_send(struct svc_rqst *rqstp) len = xprt->xpt_ops->xpo_sendto(rqstp); mutex_unlock(&xprt->xpt_mutex); rpc_wake_up(&xprt->xpt_bc_pending); + trace_svc_send(rqstp, len); svc_xprt_release(rqstp); if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) len = 0; out: - trace_svc_send(rqstp, len); return len; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 9b6703588e35..4ca1d92b531a 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1310,6 +1310,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); + strcpy(svsk->sk_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 17da06d6b8e5..96cc8f6597d3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -401,8 +401,10 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, */ set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags); - if (listener) + if (listener) { + strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); + } return cma_xprt; } -- cgit v1.2.3-59-g8ed1b From 0b9547bf6b94317b3f8e2496dc2b44cb6e599b01 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Mar 2018 10:51:22 -0400 Subject: sunrpc: Re-purpose trace_svc_process Currently, trace_svc_process has two call sites: 1. Just after a call to svc_send. svc_send already invokes trace_svc_send with the same arguments just before returning 2. Just before a call to svc_drop. svc_drop already invokes trace_svc_drop with the same arguments just after it is called Therefore trace_svc_process does not provide any additional information not already provided by these other trace points. However, it would be useful to record the incoming RPC procedure. So reuse trace_svc_process for this purpose. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/trace/events/sunrpc.h | 30 ++++++++++++++++++++++++++---- net/sunrpc/svc.c | 9 +++------ 2 files changed, 29 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 1ec8c4c45766..5a8157c04900 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -508,6 +508,32 @@ TRACE_EVENT(svc_recv, show_rqstp_flags(__entry->flags)) ); +TRACE_EVENT(svc_process, + TP_PROTO(const struct svc_rqst *rqst, const char *name), + + TP_ARGS(rqst, name), + + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, vers) + __field(u32, proc) + __string(service, name) + __string(addr, rqst->rq_xprt->xpt_remotebuf) + ), + + TP_fast_assign( + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->vers = rqst->rq_vers; + __entry->proc = rqst->rq_proc; + __assign_str(service, name); + __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); + ), + + TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%u", + __get_str(addr), __entry->xid, + __get_str(service), __entry->vers, __entry->proc) +); + DECLARE_EVENT_CLASS(svc_rqst_event, TP_PROTO(struct svc_rqst *rqst), @@ -564,10 +590,6 @@ DECLARE_EVENT_CLASS(svc_rqst_status, __entry->status, show_rqstp_flags(__entry->flags)) ); -DEFINE_EVENT(svc_rqst_status, svc_process, - TP_PROTO(struct svc_rqst *rqst, int status), - TP_ARGS(rqst, status)); - DEFINE_EVENT(svc_rqst_status, svc_send, TP_PROTO(struct svc_rqst *rqst, int status), TP_ARGS(rqst, status)); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 387cc4add6f6..f19987f5d3b5 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1255,6 +1255,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) /* Syntactic check complete */ serv->sv_stats->rpccnt++; + trace_svc_process(rqstp, progp->pg_name); /* Build the reply header. */ statp = resv->iov_base +resv->iov_len; @@ -1431,14 +1432,10 @@ svc_process(struct svc_rqst *rqstp) } /* Returns 1 for send, 0 for drop */ - if (likely(svc_process_common(rqstp, argv, resv))) { - int ret = svc_send(rqstp); + if (likely(svc_process_common(rqstp, argv, resv))) + return svc_send(rqstp); - trace_svc_process(rqstp, ret); - return ret; - } out_drop: - trace_svc_process(rqstp, 0); svc_drop(rqstp); return 0; } -- cgit v1.2.3-59-g8ed1b From aaba72cd4e793fbf1c04e06dee3d2c3710339678 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Mar 2018 10:51:39 -0400 Subject: sunrpc: Report per-RPC execution stats Introduce a mechanism to report the server-side execution latency of each RPC. The goal is to enable user space to filter the trace record for latency outliers, build histograms, etc. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 1 + include/trace/events/sunrpc.h | 21 +++++++++++++++++++++ net/sunrpc/svc_xprt.c | 3 ++- 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 786ae2255f05..3bd7504066e1 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -283,6 +283,7 @@ struct svc_rqst { int rq_reserved; /* space on socket outq * reserved for this request */ + ktime_t rq_stime; /* start time */ struct cache_req rq_chandle; /* handle passed to caches for * request delaying diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5a8157c04900..1b383f71ccd7 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -705,6 +705,27 @@ TRACE_EVENT(svc_handle_xprt, __entry->len, show_svc_xprt_flags(__entry->flags)) ); +TRACE_EVENT(svc_stats_latency, + TP_PROTO(const struct svc_rqst *rqst), + + TP_ARGS(rqst), + + TP_STRUCT__entry( + __field(u32, xid) + __field(unsigned long, execute) + __string(addr, rqst->rq_xprt->xpt_remotebuf) + ), + + TP_fast_assign( + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->execute = ktime_to_us(ktime_sub(ktime_get(), + rqst->rq_stime)); + __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); + ), + + TP_printk("addr=%s xid=0x%08x execute-us=%lu", + __get_str(addr), __entry->xid, __entry->execute) +); DECLARE_EVENT_CLASS(svc_deferred_event, TP_PROTO(struct svc_deferred_req *dr), diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index f745754a55ea..a7425da14f5b 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -782,7 +782,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) len = svc_deferred_recv(rqstp); else len = xprt->xpt_ops->xpo_recvfrom(rqstp); - dprintk("svc: got len=%d\n", len); + rqstp->rq_stime = ktime_get(); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); } @@ -888,6 +888,7 @@ int svc_send(struct svc_rqst *rqstp) /* Grab mutex to serialize outgoing data. */ mutex_lock(&xprt->xpt_mutex); + trace_svc_stats_latency(rqstp); if (test_bit(XPT_DEAD, &xprt->xpt_flags) || test_bit(XPT_CLOSE, &xprt->xpt_flags)) len = -ENOTCONN; -- cgit v1.2.3-59-g8ed1b From 55f5088c22cc83dbc64394abfbf76cd1ff5e7cd0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Mar 2018 10:52:27 -0400 Subject: svc: Report xprt dequeue latency Record the time between when a rqstp is enqueued on a transport and when it is dequeued. This includes how long the rqstp waits on the queue and how long it takes the kernel scheduler to wake a nfsd thread to service it. The svc_xprt_dequeue trace point is altered to include the number of microseconds between xprt_enqueue and xprt_dequeue. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 1 + include/trace/events/sunrpc.h | 30 ++++++++++++++++++++++++++---- net/sunrpc/svc_xprt.c | 4 ++-- 3 files changed, 29 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3bd7504066e1..dc4c009deec1 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -272,6 +272,7 @@ struct svc_rqst { #define RQ_BUSY (6) /* request is busy */ #define RQ_DATA (7) /* request has data */ unsigned long rq_flags; /* flags field */ + ktime_t rq_qtime; /* enqueue time */ void * rq_argp; /* decoded arguments */ void * rq_resp; /* xdr'd results */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 1b383f71ccd7..922cb8968fb2 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -657,14 +657,36 @@ DECLARE_EVENT_CLASS(svc_xprt_event, show_svc_xprt_flags(__entry->flags)) ); -DEFINE_EVENT(svc_xprt_event, svc_xprt_dequeue, - TP_PROTO(struct svc_xprt *xprt), - TP_ARGS(xprt)); - DEFINE_EVENT(svc_xprt_event, svc_xprt_no_write_space, TP_PROTO(struct svc_xprt *xprt), TP_ARGS(xprt)); +TRACE_EVENT(svc_xprt_dequeue, + TP_PROTO(struct svc_rqst *rqst), + + TP_ARGS(rqst), + + TP_STRUCT__entry( + __field(struct svc_xprt *, xprt) + __field(unsigned long, flags) + __field(unsigned long, wakeup) + __string(addr, rqst->rq_xprt->xpt_remotebuf) + ), + + TP_fast_assign( + __entry->xprt = rqst->rq_xprt; + __entry->flags = rqst->rq_xprt->xpt_flags; + __entry->wakeup = ktime_to_us(ktime_sub(ktime_get(), + rqst->rq_qtime)); + __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); + ), + + TP_printk("xprt=%p addr=%s flags=%s wakeup-us=%lu", + __entry->xprt, __get_str(addr), + show_svc_xprt_flags(__entry->flags), + __entry->wakeup) +); + TRACE_EVENT(svc_wake_up, TP_PROTO(int pid), diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index a7425da14f5b..5185efb9027b 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -409,6 +409,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) continue; atomic_long_inc(&pool->sp_stats.threads_woken); + rqstp->rq_qtime = ktime_get(); wake_up_process(rqstp->rq_task); goto out_unlock; } @@ -530,7 +531,6 @@ void svc_wake_up(struct svc_serv *serv) if (test_bit(RQ_BUSY, &rqstp->rq_flags)) continue; rcu_read_unlock(); - dprintk("svc: daemon %p woken up.\n", rqstp); wake_up_process(rqstp->rq_task); trace_svc_wake_up(rqstp->rq_task->pid); return; @@ -726,7 +726,7 @@ out_found: rqstp->rq_chandle.thread_wait = 5*HZ; else rqstp->rq_chandle.thread_wait = 1*HZ; - trace_svc_xprt_dequeue(rqstp->rq_xprt); + trace_svc_xprt_dequeue(rqstp); return rqstp->rq_xprt; } -- cgit v1.2.3-59-g8ed1b From 8154ef2776aa512a3eaa0e7db030dc4803354d61 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Mar 2018 10:54:07 -0400 Subject: NFSD: Clean up legacy NFS WRITE argument XDR decoders Move common code in NFSD's legacy NFS WRITE decoders into a helper. The immediate benefit is reduction of code duplication and some nice micro-optimizations (see below). In the long term, this helper can perform a per-transport call-out to fill the rq_vec (say, using RDMA Reads). The legacy WRITE decoders and procs are changed to work like NFSv4, which constructs the rq_vec just before it is about to call vfs_writev. Why? Calling a transport call-out from the proc instead of the XDR decoder means that the incoming FH can be resolved to a particular filesystem and file. This would allow pages from the backing file to be presented to the transport to be filled, rather than presenting anonymous pages and copying or flipping them into the file's page cache later. I also prefer using the pages in rq_arg.pages, instead of pulling the data pages directly out of the rqstp::rq_pages array. This is currently the way the NFSv3 write decoder works, but the other two do not seem to take this approach. Fixing this removes the only reference to rq_pages found in NFSD, eliminating an NFSD assumption about how transports use the pages in rq_pages. Lastly, avoid setting up the first element of rq_vec as a zero- length buffer. This happens with an RDMA transport when a normal Read chunk is present because the data payload is in rq_arg's page list (none of it is in the head buffer). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3proc.c | 8 ++++++-- fs/nfsd/nfs3xdr.c | 16 ++++------------ fs/nfsd/nfsproc.c | 9 +++++++-- fs/nfsd/nfsxdr.c | 14 ++------------ fs/nfsd/xdr.h | 2 +- fs/nfsd/xdr3.h | 2 +- include/linux/sunrpc/svc.h | 2 ++ net/sunrpc/svc.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 65 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 1d0ce3c57d93..2dd95ebf4935 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -192,6 +192,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp) struct nfsd3_writeres *resp = rqstp->rq_resp; __be32 nfserr; unsigned long cnt = argp->len; + unsigned int nvecs; dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", SVCFH_fmt(&argp->fh), @@ -201,9 +202,12 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; + nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt); + if (!nvecs) + RETURN_STATUS(nfserr_io); nfserr = nfsd_write(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, argp->vlen, - &cnt, resp->committed); + rqstp->rq_vec, nvecs, &cnt, + resp->committed); resp->count = cnt; RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 1a70581e1cb2..e19fc5d8bcb5 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -391,7 +391,7 @@ int nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_writeargs *args = rqstp->rq_argp; - unsigned int len, v, hdr, dlen; + unsigned int len, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); struct kvec *head = rqstp->rq_arg.head; struct kvec *tail = rqstp->rq_arg.tail; @@ -433,17 +433,9 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) args->count = max_blocksize; len = args->len = max_blocksize; } - rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = head->iov_len - hdr; - v = 0; - while (len > rqstp->rq_vec[v].iov_len) { - len -= rqstp->rq_vec[v].iov_len; - v++; - rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); - rqstp->rq_vec[v].iov_len = PAGE_SIZE; - } - rqstp->rq_vec[v].iov_len = len; - args->vlen = v + 1; + + args->first.iov_base = (void *)p; + args->first.iov_len = head->iov_len - hdr; return 1; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 43c0419b8ddb..1995ea6bfd2b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -212,13 +212,18 @@ nfsd_proc_write(struct svc_rqst *rqstp) struct nfsd_attrstat *resp = rqstp->rq_resp; __be32 nfserr; unsigned long cnt = argp->len; + unsigned int nvecs; dprintk("nfsd: WRITE %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, - rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC); + nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt); + if (!nvecs) + return nfserr_io; + nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), + argp->offset, rqstp->rq_vec, nvecs, + &cnt, NFS_DATA_SYNC); return nfsd_return_attrs(nfserr, resp); } diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 79b6064f8977..db24ae8b67e0 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -287,7 +287,6 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) struct nfsd_writeargs *args = rqstp->rq_argp; unsigned int len, hdr, dlen; struct kvec *head = rqstp->rq_arg.head; - int v; p = decode_fh(p, &args->fh); if (!p) @@ -323,17 +322,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) if (dlen < XDR_QUADLEN(len)*4) return 0; - rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = head->iov_len - hdr; - v = 0; - while (len > rqstp->rq_vec[v].iov_len) { - len -= rqstp->rq_vec[v].iov_len; - v++; - rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); - rqstp->rq_vec[v].iov_len = PAGE_SIZE; - } - rqstp->rq_vec[v].iov_len = len; - args->vlen = v + 1; + args->first.iov_base = (void *)p; + args->first.iov_len = head->iov_len - hdr; return 1; } diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 2f4f22e6b8cb..a765c414015e 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -34,7 +34,7 @@ struct nfsd_writeargs { svc_fh fh; __u32 offset; int len; - int vlen; + struct kvec first; }; struct nfsd_createargs { diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 056bf8a7364e..deccf7f691e9 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -41,7 +41,7 @@ struct nfsd3_writeargs { __u32 count; int stable; __u32 len; - int vlen; + struct kvec first; }; struct nfsd3_createargs { diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index dc4c009deec1..fb3fcacc1e98 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -495,6 +495,8 @@ void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); char * svc_print_addr(struct svc_rqst *, char *, size_t); +unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, + struct kvec *first, size_t total); #define RPC_MAX_ADDRBUFLEN (63U) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index f19987f5d3b5..a155e2de19aa 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1533,3 +1533,45 @@ u32 svc_max_payload(const struct svc_rqst *rqstp) return max; } EXPORT_SYMBOL_GPL(svc_max_payload); + +/** + * svc_fill_write_vector - Construct data argument for VFS write call + * @rqstp: svc_rqst to operate on + * @first: buffer containing first section of write payload + * @total: total number of bytes of write payload + * + * Returns the number of elements populated in the data argument array. + */ +unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first, + size_t total) +{ + struct kvec *vec = rqstp->rq_vec; + struct page **pages; + unsigned int i; + + /* Some types of transport can present the write payload + * entirely in rq_arg.pages. In this case, @first is empty. + */ + i = 0; + if (first->iov_len) { + vec[i].iov_base = first->iov_base; + vec[i].iov_len = min_t(size_t, total, first->iov_len); + total -= vec[i].iov_len; + ++i; + } + + WARN_ON_ONCE(rqstp->rq_arg.page_base != 0); + pages = rqstp->rq_arg.pages; + while (total) { + vec[i].iov_base = page_address(*pages); + vec[i].iov_len = min_t(size_t, total, PAGE_SIZE); + total -= vec[i].iov_len; + ++i; + + ++pages; + } + + WARN_ON_ONCE(i > ARRAY_SIZE(rqstp->rq_vec)); + return i; +} +EXPORT_SYMBOL_GPL(svc_fill_write_vector); -- cgit v1.2.3-59-g8ed1b From 38a70315599dedacd9ff3bd1016f9048c9d0ad12 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Mar 2018 10:54:21 -0400 Subject: NFSD: Clean up legacy NFS SYMLINK argument XDR decoders Move common code in NFSD's legacy SYMLINK decoders into a helper. The immediate benefits include: - one fewer data copies on transports that support DDP - consistent error checking across all versions - reduction of code duplication - support for both legal forms of SYMLINK requests on RDMA transports for all versions of NFS (in particular, NFSv2, for completeness) In the long term, this helper is an appropriate spot to perform a per-transport call-out to fill the pathname argument using, say, RDMA Reads. Filling the pathname in the proc function also means that eventually the incoming filehandle can be interpreted so that filesystem- specific memory can be allocated as a sink for the pathname argument, rather than using anonymous pages. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3proc.c | 10 +++++++ fs/nfsd/nfs3xdr.c | 51 +++++++++-------------------------- fs/nfsd/nfsproc.c | 14 +++++----- fs/nfsd/nfsxdr.c | 49 +++++++++++++++++++-------------- fs/nfsd/xdr.h | 1 + fs/nfsd/xdr3.h | 1 + fs/nfsd/xdr4.h | 2 ++ include/linux/sunrpc/svc.h | 2 ++ net/sunrpc/svc.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 132 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 2dd95ebf4935..6259a4b8579f 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -283,6 +283,16 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp) struct nfsd3_diropres *resp = rqstp->rq_resp; __be32 nfserr; + if (argp->tlen == 0) + RETURN_STATUS(nfserr_inval); + if (argp->tlen > NFS3_MAXPATHLEN) + RETURN_STATUS(nfserr_nametoolong); + + argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, + argp->tlen); + if (IS_ERR(argp->tname)) + RETURN_STATUS(nfserrno(PTR_ERR(argp->tname))); + dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index e19fc5d8bcb5..3192b544a441 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -481,51 +481,24 @@ int nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_symlinkargs *args = rqstp->rq_argp; - unsigned int len, avail; - char *old, *new; - struct kvec *vec; + char *base = (char *)p; + size_t dlen; if (!(p = decode_fh(p, &args->ffh)) || - !(p = decode_filename(p, &args->fname, &args->flen)) - ) + !(p = decode_filename(p, &args->fname, &args->flen))) return 0; p = decode_sattr3(p, &args->attrs); - /* now decode the pathname, which might be larger than the first page. - * As we have to check for nul's anyway, we copy it into a new page - * This page appears in the rq_res.pages list, but as pages_len is always - * 0, it won't get in the way - */ - len = ntohl(*p++); - if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE) - return 0; - args->tname = new = page_address(*(rqstp->rq_next_page++)); - args->tlen = len; - /* first copy and check from the first page */ - old = (char*)p; - vec = &rqstp->rq_arg.head[0]; - if ((void *)old > vec->iov_base + vec->iov_len) - return 0; - avail = vec->iov_len - (old - (char*)vec->iov_base); - while (len && avail && *old) { - *new++ = *old++; - len--; - avail--; - } - /* now copy next page if there is one */ - if (len && !avail && rqstp->rq_arg.page_len) { - avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE); - old = page_address(rqstp->rq_arg.pages[0]); - } - while (len && avail && *old) { - *new++ = *old++; - len--; - avail--; - } - *new = '\0'; - if (len) - return 0; + args->tlen = ntohl(*p++); + args->first.iov_base = p; + args->first.iov_len = rqstp->rq_arg.head[0].iov_len; + args->first.iov_len -= (char *)p - base; + + dlen = args->first.iov_len + rqstp->rq_arg.page_len + + rqstp->rq_arg.tail[0].iov_len; + if (dlen < XDR_QUADLEN(args->tlen) << 2) + return 0; return 1; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 1995ea6bfd2b..f107f9fa8e15 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -449,17 +449,19 @@ nfsd_proc_symlink(struct svc_rqst *rqstp) struct svc_fh newfh; __be32 nfserr; + if (argp->tlen > NFS_MAXPATHLEN) + return nfserr_nametoolong; + + argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, + argp->tlen); + if (IS_ERR(argp->tname)) + return nfserrno(PTR_ERR(argp->tname)); + dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, argp->tlen, argp->tname); fh_init(&newfh, NFS_FHSIZE); - /* - * Crazy hack: the request fits in a page, and already-decoded - * attributes follow argp->tname, so it's safe to just write a - * null to ensure it's null-terminated: - */ - argp->tname[argp->tlen] = '\0'; nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, argp->tname, &newfh); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index db24ae8b67e0..a43e8260520a 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -70,22 +70,6 @@ decode_filename(__be32 *p, char **namp, unsigned int *lenp) return p; } -static __be32 * -decode_pathname(__be32 *p, char **namp, unsigned int *lenp) -{ - char *name; - unsigned int i; - - if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) { - for (i = 0, name = *namp; i < *lenp; i++, name++) { - if (*name == '\0') - return NULL; - } - } - - return p; -} - static __be32 * decode_sattr(__be32 *p, struct iattr *iap) { @@ -384,14 +368,39 @@ int nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_symlinkargs *args = rqstp->rq_argp; + char *base = (char *)p; + size_t xdrlen; if ( !(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen)) - || !(p = decode_pathname(p, &args->tname, &args->tlen))) + || !(p = decode_filename(p, &args->fname, &args->flen))) return 0; - p = decode_sattr(p, &args->attrs); - return xdr_argsize_check(rqstp, p); + args->tlen = ntohl(*p++); + if (args->tlen == 0) + return 0; + + args->first.iov_base = p; + args->first.iov_len = rqstp->rq_arg.head[0].iov_len; + args->first.iov_len -= (char *)p - base; + + /* This request is never larger than a page. Therefore, + * transport will deliver either: + * 1. pathname in the pagelist -> sattr is in the tail. + * 2. everything in the head buffer -> sattr is in the head. + */ + if (rqstp->rq_arg.page_len) { + if (args->tlen != rqstp->rq_arg.page_len) + return 0; + p = rqstp->rq_arg.tail[0].iov_base; + } else { + xdrlen = XDR_QUADLEN(args->tlen); + if (xdrlen > args->first.iov_len - (8 * sizeof(__be32))) + return 0; + p += xdrlen; + } + decode_sattr(p, &args->attrs); + + return 1; } int diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index a765c414015e..ea7cca3a64b7 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -72,6 +72,7 @@ struct nfsd_symlinkargs { char * tname; unsigned int tlen; struct iattr attrs; + struct kvec first; }; struct nfsd_readdirargs { diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index deccf7f691e9..2cb29e961a76 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -90,6 +90,7 @@ struct nfsd3_symlinkargs { char * tname; unsigned int tlen; struct iattr attrs; + struct kvec first; }; struct nfsd3_readdirargs { diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 7cbc129092fe..468020fe2a07 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -110,6 +110,7 @@ struct nfsd4_create { struct { u32 datalen; char *data; + struct kvec first; } link; /* NF4LNK */ struct { u32 specdata1; @@ -124,6 +125,7 @@ struct nfsd4_create { }; #define cr_datalen u.link.datalen #define cr_data u.link.data +#define cr_first u.link.first #define cr_specdata1 u.dev.specdata1 #define cr_specdata2 u.dev.specdata2 diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index fb3fcacc1e98..574368e8a16f 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -497,6 +497,8 @@ struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); char * svc_print_addr(struct svc_rqst *, char *, size_t); unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first, size_t total); +char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, + struct kvec *first, size_t total); #define RPC_MAX_ADDRBUFLEN (63U) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a155e2de19aa..30a4226baf03 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1575,3 +1575,70 @@ unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first, return i; } EXPORT_SYMBOL_GPL(svc_fill_write_vector); + +/** + * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call + * @rqstp: svc_rqst to operate on + * @first: buffer containing first section of pathname + * @total: total length of the pathname argument + * + * Returns pointer to a NUL-terminated string, or an ERR_PTR. The buffer is + * released automatically when @rqstp is recycled. + */ +char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, + size_t total) +{ + struct xdr_buf *arg = &rqstp->rq_arg; + struct page **pages; + char *result; + + /* VFS API demands a NUL-terminated pathname. This function + * uses a page from @rqstp as the pathname buffer, to enable + * direct placement. Thus the total buffer size is PAGE_SIZE. + * Space in this buffer for NUL-termination requires that we + * cap the size of the returned symlink pathname just a + * little early. + */ + if (total > PAGE_SIZE - 1) + return ERR_PTR(-ENAMETOOLONG); + + /* Some types of transport can present the pathname entirely + * in rq_arg.pages. If not, then copy the pathname into one + * page. + */ + pages = arg->pages; + WARN_ON_ONCE(arg->page_base != 0); + if (first->iov_base == 0) { + result = page_address(*pages); + result[total] = '\0'; + } else { + size_t len, remaining; + char *dst; + + result = page_address(*(rqstp->rq_next_page++)); + dst = result; + remaining = total; + + len = min_t(size_t, total, first->iov_len); + memcpy(dst, first->iov_base, len); + dst += len; + remaining -= len; + + /* No more than one page left */ + if (remaining) { + len = min_t(size_t, remaining, PAGE_SIZE); + memcpy(dst, page_address(*pages), len); + dst += len; + } + + *dst = '\0'; + } + + /* Sanity check: we don't allow the pathname argument to + * contain a NUL byte. + */ + if (strlen(result) != total) + return ERR_PTR(-EINVAL); + return result; +} +EXPORT_SYMBOL_GPL(svc_fill_symlink_pathname); -- cgit v1.2.3-59-g8ed1b From f3aefb6a7066e24bfea7fcf1b07907576de69d63 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 28 Mar 2018 10:57:22 -0700 Subject: sunrpc: remove incorrect HMAC request initialization make_checksum_hmac_md5() is allocating an HMAC transform and doing crypto API calls in the following order: crypto_ahash_init() crypto_ahash_setkey() crypto_ahash_digest() This is wrong because it makes no sense to init() the request before a key has been set, given that the initial state depends on the key. And digest() is short for init() + update() + final(), so in this case there's no need to explicitly call init() at all. Before commit 9fa68f620041 ("crypto: hash - prevent using keyed hashes without setting key") the extra init() had no real effect, at least for the software HMAC implementation. (There are also hardware drivers that implement HMAC-MD5, and it's not immediately obvious how gracefully they handle init() before setkey().) But now the crypto API detects this incorrect initialization and returns -ENOKEY. This is breaking NFS mounts in some cases. Fix it by removing the incorrect call to crypto_ahash_init(). Reported-by: Michael Young Fixes: 9fa68f620041 ("crypto: hash - prevent using keyed hashes without setting key") Fixes: fffdaef2eb4a ("gss_krb5: Add support for rc4-hmac encryption") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/gss_krb5_crypto.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 12649c9fedab..8654494b4d0a 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -237,9 +237,6 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); - err = crypto_ahash_init(req); - if (err) - goto out; err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength); if (err) goto out; -- cgit v1.2.3-59-g8ed1b