Diffstat (limited to 'net/sunrpc')
60 files changed, 7742 insertions, 5622 deletions
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 3bcf985507be..bbbb5af0af13 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -21,7 +21,6 @@ config RPCSEC_GSS_KRB5 depends on SUNRPC && CRYPTO depends on CRYPTO_MD5 && CRYPTO_DES && CRYPTO_CBC && CRYPTO_CTS depends on CRYPTO_ECB && CRYPTO_HMAC && CRYPTO_SHA1 && CRYPTO_AES - depends on CRYPTO_ARC4 default y select SUNRPC_GSS help diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 9488600451e8..1c8de397d6ad 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ - sunrpc_syms.o cache.o rpc_pipe.o \ + sunrpc_syms.o cache.o rpc_pipe.o sysfs.o \ svc_xprt.o \ xprtmultipath.o sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index 8b4d72b1a066..d435bffc6199 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -82,11 +82,11 @@ static size_t rpc_ntop6(const struct sockaddr *sap, rc = snprintf(scopebuf, sizeof(scopebuf), "%c%u", IPV6_SCOPE_DELIMITER, sin6->sin6_scope_id); - if (unlikely((size_t)rc > sizeof(scopebuf))) + if (unlikely((size_t)rc >= sizeof(scopebuf))) return 0; len += rc; - if (unlikely(len > buflen)) + if (unlikely(len >= buflen)) return 0; strcat(buf, scopebuf); @@ -162,8 +162,10 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, const size_t buflen, const char *delim, struct sockaddr_in6 *sin6) { - char *p; + char p[IPV6_SCOPE_ID_LEN + 1]; size_t len; + u32 scope_id = 0; + struct net_device *dev; if ((buf + buflen) == delim) return 1; @@ -175,29 +177,23 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, return 0; len = (buf + buflen) - delim - 1; - p = kmemdup_nul(delim + 1, len, GFP_KERNEL); - if (p) { - u32 scope_id = 0; - struct net_device *dev; - - dev = dev_get_by_name(net, p); - if (dev != NULL) { - scope_id = dev->ifindex; - dev_put(dev); - } else { - if (kstrtou32(p, 10, &scope_id) == 0) { - kfree(p); - return 0; - } - } - - kfree(p); - - sin6->sin6_scope_id = scope_id; - return 1; + if (len > IPV6_SCOPE_ID_LEN) + return 0; + + memcpy(p, delim + 1, len); + p[len] = 0; + + dev = dev_get_by_name(net, p); + if (dev != NULL) { + scope_id = dev->ifindex; + dev_put(dev); + } else { + if (kstrtou32(p, 10, &scope_id) != 0) + return 0; } - return 0; + sin6->sin6_scope_id = scope_id; + return 1; } static size_t rpc_pton6(struct net *net, const char *buf, const size_t buflen, diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 5748ad0ba1bd..fb75a883503f 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -81,7 +81,7 @@ static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp) unsigned int nbits; nbits = *(unsigned int *)kp->arg; - return sprintf(buffer, "%u", 1U << nbits); + return sprintf(buffer, "%u\n", 1U << nbits); } #define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int); @@ -445,7 +445,7 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) * Enforce a 60 second garbage collection moratorium * Note that the cred_unused list must be time-ordered. 
*/ - if (!time_in_range(cred->cr_expire, expired, jiffies)) + if (time_in_range(cred->cr_expire, expired, jiffies)) continue; if (!rpcauth_unhash_cred(cred)) continue; @@ -615,6 +615,8 @@ rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) }; struct rpc_cred *ret; + if (RPC_IS_ASYNC(task)) + lookupflags |= RPCAUTH_LOOKUP_ASYNC; ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags); put_cred(acred.cred); return ret; @@ -631,6 +633,8 @@ rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags) if (!acred.principal) return NULL; + if (RPC_IS_ASYNC(task)) + lookupflags |= RPCAUTH_LOOKUP_ASYNC; return auth->au_ops->lookup_cred(auth, &acred, lookupflags); } @@ -654,7 +658,7 @@ rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags) }; if (flags & RPC_TASK_ASYNC) - lookupflags |= RPCAUTH_LOOKUP_NEW; + lookupflags |= RPCAUTH_LOOKUP_NEW | RPCAUTH_LOOKUP_ASYNC; if (task->tk_op_cred) /* Task must use exactly this rpc_cred */ new = get_rpccred(task->tk_op_cred); @@ -666,7 +670,7 @@ rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags) /* If machine cred couldn't be bound, try a root cred */ if (new) ; - else if (cred == &machine_cred || (flags & RPC_TASK_ROOTCREDS)) + else if (cred == &machine_cred) new = rpcauth_bind_root_cred(task, lookupflags); else if (flags & RPC_TASK_NULLCREDS) new = authnull_ops.lookup_cred(NULL, NULL, 0); @@ -870,7 +874,7 @@ int __init rpcauth_init_module(void) err = rpc_init_authunix(); if (err < 0) goto out1; - err = register_shrinker(&rpc_cred_shrinker); + err = register_shrinker(&rpc_cred_shrinker, "sunrpc_cred"); if (err < 0) goto out2; return 0; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 24ca861815b1..7bb247c51e2f 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -20,6 +20,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/auth_gss.h> +#include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/gss_err.h> #include <linux/workqueue.h> @@ -28,6 +29,7 @@ #include <linux/uaccess.h> #include <linux/hashtable.h> +#include "auth_gss_internal.h" #include "../netns.h" #include <trace/events/rpcgss.h> @@ -70,7 +72,8 @@ struct gss_auth { struct gss_api_mech *mech; enum rpc_gss_svc service; struct rpc_clnt *client; - struct net *net; + struct net *net; + netns_tracker ns_tracker; /* * There are two upcall pipes; dentry[1], named "gssd", is used * for the new text-based upcall; dentry[0] is named after the @@ -124,35 +127,6 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); } -static const void * -simple_get_bytes(const void *p, const void *end, void *res, size_t len) -{ - const void *q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - memcpy(res, p, len); - return q; -} - -static inline const void * -simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) -{ - const void *q; - unsigned int len; - - p = simple_get_bytes(p, end, &len, sizeof(len)); - if (IS_ERR(p)) - return p; - q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - dest->data = kmemdup(p, len, GFP_NOFS); - if (unlikely(dest->data == NULL)) - return ERR_PTR(-ENOMEM); - dest->len = len; - return q; -} - static struct gss_cl_ctx * gss_cred_get_ctx(struct rpc_cred *cred) { @@ -172,7 +146,7 @@ gss_alloc_context(void) { struct 
gss_cl_ctx *ctx; - ctx = kzalloc(sizeof(*ctx), GFP_NOFS); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { ctx->gc_proc = RPC_GSS_PROC_DATA; ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ @@ -235,7 +209,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct p = ERR_PTR(-EFAULT); goto err; } - ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_NOFS); + ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_KERNEL); if (ret < 0) { trace_rpcgss_import_ctx(ret); p = ERR_PTR(ret); @@ -253,7 +227,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct if (IS_ERR(p)) goto err; done: - trace_rpcgss_context(ctx->gc_expiry, now, timeout, + trace_rpcgss_context(window_size, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len, ctx->gc_acceptor.data); err: return p; @@ -537,7 +511,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, int vers; int err = -ENOMEM; - gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS); + gss_msg = kzalloc(sizeof(*gss_msg), GFP_KERNEL); if (gss_msg == NULL) goto err; vers = get_pipe_version(gss_auth->net); @@ -553,7 +527,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, gss_msg->auth = gss_auth; kref_get(&gss_auth->kref); if (service_name) { - gss_msg->service_name = kstrdup_const(service_name, GFP_NOFS); + gss_msg->service_name = kstrdup_const(service_name, GFP_KERNEL); if (!gss_msg->service_name) { err = -ENOMEM; goto err_put_pipe_version; @@ -696,10 +670,12 @@ retry: } schedule(); } - if (gss_msg->ctx) + if (gss_msg->ctx) { + trace_rpcgss_ctx_init(gss_cred); gss_cred_set_ctx(cred, gss_msg->ctx); - else + } else { err = gss_msg->msg.errno; + } spin_unlock(&pipe->lock); out_intr: finish_wait(&gss_msg->waitqueue, &wait); @@ -727,7 +703,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (mlen > MSG_BUF_MAXSIZE) goto out; err = -ENOMEM; - buf = kmalloc(mlen, GFP_NOFS); + buf = kmalloc(mlen, GFP_KERNEL); if (!buf) goto out; @@ -1038,7 +1014,8 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) goto err_free; } gss_auth->client = clnt; - gss_auth->net = get_net(rpc_net_ns(clnt)); + gss_auth->net = get_net_track(rpc_net_ns(clnt), &gss_auth->ns_tracker, + GFP_KERNEL); err = -EINVAL; gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); if (!gss_auth->mech) @@ -1050,14 +1027,14 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) goto err_put_mech; auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; - auth->au_rslack = GSS_VERF_SLACK >> 2; + auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2; auth->au_verfsize = GSS_VERF_SLACK >> 2; auth->au_ralign = GSS_VERF_SLACK >> 2; - auth->au_flags = 0; + __set_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags); auth->au_ops = &authgss_ops; auth->au_flavor = flavor; if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) - auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; + __set_bit(RPCAUTH_AUTH_DATATOUCH, &auth->au_flags); refcount_set(&auth->au_count, 1); kref_init(&gss_auth->kref); @@ -1093,7 +1070,7 @@ err_destroy_credcache: err_put_mech: gss_mech_put(gss_auth->mech); err_put_net: - put_net(gss_auth->net); + put_net_track(gss_auth->net, &gss_auth->ns_tracker); err_free: kfree(gss_auth->target_name); kfree(gss_auth); @@ -1109,7 +1086,7 @@ gss_free(struct gss_auth *gss_auth) gss_pipe_free(gss_auth->gss_pipe[0]); gss_pipe_free(gss_auth->gss_pipe[1]); gss_mech_put(gss_auth->mech); - put_net(gss_auth->net); + put_net_track(gss_auth->net, 
&gss_auth->ns_tracker); kfree(gss_auth->target_name); kfree(gss_auth); @@ -1243,7 +1220,7 @@ gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred) struct gss_cred *new; /* Make a copy of the cred so that we can reference count it */ - new = kzalloc(sizeof(*gss_cred), GFP_NOFS); + new = kzalloc(sizeof(*gss_cred), GFP_KERNEL); if (new) { struct auth_cred acred = { .cred = gss_cred->gc_base.cr_cred, @@ -1283,8 +1260,9 @@ gss_send_destroy_context(struct rpc_cred *cred) if (new) { ctx->gc_proc = RPC_GSS_PROC_DESTROY; + trace_rpcgss_ctx_destroy(gss_cred); task = rpc_call_null(gss_auth->client, &new->gc_base, - RPC_TASK_ASYNC|RPC_TASK_SOFT); + RPC_TASK_ASYNC); if (!IS_ERR(task)) rpc_put_task(task); @@ -1348,7 +1326,6 @@ gss_destroy_nullcred(struct rpc_cred *cred) static void gss_destroy_cred(struct rpc_cred *cred) { - if (test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) gss_send_destroy_context(cred); gss_destroy_nullcred(cred); @@ -1363,10 +1340,11 @@ gss_hash_cred(struct auth_cred *acred, unsigned int hashbits) /* * Lookup RPCSEC_GSS cred for the current process */ -static struct rpc_cred * -gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +static struct rpc_cred *gss_lookup_cred(struct rpc_auth *auth, + struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS); + return rpcauth_lookup_credcache(auth, acred, flags, + rpc_task_gfp_mask()); } static struct rpc_cred * @@ -1612,6 +1590,7 @@ static int gss_renew_cred(struct rpc_task *task) new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW); if (IS_ERR(new)) return PTR_ERR(new); + task->tk_rqstp->rq_cred = new; put_rpccred(oldcred); return 0; @@ -1691,7 +1670,7 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr) if (!p) goto validate_failed; - seq = kmalloc(4, GFP_NOFS); + seq = kmalloc(4, GFP_KERNEL); if (!seq) goto validate_failed; *seq = cpu_to_be32(task->tk_rqstp->rq_seqno); @@ -1708,7 +1687,8 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr) /* We leave it to unwrap to calculate au_rslack. 
For now we just * calculate the length of the verifier: */ - cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; + if (test_bit(RPCAUTH_AUTH_UPDATE_SLACK, &cred->cr_auth->au_flags)) + cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; status = 0; out: gss_put_ctx(ctx); @@ -1724,8 +1704,9 @@ bad_mic: goto out; } -static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, - struct rpc_task *task, struct xdr_stream *xdr) +static noinline_for_stack int +gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf; @@ -1799,11 +1780,11 @@ alloc_enc_pages(struct rpc_rqst *rqstp) rqstp->rq_enc_pages = kmalloc_array(rqstp->rq_enc_pages_num, sizeof(struct page *), - GFP_NOFS); + GFP_KERNEL); if (!rqstp->rq_enc_pages) goto out; for (i=0; i < rqstp->rq_enc_pages_num; i++) { - rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS); + rqstp->rq_enc_pages[i] = alloc_page(GFP_KERNEL); if (rqstp->rq_enc_pages[i] == NULL) goto out_free; } @@ -1816,8 +1797,9 @@ out: return -EAGAIN; } -static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, - struct rpc_task *task, struct xdr_stream *xdr) +static noinline_for_stack int +gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; @@ -1877,7 +1859,7 @@ static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, else iov = snd_buf->head; p = iov->iov_base + iov->iov_len; - pad = 3 - ((snd_buf->len - offset - 1) & 3); + pad = xdr_pad_size(snd_buf->len - offset); memset(p, 0, pad); iov->iov_len += pad; snd_buf->len += pad; @@ -1924,73 +1906,125 @@ out: return status; } -static int -gss_unwrap_resp_auth(struct rpc_cred *cred) +/** + * gss_update_rslack - Possibly update RPC receive buffer size estimates + * @task: rpc_task for incoming RPC Reply being unwrapped + * @cred: controlling rpc_cred for @task + * @before: XDR words needed before each RPC Reply message + * @after: XDR words needed following each RPC Reply message + * + */ +static void gss_update_rslack(struct rpc_task *task, struct rpc_cred *cred, + unsigned int before, unsigned int after) { struct rpc_auth *auth = cred->cr_auth; - auth->au_rslack = auth->au_verfsize; - auth->au_ralign = auth->au_verfsize; - return 0; + if (test_and_clear_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags)) { + auth->au_ralign = auth->au_verfsize + before; + auth->au_rslack = auth->au_verfsize + after; + trace_rpcgss_update_slack(task, auth); + } } static int +gss_unwrap_resp_auth(struct rpc_task *task, struct rpc_cred *cred) +{ + gss_update_rslack(task, cred, 0, 0); + return 0; +} + +/* + * RFC 2203, Section 5.3.2.2 + * + * struct rpc_gss_integ_data { + * opaque databody_integ<>; + * opaque checksum<>; + * }; + * + * struct rpc_gss_data_t { + * unsigned int seq_num; + * proc_req_arg_t arg; + * }; + */ +static noinline_for_stack int gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) { - struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf; - u32 data_offset, mic_offset, integ_len, maj_stat; - struct rpc_auth *auth = cred->cr_auth; + struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf; + u32 len, offset, seqno, maj_stat; struct xdr_netobj mic; - __be32 *p; + int ret; - p = xdr_inline_decode(xdr, 2 * 
sizeof(*p)); - if (unlikely(!p)) + ret = -EIO; + mic.data = NULL; + + /* opaque databody_integ<>; */ + if (xdr_stream_decode_u32(xdr, &len)) goto unwrap_failed; - integ_len = be32_to_cpup(p++); - if (integ_len & 3) + if (len & 3) goto unwrap_failed; - data_offset = (u8 *)(p) - (u8 *)rcv_buf->head[0].iov_base; - mic_offset = integ_len + data_offset; - if (mic_offset > rcv_buf->len) + offset = rcv_buf->len - xdr_stream_remaining(xdr); + if (xdr_stream_decode_u32(xdr, &seqno)) goto unwrap_failed; - if (be32_to_cpup(p) != rqstp->rq_seqno) + if (seqno != rqstp->rq_seqno) goto bad_seqno; + if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len)) + goto unwrap_failed; - if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len)) + /* + * The xdr_stream now points to the beginning of the + * upper layer payload, to be passed below to + * rpcauth_unwrap_resp_decode(). The checksum, which + * follows the upper layer payload in @rcv_buf, is + * located and parsed without updating the xdr_stream. + */ + + /* opaque checksum<>; */ + offset += len; + if (xdr_decode_word(rcv_buf, offset, &len)) goto unwrap_failed; - if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset)) + offset += sizeof(__be32); + if (offset + len > rcv_buf->len) goto unwrap_failed; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); + mic.len = len; + mic.data = kmalloc(len, GFP_KERNEL); + if (ZERO_OR_NULL_PTR(mic.data)) + goto unwrap_failed; + if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len)) + goto unwrap_failed; + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; - auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len); - auth->au_ralign = auth->au_verfsize + 2; - return 0; + gss_update_rslack(task, cred, 2, 2 + 1 + XDR_QUADLEN(mic.len)); + ret = 0; + +out: + kfree(mic.data); + return ret; + unwrap_failed: trace_rpcgss_unwrap_failed(task); - return -EIO; + goto out; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(p)); - return -EIO; + trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno); + goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); - return -EIO; + goto out; } -static int +static noinline_for_stack int gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) { struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; struct kvec *head = rqstp->rq_rcv_buf.head; - struct rpc_auth *auth = cred->cr_auth; - unsigned int savedlen = rcv_buf->len; u32 offset, opaque_len, maj_stat; __be32 *p; @@ -2001,9 +2035,9 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, offset = (u8 *)(p) - (u8 *)head->iov_base; if (offset + opaque_len > rcv_buf->len) goto unwrap_failed; - rcv_buf->len = offset + opaque_len; - maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf); + maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, + offset + opaque_len, rcv_buf); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat != GSS_S_COMPLETE) @@ -2017,10 +2051,9 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, */ xdr_init_decode(xdr, rcv_buf, p, rqstp); - auth->au_rslack = auth->au_verfsize + 2 + - XDR_QUADLEN(savedlen - rcv_buf->len); - auth->au_ralign = auth->au_verfsize + 2 + - XDR_QUADLEN(savedlen - rcv_buf->len); + gss_update_rslack(task, cred, 2 + ctx->gc_gss_ctx->align, + 2 + 
ctx->gc_gss_ctx->slack); + return 0; unwrap_failed: trace_rpcgss_unwrap_failed(task); @@ -2090,7 +2123,7 @@ gss_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr) goto out_decode; switch (gss_cred->gc_service) { case RPC_GSS_SVC_NONE: - status = gss_unwrap_resp_auth(cred); + status = gss_unwrap_resp_auth(task, cred); break; case RPC_GSS_SVC_INTEGRITY: status = gss_unwrap_resp_integ(task, cred, ctx, rqstp, xdr); diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h new file mode 100644 index 000000000000..c53b329092d4 --- /dev/null +++ b/net/sunrpc/auth_gss/auth_gss_internal.h @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * linux/net/sunrpc/auth_gss/auth_gss_internal.h + * + * Internal definitions for RPCSEC_GSS client authentication + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + */ +#include <linux/err.h> +#include <linux/string.h> +#include <linux/sunrpc/xdr.h> + +static inline const void * +simple_get_bytes(const void *p, const void *end, void *res, size_t len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + memcpy(res, p, len); + return q; +} + +static inline const void * +simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) +{ + const void *q; + unsigned int len; + + p = simple_get_bytes(p, end, &len, sizeof(len)); + if (IS_ERR(p)) + return p; + q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + if (len) { + dest->data = kmemdup(p, len, GFP_KERNEL); + if (unlikely(dest->data == NULL)) + return ERR_PTR(-ENOMEM); + } else + dest->data = NULL; + dest->len = len; + return q; +} diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c index fe97f3106536..4a4082bb22ad 100644 --- a/net/sunrpc/auth_gss/gss_generic_token.c +++ b/net/sunrpc/auth_gss/gss_generic_token.c @@ -222,10 +222,8 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size, if (ret) return ret; - if (!ret) { - *buf_in = buf; - *body_size = toksize; - } + *buf_in = buf; + *body_size = toksize; return ret; } diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 6f2d30d7b766..3ea58175e159 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -138,135 +138,6 @@ checksummer(struct scatterlist *sg, void *data) return crypto_ahash_update(req); } -static int -arcfour_hmac_md5_usage_to_salt(unsigned int usage, u8 salt[4]) -{ - unsigned int ms_usage; - - switch (usage) { - case KG_USAGE_SIGN: - ms_usage = 15; - break; - case KG_USAGE_SEAL: - ms_usage = 13; - break; - default: - return -EINVAL; - } - salt[0] = (ms_usage >> 0) & 0xff; - salt[1] = (ms_usage >> 8) & 0xff; - salt[2] = (ms_usage >> 16) & 0xff; - salt[3] = (ms_usage >> 24) & 0xff; - - return 0; -} - -static u32 -make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, - struct xdr_buf *body, int body_offset, u8 *cksumkey, - unsigned int usage, struct xdr_netobj *cksumout) -{ - struct scatterlist sg[1]; - int err = -1; - u8 *checksumdata; - u8 *rc4salt; - struct crypto_ahash *md5; - struct crypto_ahash *hmac_md5; - struct ahash_request *req; - - if (cksumkey == NULL) - return GSS_S_FAILURE; - - if (cksumout->len < kctx->gk5e->cksumlength) { - dprintk("%s: checksum buffer length, %u, too small for %s\n", - __func__, cksumout->len, kctx->gk5e->name); - return 
GSS_S_FAILURE; - } - - rc4salt = kmalloc_array(4, sizeof(*rc4salt), GFP_NOFS); - if (!rc4salt) - return GSS_S_FAILURE; - - if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) { - dprintk("%s: invalid usage value %u\n", __func__, usage); - goto out_free_rc4salt; - } - - checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS); - if (!checksumdata) - goto out_free_rc4salt; - - md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(md5)) - goto out_free_cksum; - - hmac_md5 = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(hmac_md5)) - goto out_free_md5; - - req = ahash_request_alloc(md5, GFP_NOFS); - if (!req) - goto out_free_hmac_md5; - - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); - - err = crypto_ahash_init(req); - if (err) - goto out; - sg_init_one(sg, rc4salt, 4); - ahash_request_set_crypt(req, sg, NULL, 4); - err = crypto_ahash_update(req); - if (err) - goto out; - - sg_init_one(sg, header, hdrlen); - ahash_request_set_crypt(req, sg, NULL, hdrlen); - err = crypto_ahash_update(req); - if (err) - goto out; - err = xdr_process_buf(body, body_offset, body->len - body_offset, - checksummer, req); - if (err) - goto out; - ahash_request_set_crypt(req, NULL, checksumdata, 0); - err = crypto_ahash_final(req); - if (err) - goto out; - - ahash_request_free(req); - req = ahash_request_alloc(hmac_md5, GFP_NOFS); - if (!req) - goto out_free_hmac_md5; - - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); - - err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength); - if (err) - goto out; - - sg_init_one(sg, checksumdata, crypto_ahash_digestsize(md5)); - ahash_request_set_crypt(req, sg, checksumdata, - crypto_ahash_digestsize(md5)); - err = crypto_ahash_digest(req); - if (err) - goto out; - - memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); - cksumout->len = kctx->gk5e->cksumlength; -out: - ahash_request_free(req); -out_free_hmac_md5: - crypto_free_ahash(hmac_md5); -out_free_md5: - crypto_free_ahash(md5); -out_free_cksum: - kfree(checksumdata); -out_free_rc4salt: - kfree(rc4salt); - return err ? 
GSS_S_FAILURE : 0; -} - /* * checksum the plaintext data and hdrlen bytes of the token header * The checksum is performed over the first 8 bytes of the @@ -284,18 +155,13 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, u8 *checksumdata; unsigned int checksumlen; - if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR) - return make_checksum_hmac_md5(kctx, header, hdrlen, - body, body_offset, - cksumkey, usage, cksumout); - if (cksumout->len < kctx->gk5e->cksumlength) { dprintk("%s: checksum buffer length, %u, too small for %s\n", __func__, cksumout->len, kctx->gk5e->name); return GSS_S_FAILURE; } - checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS); + checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_KERNEL); if (checksumdata == NULL) return GSS_S_FAILURE; @@ -303,7 +169,7 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, if (IS_ERR(tfm)) goto out_free_cksum; - req = ahash_request_alloc(tfm, GFP_NOFS); + req = ahash_request_alloc(tfm, GFP_KERNEL); if (!req) goto out_free_ahash; @@ -391,7 +257,7 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, return GSS_S_FAILURE; } - checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS); + checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_KERNEL); if (!checksumdata) return GSS_S_FAILURE; @@ -399,7 +265,7 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, if (IS_ERR(tfm)) goto out_free_cksum; - req = ahash_request_alloc(tfm, GFP_NOFS); + req = ahash_request_alloc(tfm, GFP_KERNEL); if (!req) goto out_free_ahash; @@ -688,7 +554,7 @@ gss_krb5_cts_crypt(struct crypto_sync_skcipher *cipher, struct xdr_buf *buf, WARN_ON(0); return -ENOMEM; } - data = kmalloc(GSS_KRB5_MAX_BLOCKSIZE * 2, GFP_NOFS); + data = kmalloc(GSS_KRB5_MAX_BLOCKSIZE * 2, GFP_KERNEL); if (!data) return -ENOMEM; @@ -851,8 +717,8 @@ out_err: } u32 -gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, - u32 *headskip, u32 *tailskip) +gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, u32 len, + struct xdr_buf *buf, u32 *headskip, u32 *tailskip) { struct xdr_buf subbuf; u32 ret = 0; @@ -881,7 +747,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, /* create a segment skipping the header and leaving out the checksum */ xdr_buf_subsegment(buf, &subbuf, offset + GSS_KRB5_TOK_HDR_LEN, - (buf->len - offset - GSS_KRB5_TOK_HDR_LEN - + (len - offset - GSS_KRB5_TOK_HDR_LEN - kctx->gk5e->cksumlength)); nblocks = (subbuf.len + blocksize - 1) / blocksize; @@ -926,7 +792,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, goto out_err; /* Get the packet's hmac value */ - ret = read_bytes_from_xdr_buf(buf, buf->len - kctx->gk5e->cksumlength, + ret = read_bytes_from_xdr_buf(buf, len - kctx->gk5e->cksumlength, pkt_hmac, kctx->gk5e->cksumlength); if (ret) goto out_err; @@ -942,145 +808,3 @@ out_err: ret = GSS_S_FAILURE; return ret; } - -/* - * Compute Kseq given the initial session key and the checksum. - * Set the key of the given cipher. 
- */ -int -krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, - struct crypto_sync_skcipher *cipher, - unsigned char *cksum) -{ - struct crypto_shash *hmac; - struct shash_desc *desc; - u8 Kseq[GSS_KRB5_MAX_KEYLEN]; - u32 zeroconstant = 0; - int err; - - dprintk("%s: entered\n", __func__); - - hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0); - if (IS_ERR(hmac)) { - dprintk("%s: error %ld, allocating hash '%s'\n", - __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); - return PTR_ERR(hmac); - } - - desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), - GFP_NOFS); - if (!desc) { - dprintk("%s: failed to allocate shash descriptor for '%s'\n", - __func__, kctx->gk5e->cksum_name); - crypto_free_shash(hmac); - return -ENOMEM; - } - - desc->tfm = hmac; - - /* Compute intermediate Kseq from session key */ - err = crypto_shash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength); - if (err) - goto out_err; - - err = crypto_shash_digest(desc, (u8 *)&zeroconstant, 4, Kseq); - if (err) - goto out_err; - - /* Compute final Kseq from the checksum and intermediate Kseq */ - err = crypto_shash_setkey(hmac, Kseq, kctx->gk5e->keylength); - if (err) - goto out_err; - - err = crypto_shash_digest(desc, cksum, 8, Kseq); - if (err) - goto out_err; - - err = crypto_sync_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); - if (err) - goto out_err; - - err = 0; - -out_err: - kzfree(desc); - crypto_free_shash(hmac); - dprintk("%s: returning %d\n", __func__, err); - return err; -} - -/* - * Compute Kcrypt given the initial session key and the plaintext seqnum. - * Set the key of cipher kctx->enc. - */ -int -krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, - struct crypto_sync_skcipher *cipher, - s32 seqnum) -{ - struct crypto_shash *hmac; - struct shash_desc *desc; - u8 Kcrypt[GSS_KRB5_MAX_KEYLEN]; - u8 zeroconstant[4] = {0}; - u8 seqnumarray[4]; - int err, i; - - dprintk("%s: entered, seqnum %u\n", __func__, seqnum); - - hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0); - if (IS_ERR(hmac)) { - dprintk("%s: error %ld, allocating hash '%s'\n", - __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); - return PTR_ERR(hmac); - } - - desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), - GFP_NOFS); - if (!desc) { - dprintk("%s: failed to allocate shash descriptor for '%s'\n", - __func__, kctx->gk5e->cksum_name); - crypto_free_shash(hmac); - return -ENOMEM; - } - - desc->tfm = hmac; - - /* Compute intermediate Kcrypt from session key */ - for (i = 0; i < kctx->gk5e->keylength; i++) - Kcrypt[i] = kctx->Ksess[i] ^ 0xf0; - - err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); - if (err) - goto out_err; - - err = crypto_shash_digest(desc, zeroconstant, 4, Kcrypt); - if (err) - goto out_err; - - /* Compute final Kcrypt from the seqnum and intermediate Kcrypt */ - err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); - if (err) - goto out_err; - - seqnumarray[0] = (unsigned char) ((seqnum >> 24) & 0xff); - seqnumarray[1] = (unsigned char) ((seqnum >> 16) & 0xff); - seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff); - seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff); - - err = crypto_shash_digest(desc, seqnumarray, 4, Kcrypt); - if (err) - goto out_err; - - err = crypto_sync_skcipher_setkey(cipher, Kcrypt, - kctx->gk5e->keylength); - if (err) - goto out_err; - - err = 0; - -out_err: - kzfree(desc); - crypto_free_shash(hmac); - dprintk("%s: returning %d\n", __func__, err); - return err; -} diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index 
3b7f721c023b..726c076950c0 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -228,11 +228,11 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, ret = 0; err_free_raw: - kzfree(rawkey); + kfree_sensitive(rawkey); err_free_out: - kzfree(outblockdata); + kfree_sensitive(outblockdata); err_free_in: - kzfree(inblockdata); + kfree_sensitive(inblockdata); err_free_cipher: crypto_free_sync_skcipher(cipher); err_return: diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 75b3c2e9e8f8..1c092b05c2bb 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -21,6 +21,8 @@ #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/gss_krb5_enctypes.h> +#include "auth_gss_internal.h" + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif @@ -52,27 +54,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { }, #endif /* CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES */ /* - * RC4-HMAC - */ - { - .etype = ENCTYPE_ARCFOUR_HMAC, - .ctype = CKSUMTYPE_HMAC_MD5_ARCFOUR, - .name = "rc4-hmac", - .encrypt_name = "ecb(arc4)", - .cksum_name = "hmac(md5)", - .encrypt = krb5_encrypt, - .decrypt = krb5_decrypt, - .mk_key = NULL, - .signalg = SGN_ALG_HMAC_MD5, - .sealalg = SEAL_ALG_MICROSOFT_RC4, - .keybytes = 16, - .keylength = 16, - .blocksize = 1, - .conflen = 8, - .cksumlength = 8, - .keyed_cksum = 1, - }, - /* * 3DES */ { @@ -164,35 +145,6 @@ get_gss_krb5_enctype(int etype) return NULL; } -static const void * -simple_get_bytes(const void *p, const void *end, void *res, int len) -{ - const void *q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - memcpy(res, p, len); - return q; -} - -static const void * -simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) -{ - const void *q; - unsigned int len; - - p = simple_get_bytes(p, end, &len, sizeof(len)); - if (IS_ERR(p)) - return p; - q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - res->data = kmemdup(p, len, GFP_NOFS); - if (unlikely(res->data == NULL)) - return ERR_PTR(-ENOMEM); - res->len = len; - return q; -} - static inline const void * get_key(const void *p, const void *end, struct krb5_ctx *ctx, struct crypto_sync_skcipher **res) @@ -401,78 +353,6 @@ out_err: return -EINVAL; } -/* - * Note that RC4 depends on deriving keys using the sequence - * number or the checksum of a token. Therefore, the final keys - * cannot be calculated until the token is being constructed! 
- */ -static int -context_derive_keys_rc4(struct krb5_ctx *ctx) -{ - struct crypto_shash *hmac; - char sigkeyconstant[] = "signaturekey"; - int slen = strlen(sigkeyconstant) + 1; /* include null terminator */ - struct shash_desc *desc; - int err; - - dprintk("RPC: %s: entered\n", __func__); - /* - * derive cksum (aka Ksign) key - */ - hmac = crypto_alloc_shash(ctx->gk5e->cksum_name, 0, 0); - if (IS_ERR(hmac)) { - dprintk("%s: error %ld allocating hash '%s'\n", - __func__, PTR_ERR(hmac), ctx->gk5e->cksum_name); - err = PTR_ERR(hmac); - goto out_err; - } - - err = crypto_shash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength); - if (err) - goto out_err_free_hmac; - - - desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), GFP_NOFS); - if (!desc) { - dprintk("%s: failed to allocate hash descriptor for '%s'\n", - __func__, ctx->gk5e->cksum_name); - err = -ENOMEM; - goto out_err_free_hmac; - } - - desc->tfm = hmac; - - err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum); - kzfree(desc); - if (err) - goto out_err_free_hmac; - /* - * allocate hash, and skciphers for data and seqnum encryption - */ - ctx->enc = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0); - if (IS_ERR(ctx->enc)) { - err = PTR_ERR(ctx->enc); - goto out_err_free_hmac; - } - - ctx->seq = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0); - if (IS_ERR(ctx->seq)) { - crypto_free_sync_skcipher(ctx->enc); - err = PTR_ERR(ctx->seq); - goto out_err_free_hmac; - } - - dprintk("RPC: %s: returning success\n", __func__); - - err = 0; - -out_err_free_hmac: - crypto_free_shash(hmac); -out_err: - dprintk("RPC: %s: returning %d\n", __func__, err); - return err; -} - static int context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) { @@ -649,8 +529,6 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, switch (ctx->enctype) { case ENCTYPE_DES3_CBC_RAW: return context_derive_keys_des3(ctx, gfp_mask); - case ENCTYPE_ARCFOUR_HMAC: - return context_derive_keys_rc4(ctx); case ENCTYPE_AES128_CTS_HMAC_SHA1_96: case ENCTYPE_AES256_CTS_HMAC_SHA1_96: return context_derive_keys_new(ctx, gfp_mask); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index f1d280accf43..33061417ec97 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -214,7 +214,6 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, BUG(); case ENCTYPE_DES_CBC_RAW: case ENCTYPE_DES3_CBC_RAW: - case ENCTYPE_ARCFOUR_HMAC: return gss_get_mic_v1(ctx, text, token); case ENCTYPE_AES128_CTS_HMAC_SHA1_96: case ENCTYPE_AES256_CTS_HMAC_SHA1_96: diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c index 507105127095..3200b971a814 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c +++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c @@ -39,42 +39,6 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif -static s32 -krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, - unsigned char *cksum, unsigned char *buf) -{ - struct crypto_sync_skcipher *cipher; - unsigned char *plain; - s32 code; - - dprintk("RPC: %s:\n", __func__); - cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0); - if (IS_ERR(cipher)) - return PTR_ERR(cipher); - - plain = kmalloc(8, GFP_NOFS); - if (!plain) - return -ENOMEM; - - plain[0] = (unsigned char) ((seqnum >> 24) & 0xff); - plain[1] = (unsigned char) ((seqnum >> 16) & 0xff); - plain[2] = (unsigned char) ((seqnum >> 8) & 0xff); - plain[3] = (unsigned char) ((seqnum 
>> 0) & 0xff); - plain[4] = direction; - plain[5] = direction; - plain[6] = direction; - plain[7] = direction; - - code = krb5_rc4_setup_seq_key(kctx, cipher, cksum); - if (code) - goto out; - - code = krb5_encrypt(cipher, cksum, plain, buf, 8); -out: - kfree(plain); - crypto_free_sync_skcipher(cipher); - return code; -} s32 krb5_make_seq_num(struct krb5_ctx *kctx, struct crypto_sync_skcipher *key, @@ -85,11 +49,7 @@ krb5_make_seq_num(struct krb5_ctx *kctx, unsigned char *plain; s32 code; - if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) - return krb5_make_rc4_seq_num(kctx, direction, seqnum, - cksum, buf); - - plain = kmalloc(8, GFP_NOFS); + plain = kmalloc(8, GFP_KERNEL); if (!plain) return -ENOMEM; @@ -108,50 +68,6 @@ krb5_make_seq_num(struct krb5_ctx *kctx, return code; } -static s32 -krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, - unsigned char *buf, int *direction, s32 *seqnum) -{ - struct crypto_sync_skcipher *cipher; - unsigned char *plain; - s32 code; - - dprintk("RPC: %s:\n", __func__); - cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0); - if (IS_ERR(cipher)) - return PTR_ERR(cipher); - - code = krb5_rc4_setup_seq_key(kctx, cipher, cksum); - if (code) - goto out; - - plain = kmalloc(8, GFP_NOFS); - if (!plain) { - code = -ENOMEM; - goto out; - } - - code = krb5_decrypt(cipher, cksum, buf, plain, 8); - if (code) - goto out_plain; - - if ((plain[4] != plain[5]) || (plain[4] != plain[6]) - || (plain[4] != plain[7])) { - code = (s32)KG_BAD_SEQ; - goto out_plain; - } - - *direction = plain[4]; - - *seqnum = ((plain[0] << 24) | (plain[1] << 16) | - (plain[2] << 8) | (plain[3])); -out_plain: - kfree(plain); -out: - crypto_free_sync_skcipher(cipher); - return code; -} - s32 krb5_get_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, @@ -164,10 +80,7 @@ krb5_get_seq_num(struct krb5_ctx *kctx, dprintk("RPC: krb5_get_seq_num:\n"); - if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) - return krb5_get_rc4_seq_num(kctx, cksum, buf, - direction, seqnum); - plain = kmalloc(8, GFP_NOFS); + plain = kmalloc(8, GFP_KERNEL); if (!plain) return -ENOMEM; diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index aaab91cf24c8..ba04e3ec970a 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -218,7 +218,6 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, BUG(); case ENCTYPE_DES_CBC_RAW: case ENCTYPE_DES3_CBC_RAW: - case ENCTYPE_ARCFOUR_HMAC: return gss_verify_mic_v1(ctx, message_buffer, read_token); case ENCTYPE_AES128_CTS_HMAC_SHA1_96: case ENCTYPE_AES256_CTS_HMAC_SHA1_96: diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 6c1920eed771..48337687848c 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -130,14 +130,14 @@ gss_krb5_make_confounder(char *p, u32 conflen) /* initialize to random value */ if (i == 0) { - i = prandom_u32(); - i = (i << 32) | prandom_u32(); + i = get_random_u32(); + i = (i << 32) | get_random_u32(); } switch (conflen) { case 16: *q++ = i++; - /* fall through */ + fallthrough; case 8: *q++ = i++; break; @@ -236,32 +236,17 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))) return GSS_S_FAILURE; - if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { - struct crypto_sync_skcipher *cipher; - int err; - cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, - 0, 0); - if (IS_ERR(cipher)) - return GSS_S_FAILURE; - - 
krb5_rc4_setup_enc_key(kctx, cipher, seq_send); - - err = gss_encrypt_xdr_buf(cipher, buf, - offset + headlen - conflen, pages); - crypto_free_sync_skcipher(cipher); - if (err) - return GSS_S_FAILURE; - } else { - if (gss_encrypt_xdr_buf(kctx->enc, buf, - offset + headlen - conflen, pages)) - return GSS_S_FAILURE; - } + if (gss_encrypt_xdr_buf(kctx->enc, buf, + offset + headlen - conflen, pages)) + return GSS_S_FAILURE; return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; } static u32 -gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) +gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, int len, + struct xdr_buf *buf, unsigned int *slack, + unsigned int *align) { int signalg; int sealalg; @@ -279,12 +264,13 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) u32 conflen = kctx->gk5e->conflen; int crypt_offset; u8 *cksumkey; + unsigned int saved_len = buf->len; dprintk("RPC: gss_unwrap_kerberos\n"); ptr = (u8 *)buf->head[0].iov_base + offset; if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, - buf->len - offset)) + len - offset)) return GSS_S_DEFECTIVE_TOKEN; if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) || @@ -313,36 +299,9 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) - (unsigned char *)buf->head[0].iov_base; - /* - * Need plaintext seqnum to derive encryption key for arcfour-hmac - */ - if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN, - ptr + 8, &direction, &seqnum)) - return GSS_S_BAD_SIG; - - if ((kctx->initiate && direction != 0xff) || - (!kctx->initiate && direction != 0)) - return GSS_S_BAD_SIG; - - if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { - struct crypto_sync_skcipher *cipher; - int err; - - cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, - 0, 0); - if (IS_ERR(cipher)) - return GSS_S_FAILURE; - - krb5_rc4_setup_enc_key(kctx, cipher, seqnum); - - err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset); - crypto_free_sync_skcipher(cipher); - if (err) - return GSS_S_DEFECTIVE_TOKEN; - } else { - if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset)) - return GSS_S_DEFECTIVE_TOKEN; - } + buf->len = len; + if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset)) + return GSS_S_DEFECTIVE_TOKEN; if (kctx->gk5e->keyed_cksum) cksumkey = kctx->cksum; @@ -366,6 +325,14 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) /* do sequencing checks */ + if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN, + ptr + 8, &direction, &seqnum)) + return GSS_S_BAD_SIG; + + if ((kctx->initiate && direction != 0xff) || + (!kctx->initiate && direction != 0)) + return GSS_S_BAD_SIG; + /* Copy the data back to the right position. XXX: Would probably be * better to copy and encrypt at the same time. 
*/ @@ -376,11 +343,15 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; memmove(orig_start, data_start, data_len); buf->head[0].iov_len -= (data_start - orig_start); - buf->len -= (data_start - orig_start); + buf->len = len - (data_start - orig_start); if (gss_krb5_remove_padding(buf, blocksize)) return GSS_S_DEFECTIVE_TOKEN; + /* slack must include room for krb5 padding */ + *slack = XDR_QUADLEN(saved_len - buf->len); + /* The GSS blob always precedes the RPC message payload */ + *align = *slack; return GSS_S_COMPLETE; } @@ -438,7 +409,7 @@ static u32 gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, struct page **pages) { - u8 *ptr, *plainhdr; + u8 *ptr; time64_t now; u8 flags = 0x00; __be16 *be16ptr; @@ -455,7 +426,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, return GSS_S_FAILURE; /* construct gss token header */ - ptr = plainhdr = buf->head[0].iov_base + offset; + ptr = buf->head[0].iov_base + offset; *ptr++ = (unsigned char) ((KG2_TOK_WRAP>>8) & 0xff); *ptr++ = (unsigned char) (KG2_TOK_WRAP & 0xff); @@ -486,7 +457,9 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, } static u32 -gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) +gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len, + struct xdr_buf *buf, unsigned int *slack, + unsigned int *align) { time64_t now; u8 *ptr; @@ -532,7 +505,7 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) if (rrc != 0) rotate_left(offset + 16, buf, rrc); - err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf, + err = (*kctx->gk5e->decrypt_v2)(kctx, offset, len, buf, &headskip, &tailskip); if (err) return GSS_S_FAILURE; @@ -542,7 +515,7 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) * it against the original */ err = read_bytes_from_xdr_buf(buf, - buf->len - GSS_KRB5_TOK_HDR_LEN - tailskip, + len - GSS_KRB5_TOK_HDR_LEN - tailskip, decrypted_hdr, GSS_KRB5_TOK_HDR_LEN); if (err) { dprintk("%s: error %u getting decrypted_hdr\n", __func__, err); @@ -568,18 +541,19 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) * Note that buf->head[0].iov_len may indicate the available * head buffer space rather than that actually occupied. 
*/ - movelen = min_t(unsigned int, buf->head[0].iov_len, buf->len); + movelen = min_t(unsigned int, buf->head[0].iov_len, len); movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip; - if (offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen > - buf->head[0].iov_len) - return GSS_S_FAILURE; + BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen > + buf->head[0].iov_len); memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen); buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip; - buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip; + buf->len = len - (GSS_KRB5_TOK_HDR_LEN + headskip); /* Trim off the trailing "extra count" and checksum blob */ - buf->len -= ec + GSS_KRB5_TOK_HDR_LEN + tailskip; + xdr_buf_trim(buf, ec + GSS_KRB5_TOK_HDR_LEN + tailskip); + *align = XDR_QUADLEN(GSS_KRB5_TOK_HDR_LEN + headskip); + *slack = *align + XDR_QUADLEN(ec + GSS_KRB5_TOK_HDR_LEN + tailskip); return GSS_S_COMPLETE; } @@ -594,7 +568,6 @@ gss_wrap_kerberos(struct gss_ctx *gctx, int offset, BUG(); case ENCTYPE_DES_CBC_RAW: case ENCTYPE_DES3_CBC_RAW: - case ENCTYPE_ARCFOUR_HMAC: return gss_wrap_kerberos_v1(kctx, offset, buf, pages); case ENCTYPE_AES128_CTS_HMAC_SHA1_96: case ENCTYPE_AES256_CTS_HMAC_SHA1_96: @@ -603,7 +576,8 @@ gss_wrap_kerberos(struct gss_ctx *gctx, int offset, } u32 -gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf) +gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, + int len, struct xdr_buf *buf) { struct krb5_ctx *kctx = gctx->internal_ctx_id; @@ -612,10 +586,11 @@ gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf) BUG(); case ENCTYPE_DES_CBC_RAW: case ENCTYPE_DES3_CBC_RAW: - case ENCTYPE_ARCFOUR_HMAC: - return gss_unwrap_kerberos_v1(kctx, offset, buf); + return gss_unwrap_kerberos_v1(kctx, offset, len, buf, + &gctx->slack, &gctx->align); case ENCTYPE_AES128_CTS_HMAC_SHA1_96: case ENCTYPE_AES256_CTS_HMAC_SHA1_96: - return gss_unwrap_kerberos_v2(kctx, offset, buf); + return gss_unwrap_kerberos_v2(kctx, offset, len, buf, + &gctx->slack, &gctx->align); } } diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index db550bfc2642..fae632da1058 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -37,6 +37,8 @@ gss_mech_free(struct gss_api_mech *gm) for (i = 0; i < gm->gm_pf_num; i++) { pf = &gm->gm_pfs[i]; + if (pf->domain) + auth_domain_put(pf->domain); kfree(pf->auth_domain_name); pf->auth_domain_name = NULL; } @@ -59,6 +61,7 @@ make_auth_domain_name(char *name) static int gss_mech_svc_setup(struct gss_api_mech *gm) { + struct auth_domain *dom; struct pf_desc *pf; int i, status; @@ -68,10 +71,13 @@ gss_mech_svc_setup(struct gss_api_mech *gm) status = -ENOMEM; if (pf->auth_domain_name == NULL) goto out; - status = svcauth_gss_register_pseudoflavor(pf->pseudoflavor, - pf->auth_domain_name); - if (status) + dom = svcauth_gss_register_pseudoflavor( + pf->pseudoflavor, pf->auth_domain_name); + if (IS_ERR(dom)) { + status = PTR_ERR(dom); goto out; + } + pf->domain = dom; } return 0; out: @@ -411,10 +417,11 @@ gss_wrap(struct gss_ctx *ctx_id, u32 gss_unwrap(struct gss_ctx *ctx_id, int offset, + int len, struct xdr_buf *buf) { return ctx_id->mech_type->gm_ops - ->gss_unwrap(ctx_id, offset, buf); + ->gss_unwrap(ctx_id, offset, len, buf); } diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 0349f455a862..f549e4c05def 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -98,6 +98,7 
@@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) * done without the correct namespace: */ .flags = RPC_CLNT_CREATE_NOPING | + RPC_CLNT_CREATE_CONNECTED | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; @@ -160,7 +161,7 @@ static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn) mutex_lock(&sn->gssp_lock); clnt = sn->gssp_clnt; if (clnt) - atomic_inc(&clnt->cl_count); + refcount_inc(&clnt->cl_count); mutex_unlock(&sn->gssp_lock); return clnt; } @@ -200,7 +201,7 @@ static int gssp_call(struct net *net, struct rpc_message *msg) static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) { - int i; + unsigned int i; for (i = 0; i < arg->npages && arg->pages[i]; i++) __free_page(arg->pages[i]); @@ -210,20 +211,25 @@ static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) { + unsigned int i; + arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE); arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL); - /* - * XXX: actual pages are allocated by xdr layer in - * xdr_partial_copy_from_skb. - */ if (!arg->pages) return -ENOMEM; + for (i = 0; i < arg->npages; i++) { + arg->pages[i] = alloc_page(GFP_KERNEL); + if (!arg->pages[i]) { + gssp_free_receive_pages(arg); + return -ENOMEM; + } + } return 0; } static char *gssp_stringify(struct xdr_netobj *netobj) { - return kstrndup(netobj->data, netobj->len, GFP_KERNEL); + return kmemdup_nul(netobj->data, netobj->len, GFP_KERNEL); } static void gssp_hostbased_service(char **principal) diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 2ff7b7083eba..d79f12c2550a 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -771,7 +771,6 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req, xdr_inline_pages(&req->rq_rcv_buf, PAGE_SIZE/2 /* pretty arbitrary */, arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE); - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; done: if (err) dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err); @@ -789,7 +788,7 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, scratch = alloc_page(GFP_KERNEL); if (!scratch) return -ENOMEM; - xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(xdr, scratch); /* res->status */ err = gssx_dec_status(xdr, &res->status); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 65b67b257302..bcd74dddbe2d 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -55,10 +55,6 @@ #include "gss_rpc_upcall.h" -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - /* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests * into replies. 
* @@ -184,6 +180,11 @@ static struct cache_head *rsi_alloc(void) return NULL; } +static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + static void rsi_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -193,6 +194,8 @@ static void rsi_request(struct cache_detail *cd, qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len); qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len); (*bpp)[-1] = '\n'; + WARN_ONCE(*blen < 0, + "RPCSEC/GSS credential too large - please use gssproxy\n"); } static int rsi_parse(struct cache_detail *cd, @@ -282,6 +285,7 @@ static const struct cache_detail rsi_cache_template = { .hash_size = RSI_HASHMAX, .name = "auth.rpcsec.init", .cache_put = rsi_put, + .cache_upcall = rsi_upcall, .cache_request = rsi_request, .cache_parse = rsi_parse, .match = rsi_match, @@ -330,7 +334,7 @@ static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct r struct gss_svc_seq_data { /* highest seq number seen so far: */ - int sd_max; + u32 sd_max; /* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of * sd_win is nonzero iff sequence number i has been seen already: */ unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG]; @@ -428,6 +432,11 @@ rsc_alloc(void) return NULL; } +static int rsc_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return -EINVAL; +} + static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) { @@ -554,6 +563,7 @@ static const struct cache_detail rsc_cache_template = { .hash_size = RSC_HASHMAX, .name = "auth.rpcsec.context", .cache_put = rsc_put, + .cache_upcall = rsc_upcall, .cache_parse = rsc_parse, .match = rsc_match, .init = rsc_init, @@ -605,16 +615,29 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle) return found; } -/* Implements sequence number algorithm as specified in RFC 2203. */ -static int -gss_check_seq_num(struct rsc *rsci, int seq_num) +/** + * gss_check_seq_num - GSS sequence number window check + * @rqstp: RPC Call to use when reporting errors + * @rsci: cached GSS context state (updated on return) + * @seq_num: sequence number to check + * + * Implements sequence number algorithm as specified in + * RFC 2203, Section 5.3.3.1. "Context Management". 
+ * + * Return values: + * %true: @rqstp's GSS sequence number is inside the window + * %false: @rqstp's GSS sequence number is outside the window + */ +static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci, + u32 seq_num) { struct gss_svc_seq_data *sd = &rsci->seqdata; + bool result = false; spin_lock(&sd->sd_lock); if (seq_num > sd->sd_max) { if (seq_num >= sd->sd_max + GSS_SEQ_WIN) { - memset(sd->sd_win,0,sizeof(sd->sd_win)); + memset(sd->sd_win, 0, sizeof(sd->sd_win)); sd->sd_max = seq_num; } else while (sd->sd_max < seq_num) { sd->sd_max++; @@ -622,18 +645,26 @@ gss_check_seq_num(struct rsc *rsci, int seq_num) } __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); goto ok; - } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) { - goto drop; + } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) { + goto toolow; } - /* sd_max - GSS_SEQ_WIN < seq_num <= sd_max */ if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) - goto drop; + goto alreadyseen; + ok: + result = true; +out: spin_unlock(&sd->sd_lock); - return 1; -drop: - spin_unlock(&sd->sd_lock); - return 0; + return result; + +toolow: + trace_rpcgss_svc_seqno_low(rqstp, seq_num, + sd->sd_max - GSS_SEQ_WIN, + sd->sd_max); + goto out; +alreadyseen: + trace_rpcgss_svc_seqno_seen(rqstp, seq_num); + goto out; } static inline u32 round_up_to_quad(u32 i) @@ -678,11 +709,11 @@ svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o) /* * Verify the checksum on the header and return SVC_OK on success. * Otherwise, return SVC_DROP (in the case of a bad sequence number) - * or return SVC_DENIED and indicate error in authp. + * or return SVC_DENIED and indicate error in rqstp->rq_auth_stat. */ static int gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, - __be32 *rpcstart, struct rpc_gss_wire_cred *gc, __be32 *authp) + __be32 *rpcstart, struct rpc_gss_wire_cred *gc) { struct gss_ctx *ctx_id = rsci->mechctx; struct xdr_buf rpchdr; @@ -696,7 +727,7 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart; xdr_buf_from_iov(&iov, &rpchdr); - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; if (argv->iov_len < 4) return SVC_DENIED; flavor = svc_getnl(argv); @@ -708,21 +739,17 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, if (rqstp->rq_deferred) /* skip verification of revisited request */ return SVC_OK; if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) { - *authp = rpcsec_gsserr_credproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; return SVC_DENIED; } if (gc->gc_seq > MAXSEQ) { - dprintk("RPC: svcauth_gss: discarding request with " - "large sequence number %d\n", gc->gc_seq); - *authp = rpcsec_gsserr_ctxproblem; + trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq); + rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; return SVC_DENIED; } - if (!gss_check_seq_num(rsci, gc->gc_seq)) { - dprintk("RPC: svcauth_gss: discarding request with " - "old sequence number %d\n", gc->gc_seq); + if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq)) return SVC_DROP; - } return SVC_OK; } @@ -754,7 +781,7 @@ gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) svc_putnl(rqstp->rq_res.head, RPC_AUTH_GSS); xdr_seq = kmalloc(4, GFP_KERNEL); if (!xdr_seq) - return -1; + return -ENOMEM; *xdr_seq = htonl(seq); iov.iov_base = xdr_seq; @@ -803,7 +830,7 @@ u32 svcauth_gss_flavor(struct auth_domain *dom) EXPORT_SYMBOL_GPL(svcauth_gss_flavor); -int +struct auth_domain * svcauth_gss_register_pseudoflavor(u32 
pseudoflavor, char * name) { struct gss_domain *new; @@ -820,21 +847,23 @@ svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) new->h.flavour = &svcauthops_gss; new->pseudoflavor = pseudoflavor; - stat = 0; test = auth_domain_lookup(name, &new->h); - if (test != &new->h) { /* Duplicate registration */ + if (test != &new->h) { + pr_warn("svc: duplicate registration of gss pseudo flavour %s.\n", + name); + stat = -EADDRINUSE; auth_domain_put(test); - kfree(new->h.name); - goto out_free_dom; + goto out_free_name; } - return 0; + return test; +out_free_name: + kfree(new->h.name); out_free_dom: kfree(new); out: - return stat; + return ERR_PTR(stat); } - EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor); static inline int @@ -858,18 +887,20 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) static int unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) { + u32 integ_len, rseqno, maj_stat; int stat = -EINVAL; - u32 integ_len, maj_stat; struct xdr_netobj mic; struct xdr_buf integ_buf; + mic.data = NULL; + /* NFS READ normally uses splice to send data in-place. However * the data in cache can change after the reply's MIC is computed * but before the RPC reply is sent. To prevent the client from * rejecting the server-computed MIC in this somewhat rare case, * do not use splice with the GSS integrity service. */ - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) @@ -877,34 +908,44 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g integ_len = svc_getnl(&buf->head[0]); if (integ_len & 3) - return stat; + goto unwrap_failed; if (integ_len > buf->len) - return stat; - if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) { - WARN_ON_ONCE(1); - return stat; - } + goto unwrap_failed; + if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) + goto unwrap_failed; + /* copy out mic... 
*/ if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) - return stat; + goto unwrap_failed; if (mic.len > RPC_MAX_AUTH_SIZE) - return stat; + goto unwrap_failed; mic.data = kmalloc(mic.len, GFP_KERNEL); if (!mic.data) - return stat; + goto unwrap_failed; if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) - goto out; + goto unwrap_failed; maj_stat = gss_verify_mic(ctx, &integ_buf, &mic); if (maj_stat != GSS_S_COMPLETE) - goto out; - if (svc_getnl(&buf->head[0]) != seq) - goto out; + goto bad_mic; + rseqno = svc_getnl(&buf->head[0]); + if (rseqno != seq) + goto bad_seqno; /* trim off the mic and padding at the end before returning */ - buf->len -= 4 + round_up_to_quad(mic.len); + xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4); stat = 0; out: kfree(mic.data); return stat; + +unwrap_failed: + trace_rpcgss_svc_unwrap_failed(rqstp); + goto out; +bad_seqno: + trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno); + goto out; +bad_mic: + trace_rpcgss_svc_mic(rqstp, maj_stat); + goto out; } static inline int @@ -928,9 +969,10 @@ static int unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) { u32 priv_len, maj_stat; - int pad, saved_len, remaining_len, offset; + int pad, remaining_len, offset; + u32 rseqno; - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); priv_len = svc_getnl(&buf->head[0]); if (rqstp->rq_deferred) { @@ -943,36 +985,42 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs * not yet read from the head, so these two values are different: */ remaining_len = total_buf_len(buf); if (priv_len > remaining_len) - return -EINVAL; + goto unwrap_failed; pad = remaining_len - priv_len; buf->len -= pad; fix_priv_head(buf, pad); - /* Maybe it would be better to give gss_unwrap a length parameter: */ - saved_len = buf->len; - buf->len = priv_len; - maj_stat = gss_unwrap(ctx, 0, buf); + maj_stat = gss_unwrap(ctx, 0, priv_len, buf); pad = priv_len - buf->len; - buf->len = saved_len; - buf->len -= pad; /* The upper layers assume the buffer is aligned on 4-byte boundaries. * In the krb5p case, at least, the data ends up offset, so we need to * move it around. */ /* XXX: This is very inefficient. It would be better to either do * this while we encrypt, or maybe in the receive code, if we can peak * ahead and work out the service and mechanism there. 
*/ - offset = buf->head[0].iov_len % 4; + offset = xdr_pad_size(buf->head[0].iov_len); if (offset) { buf->buflen = RPCSVC_MAXPAYLOAD; xdr_shift_buf(buf, offset); fix_priv_head(buf, pad); } if (maj_stat != GSS_S_COMPLETE) - return -EINVAL; + goto bad_unwrap; out_seq: - if (svc_getnl(&buf->head[0]) != seq) - return -EINVAL; + rseqno = svc_getnl(&buf->head[0]); + if (rseqno != seq) + goto bad_seqno; return 0; + +unwrap_failed: + trace_rpcgss_svc_unwrap_failed(rqstp); + return -EINVAL; +bad_seqno: + trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno); + return -EINVAL; +bad_unwrap: + trace_rpcgss_svc_unwrap(rqstp, maj_stat); + return -EINVAL; } struct gss_svc_data { @@ -992,6 +1040,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp) struct rpc_gss_wire_cred *gc = &svcdata->clcred; int stat; + rqstp->rq_auth_stat = rpc_autherr_badcred; + /* * A gss export can be specified either by: * export *(sec=krb5,rw) @@ -1007,6 +1057,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp) stat = svcauth_unix_set_client(rqstp); if (stat == SVC_DROP || stat == SVC_CLOSE) return stat; + + rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } @@ -1096,16 +1148,16 @@ static void gss_free_in_token_pages(struct gssp_in_token *in_token) } static int gss_read_proxy_verf(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp, + struct rpc_gss_wire_cred *gc, struct xdr_netobj *in_handle, struct gssp_in_token *in_token) { struct kvec *argv = &rqstp->rq_arg.head[0]; - unsigned int page_base, length; - int pages, i, res; - size_t inlen; + unsigned int length, pgto_offs, pgfrom_offs; + int pages, i, res, pgto, pgfrom; + size_t inlen, to_offs, from_offs; - res = gss_read_common_verf(gc, argv, authp, in_handle); + res = gss_read_common_verf(gc, argv, &rqstp->rq_auth_stat, in_handle); if (res) return res; @@ -1131,17 +1183,24 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, memcpy(page_address(in_token->pages[0]), argv->iov_base, length); inlen -= length; - i = 1; - page_base = rqstp->rq_arg.page_base; + to_offs = length; + from_offs = rqstp->rq_arg.page_base; while (inlen) { - length = min_t(unsigned int, inlen, PAGE_SIZE); - memcpy(page_address(in_token->pages[i]), - page_address(rqstp->rq_arg.pages[i]) + page_base, + pgto = to_offs >> PAGE_SHIFT; + pgfrom = from_offs >> PAGE_SHIFT; + pgto_offs = to_offs & ~PAGE_MASK; + pgfrom_offs = from_offs & ~PAGE_MASK; + + length = min_t(unsigned int, inlen, + min_t(unsigned int, PAGE_SIZE - pgto_offs, + PAGE_SIZE - pgfrom_offs)); + memcpy(page_address(in_token->pages[pgto]) + pgto_offs, + page_address(rqstp->rq_arg.pages[pgfrom]) + pgfrom_offs, length); + to_offs += length; + from_offs += length; inlen -= length; - page_base = 0; - i++; } return 0; } @@ -1174,7 +1233,7 @@ gss_write_resv(struct kvec *resv, size_t size_limit, * Otherwise, drop the request pending an answer to the upcall. 
*/ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp) + struct rpc_gss_wire_cred *gc) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -1183,7 +1242,7 @@ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); memset(&rsikey, 0, sizeof(rsikey)); - ret = gss_read_verf(gc, argv, authp, + ret = gss_read_verf(gc, argv, &rqstp->rq_auth_stat, &rsikey.in_handle, &rsikey.in_token); if (ret) return ret; @@ -1222,7 +1281,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd, long long ctxh; struct gss_api_mech *gm = NULL; time64_t expiry; - int status = -EINVAL; + int status; memset(&rsci, 0, sizeof(rsci)); /* context handle */ @@ -1245,7 +1304,6 @@ static int gss_proxy_save_rsc(struct cache_detail *cd, if (!ud->found_creds) { /* userspace seem buggy, we should always get at least a * mapping to nobody */ - dprintk("RPC: No creds found!\n"); goto out; } else { struct timespec64 boot; @@ -1287,7 +1345,7 @@ out: } static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp) + struct rpc_gss_wire_cred *gc) { struct kvec *resv = &rqstp->rq_res.head[0]; struct xdr_netobj cli_handle; @@ -1299,8 +1357,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); memset(&ud, 0, sizeof(ud)); - ret = gss_read_proxy_verf(rqstp, gc, authp, - &ud.in_handle, &ud.in_token); + ret = gss_read_proxy_verf(rqstp, gc, &ud.in_handle, &ud.in_token); if (ret) return ret; @@ -1311,8 +1368,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, if (status) goto out; - trace_rpcgss_accept_upcall(rqstp->rq_xid, ud.major_status, - ud.minor_status); + trace_rpcgss_svc_accept_upcall(rqstp, ud.major_status, ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: @@ -1320,31 +1376,23 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, break; case GSS_S_COMPLETE: status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle); - if (status) { - pr_info("%s: gss_proxy_save_rsc failed (%d)\n", - __func__, status); + if (status) goto out; - } cli_handle.data = (u8 *)&handle; cli_handle.len = sizeof(handle); break; default: - ret = SVC_CLOSE; goto out; } /* Got an answer to the upcall; use it: */ if (gss_write_init_verf(sn->rsc_cache, rqstp, - &cli_handle, &ud.major_status)) { - pr_info("%s: gss_write_init_verf failed\n", __func__); + &cli_handle, &ud.major_status)) goto out; - } if (gss_write_resv(resv, PAGE_SIZE, &cli_handle, &ud.out_token, - ud.major_status, ud.minor_status)) { - pr_info("%s: gss_write_resv failed\n", __func__); + ud.major_status, ud.minor_status)) goto out; - } ret = SVC_COMPLETE; out: @@ -1385,7 +1433,7 @@ static bool use_gss_proxy(struct net *net) static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct net *net = PDE_DATA(file_inode(file)); + struct net *net = pde_data(file_inode(file)); char tbuf[20]; unsigned long i; int res; @@ -1413,7 +1461,7 @@ static ssize_t write_gssp(struct file *file, const char __user *buf, static ssize_t read_gssp(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct net *net = PDE_DATA(file_inode(file)); + struct net *net = pde_data(file_inode(file)); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); unsigned long p = *ppos; char tbuf[10]; @@ -1482,7 +1530,7 @@ static void 
destroy_use_gss_proxy_proc_entry(struct net *net) {} * response here and return SVC_COMPLETE. */ static int -svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_gss_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -1495,10 +1543,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); - dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n", - argv->iov_len); - - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); if (!svcdata) @@ -1535,22 +1580,22 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0)) goto auth_err; - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: if (use_gss_proxy(SVC_NET(rqstp))) - return svcauth_gss_proxy_init(rqstp, gc, authp); + return svcauth_gss_proxy_init(rqstp, gc); else - return svcauth_gss_legacy_init(rqstp, gc, authp); + return svcauth_gss_legacy_init(rqstp, gc); case RPC_GSS_PROC_DATA: case RPC_GSS_PROC_DESTROY: /* Look up the context, and check the verifier: */ - *authp = rpcsec_gsserr_credproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx); if (!rsci) goto auth_err; - switch (gss_verify_header(rqstp, rsci, rpcstart, gc, authp)) { + switch (gss_verify_header(rqstp, rsci, rpcstart, gc)) { case SVC_OK: break; case SVC_DENIED: @@ -1560,7 +1605,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) } break; default: - *authp = rpc_autherr_rejectedcred; + rqstp->rq_auth_stat = rpc_autherr_rejectedcred; goto auth_err; } @@ -1576,13 +1621,13 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) svc_putnl(resv, RPC_SUCCESS); goto complete; case RPC_GSS_PROC_DATA: - *authp = rpcsec_gsserr_ctxproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; svcdata->verf_start = resv->iov_base + resv->iov_len; if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; rqstp->rq_cred = rsci->cred; get_group_info(rsci->cred.cr_group_info); - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; @@ -1614,6 +1659,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) GSS_C_QOP_DEFAULT, gc->gc_svc); ret = SVC_OK; + trace_rpcgss_svc_authenticate(rqstp, gc); goto out; } garbage_args: @@ -1680,7 +1726,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) goto out; integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; integ_len = resbuf->len - integ_offset; - BUG_ON(integ_len % 4); + if (integ_len & 3) + goto out; *p++ = htonl(integ_len); *p++ = htonl(gc->gc_seq); if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) { @@ -1704,7 +1751,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) resv->iov_len += XDR_QUADLEN(mic.len) << 2; /* not strictly required: */ resbuf->len += XDR_QUADLEN(mic.len) << 2; - BUG_ON(resv->iov_len > PAGE_SIZE); + if (resv->iov_len > PAGE_SIZE) + goto out_err; out: stat = 0; out_err: @@ -1740,9 +1788,11 @@ svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) * both the head and tail. 
*/ if (resbuf->tail[0].iov_base) { - BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base - + PAGE_SIZE); - BUG_ON(resbuf->tail[0].iov_base < resbuf->head[0].iov_base); + if (resbuf->tail[0].iov_base >= + resbuf->head[0].iov_base + PAGE_SIZE) + return -EINVAL; + if (resbuf->tail[0].iov_base < resbuf->head[0].iov_base) + return -EINVAL; if (resbuf->tail[0].iov_len + resbuf->head[0].iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) return -ENOMEM; @@ -1780,11 +1830,14 @@ static int svcauth_gss_release(struct svc_rqst *rqstp) { struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; - struct rpc_gss_wire_cred *gc = &gsd->clcred; + struct rpc_gss_wire_cred *gc; struct xdr_buf *resbuf = &rqstp->rq_res; int stat = -EINVAL; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); + if (!gsd) + goto out; + gc = &gsd->clcred; if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; /* Release can be called twice, but we only wrap once. */ @@ -1825,10 +1878,10 @@ out_err: if (rqstp->rq_cred.cr_group_info) put_group_info(rqstp->rq_cred.cr_group_info); rqstp->rq_cred.cr_group_info = NULL; - if (gsd->rsci) + if (gsd && gsd->rsci) { cache_put(&gsd->rsci->h, sn->rsc_cache); - gsd->rsci = NULL; - + gsd->rsci = NULL; + } return stat; } @@ -1932,7 +1985,7 @@ gss_svc_init_net(struct net *net) goto out2; return 0; out2: - destroy_use_gss_proxy_proc_entry(net); + rsi_cache_destroy_net(net); out1: rsc_cache_destroy_net(net); return rv; diff --git a/net/sunrpc/auth_gss/trace.c b/net/sunrpc/auth_gss/trace.c index 5576f1e66de9..76685abba60f 100644 --- a/net/sunrpc/auth_gss/trace.c +++ b/net/sunrpc/auth_gss/trace.c @@ -5,6 +5,9 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> +#include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svc_xprt.h> +#include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/gss_err.h> #define CREATE_TRACE_POINTS diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index e7df1f782b2e..1e091d3fa607 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -40,11 +40,19 @@ unx_destroy(struct rpc_auth *auth) /* * Lookup AUTH_UNIX creds for current process */ -static struct rpc_cred * -unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +static struct rpc_cred *unx_lookup_cred(struct rpc_auth *auth, + struct auth_cred *acred, int flags) { - struct rpc_cred *ret = mempool_alloc(unix_pool, GFP_NOFS); - + struct rpc_cred *ret; + + ret = kmalloc(sizeof(*ret), rpc_task_gfp_mask()); + if (!ret) { + if (!(flags & RPCAUTH_LOOKUP_ASYNC)) + return ERR_PTR(-ENOMEM); + ret = mempool_alloc(unix_pool, GFP_NOWAIT); + if (!ret) + return ERR_PTR(-ENOMEM); + } rpcauth_init_cred(ret, acred, auth, &unix_credops); ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; return ret; diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 195b40c5dae4..65a6c6429a53 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -1,23 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** (c) 2007 Network Appliance, Inc. All Rights Reserved. (c) 2009 NetApp. All Rights Reserved. -NetApp provides this source code under the GPL v2 License. -The GPL v2 license is available at -http://opensource.org/licenses/gpl-license.php. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ @@ -64,6 +50,17 @@ static void xprt_free_allocation(struct rpc_rqst *req) kfree(req); } +static void xprt_bc_reinit_xdr_buf(struct xdr_buf *buf) +{ + buf->head[0].iov_len = PAGE_SIZE; + buf->tail[0].iov_len = 0; + buf->pages = NULL; + buf->page_len = 0; + buf->flags = 0; + buf->len = 0; + buf->buflen = PAGE_SIZE; +} + static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags) { struct page *page; @@ -75,9 +72,9 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags) return 0; } -static -struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags) +static struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt) { + gfp_t gfp_flags = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; struct rpc_rqst *req; /* Pre-allocate one backchannel rpc_rqst */ @@ -111,7 +108,7 @@ out_free: * by the backchannel. This function can be called multiple times * when creating new sessions that use the same rpc_xprt. The * preallocated buffers are added to the pool of resources used by - * the rpc_xprt. Anyone of these resources may be used used by an + * the rpc_xprt. Any one of these resources may be used by an * incoming callback request. It's up to the higher levels in the * stack to enforce that the maximum number of session slots is not * being exceeded. 
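[Editor's note] The backchannel hunks above and below share one theme: a preallocated rpc_rqst is recycled rather than freed, and before it goes back on xprt->bc_pa_list both of its xdr_bufs are reset to their freshly-allocated single-page state. Below is a minimal userspace sketch of that reset-before-requeue pattern. It is an illustration only: the struct is trimmed to the fields the kernel helper touches, PAGE_SIZE and struct kvec are stand-ins for the kernel definitions, and the list handling and locking are omitted.

    #include <stddef.h>

    #define PAGE_SIZE 4096

    /* Stand-in for the kernel's struct kvec. */
    struct kvec { void *iov_base; size_t iov_len; };

    /* Trimmed-down xdr_buf: one preallocated page backs head[0]. */
    struct xdr_buf {
            struct kvec head[1];
            struct kvec tail[1];
            void **pages;
            unsigned int page_len;
            unsigned int flags;
            unsigned int len;
            unsigned int buflen;
    };

    /* Mirrors xprt_bc_reinit_xdr_buf() from the hunk above: forget
     * everything a previous call left behind, but keep the
     * preallocated page attached to head[0]. */
    static void bc_reinit_xdr_buf(struct xdr_buf *buf)
    {
            buf->head[0].iov_len = PAGE_SIZE;
            buf->tail[0].iov_len = 0;
            buf->pages = NULL;
            buf->page_len = 0;
            buf->flags = 0;
            buf->len = 0;
            buf->buflen = PAGE_SIZE;
    }

    /* On requeue, both buffers are reset and the receive buffer is
     * marked as having a full page available, as xprt_free_bc_rqst()
     * does before re-adding the request to bc_pa_list. */
    static void bc_recycle(struct xdr_buf *snd, struct xdr_buf *rcv)
    {
            bc_reinit_xdr_buf(snd);
            bc_reinit_xdr_buf(rcv);
            rcv->len = PAGE_SIZE;
    }

Without this reset, a stale page_len, flags, or tail length left over from the previous callback would leak into the next request that reuses the slot, which is precisely what the added xprt_bc_reinit_xdr_buf() calls in xprt_free_bc_rqst() prevent.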
@@ -154,7 +151,7 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) INIT_LIST_HEAD(&tmp_list); for (i = 0; i < min_reqs; i++) { /* Pre-allocate one backchannel rpc_rqst */ - req = xprt_alloc_bc_req(xprt, GFP_KERNEL); + req = xprt_alloc_bc_req(xprt); if (req == NULL) { printk(KERN_ERR "Failed to create bc rpc_rqst\n"); goto out_free; @@ -292,6 +289,9 @@ void xprt_free_bc_rqst(struct rpc_rqst *req) */ spin_lock_bh(&xprt->bc_pa_lock); if (xprt_need_to_requeue(xprt)) { + xprt_bc_reinit_xdr_buf(&req->rq_snd_buf); + xprt_bc_reinit_xdr_buf(&req->rq_rcv_buf); + req->rq_rcv_buf.len = PAGE_SIZE; list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); xprt->bc_alloc_count++; atomic_inc(&xprt->bc_slot_count); @@ -343,7 +343,7 @@ found: break; } else if (req) break; - new = xprt_alloc_bc_req(xprt, GFP_KERNEL); + new = xprt_alloc_bc_req(xprt); } while (new); return req; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index bd843a81afa0..f075a9fb5ccc 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -32,13 +32,15 @@ #include <linux/sunrpc/cache.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include <trace/events/sunrpc.h> + #include "netns.h" +#include "fail.h" #define RPCDBG_FACILITY RPCDBG_CACHE static bool cache_defer_req(struct cache_req *req, struct cache_head *item); static void cache_revisit_request(struct cache_head *item); -static bool cache_listeners_exist(struct cache_detail *detail); static void cache_init(struct cache_head *h, struct cache_detail *detail) { @@ -65,13 +67,14 @@ static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail, rcu_read_lock(); hlist_for_each_entry_rcu(tmp, head, cache_list) { - if (detail->match(tmp, key)) { - if (cache_is_expired(detail, tmp)) - continue; - tmp = cache_get_rcu(tmp); - rcu_read_unlock(); - return tmp; - } + if (!detail->match(tmp, key)) + continue; + if (test_bit(CACHE_VALID, &tmp->flags) && + cache_is_expired(detail, tmp)) + continue; + tmp = cache_get_rcu(tmp); + rcu_read_unlock(); + return tmp; } rcu_read_unlock(); return NULL; @@ -113,18 +116,21 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, spin_lock(&detail->hash_lock); /* check if entry appeared while we slept */ - hlist_for_each_entry_rcu(tmp, head, cache_list) { - if (detail->match(tmp, key)) { - if (cache_is_expired(detail, tmp)) { - sunrpc_begin_cache_remove_entry(tmp, detail); - freeme = tmp; - break; - } - cache_get(tmp); - spin_unlock(&detail->hash_lock); - cache_put(new, detail); - return tmp; + hlist_for_each_entry_rcu(tmp, head, cache_list, + lockdep_is_held(&detail->hash_lock)) { + if (!detail->match(tmp, key)) + continue; + if (test_bit(CACHE_VALID, &tmp->flags) && + cache_is_expired(detail, tmp)) { + sunrpc_begin_cache_remove_entry(tmp, detail); + trace_cache_entry_expired(detail, tmp); + freeme = tmp; + break; } + cache_get(tmp); + spin_unlock(&detail->hash_lock); + cache_put(new, detail); + return tmp; } hlist_add_head_rcu(&new->cache_list, head); @@ -174,6 +180,25 @@ static void cache_fresh_unlocked(struct cache_head *head, } } +static void cache_make_negative(struct cache_detail *detail, + struct cache_head *h) +{ + set_bit(CACHE_NEGATIVE, &h->flags); + trace_cache_entry_make_negative(detail, h); +} + +static void cache_entry_update(struct cache_detail *detail, + struct cache_head *h, + struct cache_head *new) +{ + if (!test_bit(CACHE_NEGATIVE, &new->flags)) { + detail->update(h, new); + trace_cache_entry_update(detail, h); + } else { + cache_make_negative(detail, 
h); + } +} + struct cache_head *sunrpc_cache_update(struct cache_detail *detail, struct cache_head *new, struct cache_head *old, int hash) { @@ -186,10 +211,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, if (!test_bit(CACHE_VALID, &old->flags)) { spin_lock(&detail->hash_lock); if (!test_bit(CACHE_VALID, &old->flags)) { - if (test_bit(CACHE_NEGATIVE, &new->flags)) - set_bit(CACHE_NEGATIVE, &old->flags); - else - detail->update(old, new); + cache_entry_update(detail, old, new); cache_fresh_locked(old, new->expiry_time, detail); spin_unlock(&detail->hash_lock); cache_fresh_unlocked(old, detail); @@ -207,10 +229,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, detail->init(tmp, old); spin_lock(&detail->hash_lock); - if (test_bit(CACHE_NEGATIVE, &new->flags)) - set_bit(CACHE_NEGATIVE, &tmp->flags); - else - detail->update(tmp, new); + cache_entry_update(detail, tmp, new); hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); detail->entries++; cache_get(tmp); @@ -224,13 +243,6 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, } EXPORT_SYMBOL_GPL(sunrpc_cache_update); -static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h) -{ - if (cd->cache_upcall) - return cd->cache_upcall(cd, h); - return sunrpc_cache_pipe_upcall(cd, h); -} - static inline int cache_is_valid(struct cache_head *h) { if (!test_bit(CACHE_VALID, &h->flags)) @@ -259,7 +271,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h spin_lock(&detail->hash_lock); rv = cache_is_valid(h); if (rv == -EAGAIN) { - set_bit(CACHE_NEGATIVE, &h->flags); + cache_make_negative(detail, h); cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY, detail); rv = -ENOENT; @@ -303,17 +315,14 @@ int cache_check(struct cache_detail *detail, (h->expiry_time != 0 && age > refresh_age/2)) { dprintk("RPC: Want update, refage=%lld, age=%lld\n", refresh_age, age); - if (!test_and_set_bit(CACHE_PENDING, &h->flags)) { - switch (cache_make_upcall(detail, h)) { - case -EINVAL: - rv = try_to_negate_entry(detail, h); - break; - case -EAGAIN: - cache_fresh_unlocked(h, detail); - break; - } - } else if (!cache_listeners_exist(detail)) + switch (detail->cache_upcall(detail, h)) { + case -EINVAL: rv = try_to_negate_entry(detail, h); + break; + case -EAGAIN: + cache_fresh_unlocked(h, detail); + break; + } } if (rv == -EAGAIN) { @@ -468,6 +477,7 @@ static int cache_clean(void) continue; sunrpc_begin_cache_remove_entry(ch, current_detail); + trace_cache_entry_expired(current_detail, ch); rv = 1; break; } @@ -490,16 +500,17 @@ static int cache_clean(void) */ static void do_cache_clean(struct work_struct *work) { - int delay = 5; - if (cache_clean() == -1) - delay = round_jiffies_relative(30*HZ); + int delay; if (list_empty(&cache_list)) - delay = 0; + return; - if (delay) - queue_delayed_work(system_power_efficient_wq, - &cache_cleaner, delay); + if (cache_clean() == -1) + delay = round_jiffies_relative(30*HZ); + else + delay = 5; + + queue_delayed_work(system_power_efficient_wq, &cache_cleaner, delay); } @@ -521,7 +532,6 @@ void cache_purge(struct cache_detail *detail) { struct cache_head *ch = NULL; struct hlist_head *head = NULL; - struct hlist_node *tmp = NULL; int i = 0; spin_lock(&detail->hash_lock); @@ -533,7 +543,9 @@ void cache_purge(struct cache_detail *detail) dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name); for (i = 0; i < detail->hash_size; i++) { head = &detail->hash_table[i]; - 
hlist_for_each_entry_safe(ch, tmp, head, cache_list) { + while (!hlist_empty(head)) { + ch = hlist_entry(head->first, struct cache_head, + cache_list); sunrpc_begin_cache_remove_entry(ch, detail); spin_unlock(&detail->hash_lock); sunrpc_end_cache_remove_entry(ch, detail); @@ -665,7 +677,7 @@ static void cache_limit_defers(void) /* Consider removing either the first or the last */ if (cache_defer_cnt > DFR_MAX) { - if (prandom_u32() & 1) + if (prandom_u32_max(2)) discard = list_entry(cache_defer_list.next, struct cache_deferred_req, recent); else @@ -678,16 +690,30 @@ static void cache_limit_defers(void) discard->revisit(discard, 1); } +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +static inline bool cache_defer_immediately(void) +{ + return !fail_sunrpc.ignore_cache_wait && + should_fail(&fail_sunrpc.attr, 1); +} +#else +static inline bool cache_defer_immediately(void) +{ + return false; +} +#endif + /* Return true if and only if a deferred request is queued. */ static bool cache_defer_req(struct cache_req *req, struct cache_head *item) { struct cache_deferred_req *dreq; - if (req->thread_wait) { + if (!cache_defer_immediately()) { cache_wait_req(req, item); if (!test_bit(CACHE_PENDING, &item->flags)) return false; } + dreq = req->defer(req); if (dreq == NULL) return false; @@ -768,7 +794,6 @@ void cache_clean_deferred(void *owner) */ static DEFINE_SPINLOCK(queue_lock); -static DEFINE_MUTEX(queue_io_mutex); struct cache_queue { struct list_head list; @@ -794,7 +819,7 @@ static int cache_request(struct cache_detail *detail, detail->cache_request(detail, crq->item, &bp, &len); if (len < 0) - return -EAGAIN; + return -E2BIG; return PAGE_SIZE - len; } @@ -896,44 +921,26 @@ static ssize_t cache_do_downcall(char *kaddr, const char __user *buf, return ret; } -static ssize_t cache_slow_downcall(const char __user *buf, - size_t count, struct cache_detail *cd) -{ - static char write_buf[8192]; /* protected by queue_io_mutex */ - ssize_t ret = -EINVAL; - - if (count >= sizeof(write_buf)) - goto out; - mutex_lock(&queue_io_mutex); - ret = cache_do_downcall(write_buf, buf, count, cd); - mutex_unlock(&queue_io_mutex); -out: - return ret; -} - static ssize_t cache_downcall(struct address_space *mapping, const char __user *buf, size_t count, struct cache_detail *cd) { - struct page *page; - char *kaddr; + char *write_buf; ssize_t ret = -ENOMEM; - if (count >= PAGE_SIZE) - goto out_slow; + if (count >= 32768) { /* 32k is max userland buffer, lets check anyway */ + ret = -EINVAL; + goto out; + } - page = find_or_create_page(mapping, 0, GFP_KERNEL); - if (!page) - goto out_slow; + write_buf = kvmalloc(count + 1, GFP_KERNEL); + if (!write_buf) + goto out; - kaddr = kmap(page); - ret = cache_do_downcall(kaddr, buf, count, cd); - kunmap(page); - unlock_page(page); - put_page(page); + ret = cache_do_downcall(write_buf, buf, count, cd); + kvfree(write_buf); +out: return ret; -out_slow: - return cache_slow_downcall(buf, count, cd); } static ssize_t cache_write(struct file *filp, const char __user *buf, @@ -1195,20 +1202,12 @@ static bool cache_listeners_exist(struct cache_detail *detail) * * Each request is at most one page long. 
*/ -int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) +static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) { - char *buf; struct cache_request *crq; int ret = 0; - if (!detail->cache_request) - return -EINVAL; - - if (!cache_listeners_exist(detail)) { - warn_no_listener(detail); - return -EINVAL; - } if (test_bit(CACHE_CLEANED, &h->flags)) /* Too late to make an upcall */ return -EAGAIN; @@ -1231,6 +1230,7 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) if (test_bit(CACHE_PENDING, &h->flags)) { crq->item = cache_get(h); list_add_tail(&crq->q.list, &detail->queue); + trace_cache_entry_upcall(detail, h); } else /* Lost a race, no longer PENDING, so don't enqueue */ ret = -EAGAIN; @@ -1242,8 +1242,27 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) } return ret; } + +int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) +{ + if (test_and_set_bit(CACHE_PENDING, &h->flags)) + return 0; + return cache_pipe_upcall(detail, h); +} EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall); +int sunrpc_cache_pipe_upcall_timeout(struct cache_detail *detail, + struct cache_head *h) +{ + if (!cache_listeners_exist(detail)) { + warn_no_listener(detail); + trace_cache_entry_no_listener(detail, h); + return -EINVAL; + } + return sunrpc_cache_pipe_upcall(detail, h); +} +EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall_timeout); + /* * parse a message from user-space and pass it * to an appropriate cache @@ -1415,10 +1434,10 @@ static int c_show(struct seq_file *m, void *p) cache_get(cp); if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ - seq_printf(m, "# "); + seq_puts(m, "# "); else { if (cache_is_expired(cd, cp)) - seq_printf(m, "# "); + seq_puts(m, "# "); cache_put(cp, cd); } @@ -1533,7 +1552,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf, static ssize_t cache_read_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_read(filp, buf, count, ppos, cd); } @@ -1541,14 +1560,14 @@ static ssize_t cache_read_procfs(struct file *filp, char __user *buf, static ssize_t cache_write_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_write(filp, buf, count, ppos, cd); } static __poll_t cache_poll_procfs(struct file *filp, poll_table *wait) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_poll(filp, wait, cd); } @@ -1557,21 +1576,21 @@ static long cache_ioctl_procfs(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_ioctl(inode, filp, cmd, arg, cd); } static int cache_open_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_open(inode, filp, cd); } static int cache_release_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_release(inode, filp, cd); } @@ -1588,14 +1607,14 @@ static const struct proc_ops 
cache_channel_proc_ops = { static int content_open_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return content_open(inode, filp, cd); } static int content_release_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return content_release(inode, filp, cd); } @@ -1609,14 +1628,14 @@ static const struct proc_ops content_proc_ops = { static int open_flush_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return open_flush(inode, filp, cd); } static int release_flush_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return release_flush(inode, filp, cd); } @@ -1624,7 +1643,7 @@ static int release_flush_procfs(struct inode *inode, struct file *filp) static ssize_t read_flush_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return read_flush(filp, buf, count, ppos, cd); } @@ -1633,7 +1652,7 @@ static ssize_t write_flush_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return write_flush(filp, buf, count, ppos, cd); } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7324b21f923e..993acf38af87 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -41,16 +41,13 @@ #include <trace/events/sunrpc.h> #include "sunrpc.h" +#include "sysfs.h" #include "netns.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_CALL #endif -#define dprint_status(t) \ - dprintk("RPC: %5u %s (status %d)\n", t->tk_pid, \ - __func__, t->tk_status) - /* * All RPC clients are linked into this list */ @@ -79,6 +76,7 @@ static int rpc_encode_header(struct rpc_task *task, static int rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr); static int rpc_ping(struct rpc_clnt *clnt); +static int rpc_ping_noreply(struct rpc_clnt *clnt); static void rpc_check_timeout(struct rpc_task *task); static void rpc_register_client(struct rpc_clnt *clnt) @@ -170,7 +168,7 @@ static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event) case RPC_PIPEFS_MOUNT: if (clnt->cl_pipedir_objects.pdh_dentry != NULL) return 1; - if (atomic_read(&clnt->cl_count) == 0) + if (refcount_read(&clnt->cl_count) == 0) return 1; break; case RPC_PIPEFS_UMOUNT: @@ -331,6 +329,7 @@ err_auth: out: if (pipefs_sb) rpc_put_sb_net(net); + rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); return err; } @@ -346,7 +345,7 @@ static int rpc_alloc_clid(struct rpc_clnt *clnt) { int clid; - clid = ida_simple_get(&rpc_clids, 0, 0, GFP_KERNEL); + clid = ida_alloc(&rpc_clids, GFP_KERNEL); if (clid < 0) return clid; clnt->cl_clid = clid; @@ -355,7 +354,7 @@ static int rpc_alloc_clid(struct rpc_clnt *clnt) static void rpc_free_clid(struct rpc_clnt *clnt) { - ida_simple_remove(&rpc_clids, clnt->cl_clid); + ida_free(&rpc_clids, clnt->cl_clid); } static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, @@ -370,10 +369,6 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, const char *nodename = args->nodename; int err; - /* sanity check the name before 
trying to print it */ - dprintk("RPC: creating %s client for %s (xprt %p)\n", - program->name, args->servername, xprt); - err = rpciod_up(); if (err) goto out_no_rpciod; @@ -418,24 +413,28 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, } rpc_clnt_set_transport(clnt, xprt, timeout); + xprt->main = true; xprt_iter_init(&clnt->cl_xpi, xps); xprt_switch_put(xps); clnt->cl_rtt = &clnt->cl_rtt_default; rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval); - atomic_set(&clnt->cl_count, 1); + refcount_set(&clnt->cl_count, 1); if (nodename == NULL) nodename = utsname()->nodename; /* save the nodename */ rpc_clnt_set_nodename(clnt, nodename); + rpc_sysfs_client_setup(clnt, xps, rpc_net_ns(clnt)); err = rpc_client_register(clnt, args->authflavor, args->client_name); if (err) goto out_no_path; if (parent) - atomic_inc(&parent->cl_count); + refcount_inc(&parent->cl_count); + + trace_rpc_clnt_new(clnt, xprt, program->name, args->servername); return clnt; out_no_path: @@ -450,6 +449,7 @@ out_err: out_no_rpciod: xprt_switch_put(xps); xprt_put(xprt); + trace_rpc_clnt_new_err(program->name, args->servername, err); return ERR_PTR(err); } @@ -484,6 +484,12 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, rpc_shutdown_client(clnt); return ERR_PTR(err); } + } else if (args->flags & RPC_CLNT_CREATE_CONNECTED) { + int err = rpc_ping_noreply(clnt); + if (err != 0) { + rpc_shutdown_client(clnt); + return ERR_PTR(err); + } } clnt->cl_softrtry = 1; @@ -634,10 +640,8 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, args->nodename = clnt->cl_nodename; new = rpc_new_client(args, xps, xprt, clnt); - if (IS_ERR(new)) { - err = PTR_ERR(new); - goto out_err; - } + if (IS_ERR(new)) + return new; /* Turn off autobind on clones */ new->cl_autobind = 0; @@ -647,10 +651,11 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; new->cl_principal = clnt->cl_principal; + new->cl_max_connect = clnt->cl_max_connect; return new; out_err: - dprintk("RPC: %s: returned error %d\n", __func__, err); + trace_rpc_clnt_clone_err(clnt, err); return ERR_PTR(err); } @@ -723,11 +728,8 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, int err; xprt = xprt_create_transport(args); - if (IS_ERR(xprt)) { - dprintk("RPC: failed to create new xprt for clnt %p\n", - clnt); + if (IS_ERR(xprt)) return PTR_ERR(xprt); - } xps = xprt_switch_alloc(xprt, GFP_KERNEL); if (xps == NULL) { @@ -743,6 +745,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, rpc_unregister_client(clnt); __rpc_clnt_remove_pipedir(clnt); + rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); /* @@ -767,7 +770,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, rpc_release_client(parent); xprt_switch_put(oldxps); xprt_put(old); - dprintk("RPC: replaced xprt for clnt %p\n", clnt); + trace_rpc_clnt_replace_xprt(clnt); return 0; out_revert: @@ -777,13 +780,14 @@ out_revert: rpc_client_register(clnt, pseudoflavor, NULL); xprt_switch_put(xps); xprt_put(xprt); - dprintk("RPC: failed to switch xprt for clnt %p\n", clnt); + trace_rpc_clnt_replace_xprt_err(clnt); return err; } EXPORT_SYMBOL_GPL(rpc_switch_client_transport); static -int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) +int _rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi, + void func(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps)) { struct 
rpc_xprt_switch *xps; @@ -792,11 +796,24 @@ int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) rcu_read_unlock(); if (xps == NULL) return -EAGAIN; - xprt_iter_init_listall(xpi, xps); + func(xpi, xps); xprt_switch_put(xps); return 0; } +static +int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) +{ + return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listall); +} + +static +int rpc_clnt_xprt_iter_offline_init(struct rpc_clnt *clnt, + struct rpc_xprt_iter *xpi) +{ + return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listoffline); +} + /** * rpc_clnt_iterate_for_each_xprt - Apply a function to all transports * @clnt: pointer to client @@ -844,10 +861,11 @@ void rpc_killall_tasks(struct rpc_clnt *clnt) if (list_empty(&clnt->cl_tasks)) return; - dprintk("RPC: killing all tasks for client %p\n", clnt); + /* * Spin lock all_tasks to prevent changes... */ + trace_rpc_clnt_killall(clnt); spin_lock(&clnt->cl_lock); list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) rpc_signal_task(rovr); @@ -855,6 +873,57 @@ void rpc_killall_tasks(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_killall_tasks); +/** + * rpc_cancel_tasks - try to cancel a set of RPC tasks + * @clnt: Pointer to RPC client + * @error: RPC task error value to set + * @fnmatch: Pointer to selector function + * @data: User data + * + * Uses @fnmatch to define a set of RPC tasks that are to be cancelled. + * The argument @error must be a negative error value. + */ +unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error, + bool (*fnmatch)(const struct rpc_task *, + const void *), + const void *data) +{ + struct rpc_task *task; + unsigned long count = 0; + + if (list_empty(&clnt->cl_tasks)) + return 0; + /* + * Spin lock all_tasks to prevent changes... + */ + spin_lock(&clnt->cl_lock); + list_for_each_entry(task, &clnt->cl_tasks, tk_task) { + if (!RPC_IS_ACTIVATED(task)) + continue; + if (!fnmatch(task, data)) + continue; + rpc_task_try_cancel(task, error); + count++; + } + spin_unlock(&clnt->cl_lock); + return count; +} +EXPORT_SYMBOL_GPL(rpc_cancel_tasks); + +static int rpc_clnt_disconnect_xprt(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, void *dummy) +{ + if (xprt_connected(xprt)) + xprt_force_disconnect(xprt); + return 0; +} + +void rpc_clnt_disconnect(struct rpc_clnt *clnt) +{ + rpc_clnt_iterate_for_each_xprt(clnt, rpc_clnt_disconnect_xprt, NULL); +} +EXPORT_SYMBOL_GPL(rpc_clnt_disconnect); + /* * Properly shut down an RPC client, terminating all outstanding * requests. @@ -863,9 +932,7 @@ void rpc_shutdown_client(struct rpc_clnt *clnt) { might_sleep(); - dprintk_rcu("RPC: shutting down %s client for %s\n", - clnt->cl_program->name, - rcu_dereference(clnt->cl_xprt)->servername); + trace_rpc_clnt_shutdown(clnt); while (!list_empty(&clnt->cl_tasks)) { rpc_killall_tasks(clnt); @@ -880,27 +947,41 @@ EXPORT_SYMBOL_GPL(rpc_shutdown_client); /* * Free an RPC client */ +static void rpc_free_client_work(struct work_struct *work) +{ + struct rpc_clnt *clnt = container_of(work, struct rpc_clnt, cl_work); + + trace_rpc_clnt_free(clnt); + + /* These might block on processes that might allocate memory, + * so they cannot be called in rpciod, so they are handled separately + * here. 
+ */ + rpc_sysfs_client_destroy(clnt); + rpc_clnt_debugfs_unregister(clnt); + rpc_free_clid(clnt); + rpc_clnt_remove_pipedir(clnt); + xprt_put(rcu_dereference_raw(clnt->cl_xprt)); + + kfree(clnt); + rpciod_down(); +} static struct rpc_clnt * rpc_free_client(struct rpc_clnt *clnt) { struct rpc_clnt *parent = NULL; - dprintk_rcu("RPC: destroying %s client for %s\n", - clnt->cl_program->name, - rcu_dereference(clnt->cl_xprt)->servername); + trace_rpc_clnt_release(clnt); if (clnt->cl_parent != clnt) parent = clnt->cl_parent; - rpc_clnt_debugfs_unregister(clnt); - rpc_clnt_remove_pipedir(clnt); rpc_unregister_client(clnt); rpc_free_iostats(clnt->cl_metrics); clnt->cl_metrics = NULL; - xprt_put(rcu_dereference_raw(clnt->cl_xprt)); xprt_iter_destroy(&clnt->cl_xpi); - rpciod_down(); put_cred(clnt->cl_cred); - rpc_free_clid(clnt); - kfree(clnt); + + INIT_WORK(&clnt->cl_work, rpc_free_client_work); + schedule_work(&clnt->cl_work); return parent; } @@ -910,18 +991,16 @@ rpc_free_client(struct rpc_clnt *clnt) static struct rpc_clnt * rpc_free_auth(struct rpc_clnt *clnt) { - if (clnt->cl_auth == NULL) - return rpc_free_client(clnt); - /* * Note: RPCSEC_GSS may need to send NULL RPC calls in order to * release remaining GSS contexts. This mechanism ensures * that it can do so safely. */ - atomic_inc(&clnt->cl_count); - rpcauth_release(clnt->cl_auth); - clnt->cl_auth = NULL; - if (atomic_dec_and_test(&clnt->cl_count)) + if (clnt->cl_auth != NULL) { + rpcauth_release(clnt->cl_auth); + clnt->cl_auth = NULL; + } + if (refcount_dec_and_test(&clnt->cl_count)) return rpc_free_client(clnt); return NULL; } @@ -932,12 +1011,10 @@ rpc_free_auth(struct rpc_clnt *clnt) void rpc_release_client(struct rpc_clnt *clnt) { - dprintk("RPC: rpc_release_client(%p)\n", clnt); - do { if (list_empty(&clnt->cl_tasks)) wake_up(&destroy_wait); - if (!atomic_dec_and_test(&clnt->cl_count)) + if (refcount_dec_not_one(&clnt->cl_count)) break; clnt = rpc_free_auth(clnt); } while (clnt != NULL); @@ -1061,8 +1138,13 @@ rpc_task_get_next_xprt(struct rpc_clnt *clnt) static void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) { - if (task->tk_xprt) - return; + if (task->tk_xprt) { + if (!(test_bit(XPRT_OFFLINE, &task->tk_xprt->state) && + (task->tk_flags & RPC_TASK_MOVEABLE))) + return; + xprt_release(task); + xprt_put(task->tk_xprt); + } if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) task->tk_xprt = rpc_task_get_first_xprt(clnt); else @@ -1072,24 +1154,19 @@ void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) static void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) { - - if (clnt != NULL) { - rpc_task_set_transport(task, clnt); - task->tk_client = clnt; - atomic_inc(&clnt->cl_count); - if (clnt->cl_softrtry) - task->tk_flags |= RPC_TASK_SOFT; - if (clnt->cl_softerr) - task->tk_flags |= RPC_TASK_TIMEOUT; - if (clnt->cl_noretranstimeo) - task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; - if (atomic_read(&clnt->cl_swapper)) - task->tk_flags |= RPC_TASK_SWAPPER; - /* Add to the client's list of all tasks */ - spin_lock(&clnt->cl_lock); - list_add_tail(&task->tk_task, &clnt->cl_tasks); - spin_unlock(&clnt->cl_lock); - } + rpc_task_set_transport(task, clnt); + task->tk_client = clnt; + refcount_inc(&clnt->cl_count); + if (clnt->cl_softrtry) + task->tk_flags |= RPC_TASK_SOFT; + if (clnt->cl_softerr) + task->tk_flags |= RPC_TASK_TIMEOUT; + if (clnt->cl_noretranstimeo) + task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; + /* Add to the client's list of all tasks */ + 
spin_lock(&clnt->cl_lock); + list_add_tail(&task->tk_task, &clnt->cl_tasks); + spin_unlock(&clnt->cl_lock); } static void @@ -1099,8 +1176,9 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) task->tk_msg.rpc_proc = msg->rpc_proc; task->tk_msg.rpc_argp = msg->rpc_argp; task->tk_msg.rpc_resp = msg->rpc_resp; - if (msg->rpc_cred != NULL) - task->tk_msg.rpc_cred = get_cred(msg->rpc_cred); + task->tk_msg.rpc_cred = msg->rpc_cred; + if (!(task->tk_flags & RPC_TASK_CRED_NOREF)) + get_cred(task->tk_msg.rpc_cred); } } @@ -1125,6 +1203,11 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) struct rpc_task *task; task = rpc_new_task(task_setup_data); + if (IS_ERR(task)) + return task; + + if (!RPC_IS_ASYNC(task)) + task->tk_flags |= RPC_TASK_CRED_NOREF; rpc_task_set_client(task, task_setup_data->rpc_client); rpc_task_set_rpc_message(task, task_setup_data->rpc_message); @@ -1222,6 +1305,11 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) * Create an rpc_task to send the data */ task = rpc_new_task(&task_setup_data); + if (IS_ERR(task)) { + xprt_free_bc_request(req); + return task; + } + xprt_init_bc_request(req, task); task->tk_action = call_bc_encode; @@ -1247,13 +1335,10 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, unsigned int base, unsigned int len, unsigned int hdrsize) { - /* Subtract one to force an extra word of buffer space for the - * payload's XDR pad to fall into the rcv_buf's tail iovec. - */ - hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1; + hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign; xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len); - trace_rpc_reply_pages(req); + trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf); } EXPORT_SYMBOL_GPL(rpc_prepare_reply_pages); @@ -1607,7 +1692,8 @@ const char static void __rpc_call_rpcerror(struct rpc_task *task, int tk_status, int rpc_status) { - task->tk_rpc_status = rpc_status; + trace_rpc_call_rpcerror(task, tk_status, rpc_status); + rpc_task_set_rpc_status(task, rpc_status); rpc_exit(task, tk_status); } @@ -1630,10 +1716,6 @@ call_start(struct rpc_task *task) int idx = task->tk_msg.rpc_proc->p_statidx; trace_rpc_request(task); - dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid, - clnt->cl_program->name, clnt->cl_vers, - rpc_proc_name(task), - (RPC_IS_ASYNC(task) ? "async" : "sync")); /* Increment call count (version might not be valid for ping) */ if (clnt->cl_program->version[clnt->cl_vers]) @@ -1649,8 +1731,6 @@ call_start(struct rpc_task *task) static void call_reserve(struct rpc_task *task) { - dprint_status(task); - task->tk_status = 0; task->tk_action = call_reserveresult; xprt_reserve(task); @@ -1666,8 +1746,6 @@ call_reserveresult(struct rpc_task *task) { int status = task->tk_status; - dprint_status(task); - /* * After a call to xprt_reserve(), we must have either * a request slot or else an error status. @@ -1683,17 +1761,10 @@ call_reserveresult(struct rpc_task *task) return; } - /* - * Even though there was an error, we may have acquired - * a request slot somehow. Make sure not to leak it. 
- */ - if (task->tk_rqstp) - xprt_release(task); - switch (status) { case -ENOMEM: rpc_delay(task, HZ >> 2); - /* fall through */ + fallthrough; case -EAGAIN: /* woken up; retry */ task->tk_action = call_retry_reserve; return; @@ -1708,8 +1779,6 @@ call_reserveresult(struct rpc_task *task) static void call_retry_reserve(struct rpc_task *task) { - dprint_status(task); - task->tk_status = 0; task->tk_action = call_reserveresult; xprt_retry_reserve(task); @@ -1721,8 +1790,6 @@ call_retry_reserve(struct rpc_task *task) static void call_refresh(struct rpc_task *task) { - dprint_status(task); - task->tk_action = call_refreshresult; task->tk_status = 0; task->tk_client->cl_stats->rpcauthrefresh++; @@ -1737,8 +1804,6 @@ call_refreshresult(struct rpc_task *task) { int status = task->tk_status; - dprint_status(task); - task->tk_status = 0; task->tk_action = call_refresh; switch (status) { @@ -1750,23 +1815,24 @@ call_refreshresult(struct rpc_task *task) /* Use rate-limiting and a max number of retries if refresh * had status 0 but failed to update the cred. */ - /* fall through */ + fallthrough; case -ETIMEDOUT: rpc_delay(task, 3*HZ); - /* fall through */ + fallthrough; case -EAGAIN: status = -EACCES; - /* fall through */ + fallthrough; case -EKEYEXPIRED: if (!task->tk_cred_retry) break; task->tk_cred_retry--; - dprintk("RPC: %5u %s: retry refresh creds\n", - task->tk_pid, __func__); + trace_rpc_retry_refresh_status(task); + return; + case -ENOMEM: + rpc_delay(task, HZ >> 4); return; } - dprintk("RPC: %5u %s: refresh creds failed with error %d\n", - task->tk_pid, __func__, status); + trace_rpc_refresh_status(task); rpc_call_rpcerror(task, status); } @@ -1783,8 +1849,6 @@ call_allocate(struct rpc_task *task) const struct rpc_procinfo *proc = task->tk_msg.rpc_proc; int status; - dprint_status(task); - task->tk_status = 0; task->tk_action = call_encode; @@ -1814,7 +1878,7 @@ call_allocate(struct rpc_task *task) req->rq_rcvsize <<= 2; status = xprt->ops->buf_alloc(task); - xprt_inject_disconnect(xprt); + trace_rpc_buf_alloc(task, status); if (status == 0) return; if (status != -ENOMEM) { @@ -1822,8 +1886,6 @@ call_allocate(struct rpc_task *task) return; } - dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); - if (RPC_IS_ASYNC(task) || !fatal_signal_pending(current)) { task->tk_action = call_allocate; rpc_delay(task, HZ>>4); @@ -1859,7 +1921,6 @@ rpc_xdr_encode(struct rpc_task *task) req->rq_snd_buf.head[0].iov_len = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, req->rq_snd_buf.head[0].iov_base, req); - xdr_free_bvec(&req->rq_snd_buf); if (rpc_encode_header(task, &xdr)) return; @@ -1874,11 +1935,14 @@ call_encode(struct rpc_task *task) { if (!rpc_task_need_encode(task)) goto out; - dprint_status(task); + /* Dequeue task from the receive queue while we're encoding */ xprt_request_dequeue_xprt(task); /* Encode here so that rpcsec_gss can use correct sequence number. */ rpc_xdr_encode(task); + /* Add task to reply queue before transmission to avoid races */ + if (task->tk_status == 0 && rpc_reply_expected(task)) + task->tk_status = xprt_request_enqueue_receive(task); /* Did the encode result in an error condition? */ if (task->tk_status != 0) { /* Was the error nonfatal? 
*/ @@ -1889,12 +1953,11 @@ call_encode(struct rpc_task *task) break; case -EKEYEXPIRED: if (!task->tk_cred_retry) { - rpc_exit(task, task->tk_status); + rpc_call_rpcerror(task, task->tk_status); } else { task->tk_action = call_refresh; task->tk_cred_retry--; - dprintk("RPC: %5u %s: retry refresh creds\n", - task->tk_pid, __func__); + trace_rpc_retry_refresh_status(task); } break; default: @@ -1903,9 +1966,6 @@ call_encode(struct rpc_task *task) return; } - /* Add task to reply queue before transmission to avoid races */ - if (rpc_reply_expected(task)) - xprt_request_enqueue_receive(task); xprt_request_enqueue_transmit(task); out: task->tk_action = call_transmit; @@ -1951,8 +2011,6 @@ call_bind(struct rpc_task *task) return; } - dprint_status(task); - task->tk_action = call_bind_status; if (!xprt_prepare_transmit(task)) return; @@ -1974,8 +2032,6 @@ call_bind_status(struct rpc_task *task) return; } - dprint_status(task); - trace_rpc_bind_status(task); if (task->tk_status >= 0) goto out_next; if (xprt_bound(xprt)) { @@ -1985,12 +2041,10 @@ call_bind_status(struct rpc_task *task) switch (task->tk_status) { case -ENOMEM: - dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid); rpc_delay(task, HZ >> 2); goto retry_timeout; case -EACCES: - dprintk("RPC: %5u remote rpcbind: RPC program/version " - "unavailable\n", task->tk_pid); + trace_rpcb_prog_unavail_err(task); /* fail immediately if this is an RPC ping */ if (task->tk_msg.rpc_proc->p_proc == 0) { status = -EOPNOTSUPP; @@ -2007,17 +2061,14 @@ call_bind_status(struct rpc_task *task) case -EAGAIN: goto retry_timeout; case -ETIMEDOUT: - dprintk("RPC: %5u rpcbind request timed out\n", - task->tk_pid); + trace_rpcb_timeout_err(task); goto retry_timeout; case -EPFNOSUPPORT: /* server doesn't support any rpcbind version we know of */ - dprintk("RPC: %5u unrecognized remote rpcbind service\n", - task->tk_pid); + trace_rpcb_bind_version_err(task); break; case -EPROTONOSUPPORT: - dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n", - task->tk_pid); + trace_rpcb_bind_version_err(task); goto retry_timeout; case -ECONNREFUSED: /* connection problems */ case -ECONNRESET: @@ -2028,8 +2079,7 @@ call_bind_status(struct rpc_task *task) case -EHOSTUNREACH: case -ENETUNREACH: case -EPIPE: - dprintk("RPC: %5u remote rpcbind unreachable: %d\n", - task->tk_pid, task->tk_status); + trace_rpcb_unreachable_err(task); if (!RPC_IS_SOFTCONN(task)) { rpc_delay(task, 5*HZ); goto retry_timeout; @@ -2037,8 +2087,7 @@ call_bind_status(struct rpc_task *task) status = task->tk_status; break; default: - dprintk("RPC: %5u unrecognized rpcbind error (%d)\n", - task->tk_pid, -task->tk_status); + trace_rpcb_unrecognized_err(task); } rpc_call_rpcerror(task, status); @@ -2070,10 +2119,6 @@ call_connect(struct rpc_task *task) return; } - dprintk("RPC: %5u call_connect xprt %p %s connected\n", - task->tk_pid, xprt, - (xprt_connected(xprt) ? 
"is" : "is not")); - task->tk_action = call_connect_status; if (task->tk_status < 0) return; @@ -2101,7 +2146,6 @@ call_connect_status(struct rpc_task *task) return; } - dprint_status(task); trace_rpc_connect_status(task); if (task->tk_status == 0) { @@ -2123,7 +2167,7 @@ call_connect_status(struct rpc_task *task) rpc_force_rebind(clnt); goto out_retry; } - /* fall through */ + fallthrough; case -ECONNRESET: case -ECONNABORTED: case -ENETDOWN: @@ -2137,11 +2181,36 @@ call_connect_status(struct rpc_task *task) break; /* retry with existing socket, after a delay */ rpc_delay(task, 3*HZ); - /* fall through */ + fallthrough; case -EADDRINUSE: case -ENOTCONN: case -EAGAIN: case -ETIMEDOUT: + if (!(task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) && + (task->tk_flags & RPC_TASK_MOVEABLE) && + test_bit(XPRT_REMOVE, &xprt->state)) { + struct rpc_xprt *saved = task->tk_xprt; + struct rpc_xprt_switch *xps; + + rcu_read_lock(); + xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); + rcu_read_unlock(); + if (xps->xps_nxprts > 1) { + long value; + + xprt_release(task); + value = atomic_long_dec_return(&xprt->queuelen); + if (value == 0) + rpc_xprt_switch_remove_xprt(xps, saved, + true); + xprt_put(saved); + task->tk_xprt = NULL; + task->tk_action = call_start; + } + xprt_switch_put(xps); + if (!task->tk_xprt) + return; + } goto out_retry; case -ENOBUFS: rpc_delay(task, HZ >> 2); @@ -2169,8 +2238,6 @@ call_transmit(struct rpc_task *task) return; } - dprint_status(task); - task->tk_action = call_transmit_status; if (!xprt_prepare_transmit(task)) return; @@ -2205,7 +2272,6 @@ call_transmit_status(struct rpc_task *task) switch (task->tk_status) { default: - dprint_status(task); break; case -EBADMSG: task->tk_status = 0; @@ -2217,9 +2283,10 @@ call_transmit_status(struct rpc_task *task) * socket just returned a connection error, * then hold onto the transport lock. */ + case -ENOMEM: case -ENOBUFS: rpc_delay(task, HZ>>2); - /* fall through */ + fallthrough; case -EBADSLT: case -EAGAIN: task->tk_action = call_transmit; @@ -2238,7 +2305,7 @@ call_transmit_status(struct rpc_task *task) rpc_call_rpcerror(task, task->tk_status); return; } - /* fall through */ + fallthrough; case -ECONNRESET: case -ECONNABORTED: case -EADDRINUSE: @@ -2287,8 +2354,6 @@ call_bc_transmit_status(struct rpc_task *task) if (rpc_task_transmitted(task)) task->tk_status = 0; - dprint_status(task); - switch (task->tk_status) { case 0: /* Success */ @@ -2302,9 +2367,10 @@ call_bc_transmit_status(struct rpc_task *task) case -ENOTCONN: case -EPIPE: break; + case -ENOMEM: case -ENOBUFS: rpc_delay(task, HZ>>2); - /* fall through */ + fallthrough; case -EBADSLT: case -EAGAIN: task->tk_status = 0; @@ -2348,8 +2414,6 @@ call_status(struct rpc_task *task) if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); - dprint_status(task); - status = task->tk_status; if (status >= 0) { task->tk_action = call_decode; @@ -2371,7 +2435,7 @@ call_status(struct rpc_task *task) * were a timeout. 
*/ rpc_delay(task, 3*HZ); - /* fall through */ + fallthrough; case -ETIMEDOUT: break; case -ECONNREFUSED: @@ -2382,10 +2446,15 @@ call_status(struct rpc_task *task) break; case -EADDRINUSE: rpc_delay(task, 3*HZ); - /* fall through */ + fallthrough; case -EPIPE: case -EAGAIN: break; + case -ENFILE: + case -ENOBUFS: + case -ENOMEM: + rpc_delay(task, HZ>>2); + break; case -EIO: /* shutdown or soft timeout */ goto out_exit; @@ -2396,7 +2465,8 @@ call_status(struct rpc_task *task) goto out_exit; } task->tk_action = call_encode; - rpc_check_timeout(task); + if (status != -ECONNRESET && status != -ECONNABORTED) + rpc_check_timeout(task); return; out_exit: rpc_call_rpcerror(task, status); @@ -2416,10 +2486,13 @@ rpc_check_timeout(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; + if (RPC_SIGNALLED(task)) + return; + if (xprt_adjust_timeout(task->tk_rqstp) == 0) return; - dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid); + trace_rpc_timeout_status(task); task->tk_timeouts++; if (RPC_IS_SOFTCONN(task) && !rpc_check_connected(task->tk_rqstp)) { @@ -2478,8 +2551,6 @@ call_decode(struct rpc_task *task) struct xdr_stream xdr; int err; - dprint_status(task); - if (!task->tk_msg.rpc_proc->p_decode) { task->tk_action = rpc_exit_task; return; @@ -2495,12 +2566,6 @@ call_decode(struct rpc_task *task) } /* - * Ensure that we see all writes made by xprt_complete_rqst() - * before it changed req->rq_reply_bytes_recvd. - */ - smp_rmb(); - - /* * Did we ever call xprt_complete_rqst()? If not, we should assume * the message is incomplete. */ @@ -2508,7 +2573,13 @@ call_decode(struct rpc_task *task) if (!req->rq_reply_bytes_recvd) goto out; + /* Ensure that we see all writes made by xprt_complete_rqst() + * before it changed req->rq_reply_bytes_recvd. + */ + smp_rmb(); + req->rq_rcv_buf.len = req->rq_private_buf.len; + trace_rpc_xdr_recvfrom(task, &req->rq_rcv_buf); /* Check that the softirq receive buffer is valid */ WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf, @@ -2522,8 +2593,6 @@ out: case 0: task->tk_action = rpc_exit_task; task->tk_status = rpcauth_unwrap_resp(task, &xdr); - dprintk("RPC: %5u %s result %d\n", - task->tk_pid, __func__, task->tk_status); return; case -EAGAIN: task->tk_status = 0; @@ -2644,7 +2713,7 @@ out_unparsable: out_verifier: trace_rpc_bad_verifier(task); - goto out_garbage; + goto out_err; out_msg_denied: error = -EACCES; @@ -2711,17 +2780,22 @@ static const struct rpc_procinfo rpcproc_null = { .p_decode = rpcproc_decode_null, }; -static int rpc_ping(struct rpc_clnt *clnt) +static const struct rpc_procinfo rpcproc_null_noreply = { + .p_encode = rpcproc_encode_null, +}; + +static void +rpc_null_call_prepare(struct rpc_task *task, void *data) { - struct rpc_message msg = { - .rpc_proc = &rpcproc_null, - }; - int err; - err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN | - RPC_TASK_NULLCREDS); - return err; + task->tk_flags &= ~RPC_TASK_NO_RETRANS_TIMEOUT; + rpc_call_start(task); } +static const struct rpc_call_ops rpc_null_ops = { + .rpc_call_prepare = rpc_null_call_prepare, + .rpc_call_done = rpc_default_callback, +}; + static struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_cred *cred, int flags, @@ -2735,9 +2809,10 @@ struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, .rpc_xprt = xprt, .rpc_message = &msg, .rpc_op_cred = cred, - .callback_ops = (ops != NULL) ? 
ops : &rpc_default_ops, + .callback_ops = ops ?: &rpc_null_ops, .callback_data = data, - .flags = flags | RPC_TASK_NULLCREDS, + .flags = flags | RPC_TASK_SOFT | RPC_TASK_SOFTCONN + + RPC_TASK_NULLCREDS, }; return rpc_run_task(&task_setup_data); @@ -2749,6 +2824,41 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int } EXPORT_SYMBOL_GPL(rpc_call_null); +static int rpc_ping(struct rpc_clnt *clnt) +{ + struct rpc_task *task; + int status; + + task = rpc_call_null_helper(clnt, NULL, NULL, 0, NULL, NULL); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); + return status; +} + +static int rpc_ping_noreply(struct rpc_clnt *clnt) +{ + struct rpc_message msg = { + .rpc_proc = &rpcproc_null_noreply, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = &rpc_null_ops, + .flags = RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, + }; + struct rpc_task *task; + int status; + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); + return status; +} + struct rpc_cb_add_xprt_calldata { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; @@ -2772,6 +2882,7 @@ static void rpc_cb_add_xprt_release(void *calldata) } static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { + .rpc_call_prepare = rpc_null_call_prepare, .rpc_call_done = rpc_cb_add_xprt_done, .rpc_release = rpc_cb_add_xprt_release, }; @@ -2790,7 +2901,16 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_cb_add_xprt_calldata *data; struct rpc_task *task; - data = kmalloc(sizeof(*data), GFP_NOFS); + if (xps->xps_nunique_destaddr_xprts + 1 > clnt->cl_max_connect) { + rcu_read_lock(); + pr_warn("SUNRPC: reached max allowed number (%d), did not add " + "transport to server: %s\n", clnt->cl_max_connect, + rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR)); + rcu_read_unlock(); + return -EINVAL; + } + + data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->xps = xprt_switch_get(xps); @@ -2800,17 +2920,42 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, goto success; } - task = rpc_call_null_helper(clnt, xprt, NULL, - RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS, + task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_ASYNC, &rpc_cb_add_xprt_call_ops, data); if (IS_ERR(task)) return PTR_ERR(task); + + data->xps->xps_nunique_destaddr_xprts++; rpc_put_task(task); success: return 1; } EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt); +static int rpc_clnt_add_xprt_helper(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + struct rpc_add_xprt_test *data) +{ + struct rpc_task *task; + int status = -EADDRINUSE; + + /* Test the connection */ + task = rpc_call_null_helper(clnt, xprt, NULL, 0, NULL, NULL); + if (IS_ERR(task)) + return PTR_ERR(task); + + status = task->tk_status; + rpc_put_task(task); + + if (status < 0) + return status; + + /* rpc_xprt_switch and rpc_xprt are dereferenced by add_xprt_test() */ + data->add_xprt_test(clnt, xprt, data->data); + + return 0; +} + /** * rpc_clnt_setup_test_and_add_xprt() * @@ -2834,8 +2979,6 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { - struct rpc_task *task; - struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data; int status = -EADDRINUSE; xprt = xprt_get(xprt); @@ -2844,33 +2987,19 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, if (rpc_xprt_switch_has_addr(xps,
(struct sockaddr *)&xprt->addr)) goto out_err; - /* Test the connection */ - task = rpc_call_null_helper(clnt, xprt, NULL, - RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, - NULL, NULL); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out_err; - } - status = task->tk_status; - rpc_put_task(task); - + status = rpc_clnt_add_xprt_helper(clnt, xprt, data); if (status < 0) goto out_err; - /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */ - xtest->add_xprt_test(clnt, xprt, xtest->data); - - xprt_put(xprt); - xprt_switch_put(xps); - - /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */ - return 1; + status = 1; out_err: xprt_put(xprt); xprt_switch_put(xps); - pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not added\n", - status, xprt->address_strings[RPC_DISPLAY_ADDR]); + if (status < 0) + pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not " + "added\n", status, + xprt->address_strings[RPC_DISPLAY_ADDR]); + /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */ return status; } EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt); @@ -2901,7 +3030,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, unsigned long connect_timeout; unsigned long reconnect_timeout; unsigned char resvport, reuseport; - int ret = 0; + int ret = 0, ident; rcu_read_lock(); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); @@ -2915,8 +3044,11 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, reuseport = xprt->reuseport; connect_timeout = xprt->connect_timeout; reconnect_timeout = xprt->max_reconnect_timeout; + ident = xprt->xprt_class->ident; rcu_read_unlock(); + if (!xprtargs->ident) + xprtargs->ident = ident; xprt = xprt_create_transport(xprtargs); if (IS_ERR(xprt)) { ret = PTR_ERR(xprt); @@ -2944,6 +3076,110 @@ out_put_switch: } EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt); +static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + struct rpc_add_xprt_test *data) +{ + struct rpc_xprt_switch *xps; + struct rpc_xprt *main_xprt; + int status = 0; + + xprt_get(xprt); + + rcu_read_lock(); + main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); + xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); + status = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr, + (struct sockaddr *)&main_xprt->addr); + rcu_read_unlock(); + xprt_put(main_xprt); + if (status || !test_bit(XPRT_OFFLINE, &xprt->state)) + goto out; + + status = rpc_clnt_add_xprt_helper(clnt, xprt, data); +out: + xprt_put(xprt); + xprt_switch_put(xps); + return status; +} + +/* rpc_clnt_probe_trunked_xprts -- probe offlined transport for session trunking + * @clnt rpc_clnt structure + * + * For each offlined transport found in the rpc_clnt structure call + * the function rpc_xprt_probe_trunked() which will determine if this + * transport still belongs to the trunking group.
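/* Illustrative sketch, not part of the patch: a hypothetical caller of
 * rpc_clnt_probe_trunked_xprts(). Only struct rpc_add_xprt_test and
 * rpc_clnt_probe_trunked_xprts() come from the code in this diff; the
 * demo_* names are assumptions.
 */
static void demo_trunking_test(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
			       void *data)
{
	/* Decide whether @xprt still belongs to the trunking group
	 * described by @data, and put it back online if it does. */
}

static void demo_probe_offline_xprts(struct rpc_clnt *clnt, void *session)
{
	struct rpc_add_xprt_test test = {
		.add_xprt_test	= demo_trunking_test,
		.data		= session,
	};

	/* Walks the client's offlined transports; each one that still
	 * answers a NULL ping is handed to the test callback. */
	rpc_clnt_probe_trunked_xprts(clnt, &test);
}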
+ */ +void rpc_clnt_probe_trunked_xprts(struct rpc_clnt *clnt, + struct rpc_add_xprt_test *data) +{ + struct rpc_xprt_iter xpi; + int ret; + + ret = rpc_clnt_xprt_iter_offline_init(clnt, &xpi); + if (ret) + return; + for (;;) { + struct rpc_xprt *xprt = xprt_iter_get_next(&xpi); + + if (!xprt) + break; + ret = rpc_xprt_probe_trunked(clnt, xprt, data); + xprt_put(xprt); + if (ret < 0) + break; + xprt_iter_rewind(&xpi); + } + xprt_iter_destroy(&xpi); +} +EXPORT_SYMBOL_GPL(rpc_clnt_probe_trunked_xprts); + +static int rpc_xprt_offline(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + void *data) +{ + struct rpc_xprt *main_xprt; + struct rpc_xprt_switch *xps; + int err = 0; + + xprt_get(xprt); + + rcu_read_lock(); + main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); + xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); + err = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr, + (struct sockaddr *)&main_xprt->addr); + rcu_read_unlock(); + xprt_put(main_xprt); + if (err) + goto out; + + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { + err = -EINTR; + goto out; + } + xprt_set_offline_locked(xprt, xps); + + xprt_release_write(xprt, NULL); +out: + xprt_put(xprt); + xprt_switch_put(xps); + return err; +} + +/* rpc_clnt_manage_trunked_xprts -- offline trunked transports + * @clnt rpc_clnt structure + * + * For each active transport found in the rpc_clnt structure call + * the function rpc_xprt_offline() which will identify trunked transports + * and will mark them offline. + */ +void rpc_clnt_manage_trunked_xprts(struct rpc_clnt *clnt) +{ + rpc_clnt_iterate_for_each_xprt(clnt, rpc_xprt_offline, NULL); +} +EXPORT_SYMBOL_GPL(rpc_clnt_manage_trunked_xprts); + struct connect_timeout_data { unsigned long connect_timeout; unsigned long reconnect_timeout; @@ -2986,8 +3222,22 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); +void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt) +{ + struct rpc_xprt_switch *xps; + + rcu_read_lock(); + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + rcu_read_unlock(); + xprt_set_online_locked(xprt, xps); +} + void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { + if (rpc_clnt_xprt_switch_has_addr(clnt, + (const struct sockaddr *)&xprt->addr)) { + return rpc_clnt_xprt_set_online(clnt, xprt); + } rcu_read_lock(); rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), xprt); @@ -2995,6 +3245,19 @@ void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); +void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) +{ + struct rpc_xprt_switch *xps; + + rcu_read_lock(); + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + rpc_xprt_switch_remove_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), + xprt, 0); + xps->xps_nunique_destaddr_xprts--; + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_remove_xprt); + bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap) { @@ -3066,6 +3329,8 @@ rpc_clnt_swap_activate_callback(struct rpc_clnt *clnt, int rpc_clnt_swap_activate(struct rpc_clnt *clnt) { + while (clnt != clnt->cl_parent) + clnt = clnt->cl_parent; if (atomic_inc_return(&clnt->cl_swapper) == 1) return rpc_clnt_iterate_for_each_xprt(clnt, rpc_clnt_swap_activate_callback, NULL); diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index fd9bca242724..a176d5a0b0ee 100644 --- 
a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -8,14 +8,14 @@ #include <linux/debugfs.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/clnt.h> + #include "netns.h" +#include "fail.h" static struct dentry *topdir; static struct dentry *rpc_clnt_dir; static struct dentry *rpc_xprt_dir; -unsigned int rpc_inject_disconnect; - static int tasks_show(struct seq_file *f, void *v) { @@ -90,7 +90,7 @@ static int tasks_open(struct inode *inode, struct file *filp) struct seq_file *seq = filp->private_data; struct rpc_clnt *clnt = seq->private = inode->i_private; - if (!atomic_inc_not_zero(&clnt->cl_count)) { + if (!refcount_inc_not_zero(&clnt->cl_count)) { seq_release(inode, filp); ret = -EINVAL; } @@ -128,13 +128,13 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n return 0; len = snprintf(name, sizeof(name), "../../rpc_xprt/%s", xprt->debugfs->d_name.name); - if (len > sizeof(name)) + if (len >= sizeof(name)) return -1; if (*nump == 0) strcpy(link, "xprt"); else { len = snprintf(link, sizeof(link), "xprt%d", *nump); - if (len > sizeof(link)) + if (len >= sizeof(link)) return -1; } debugfs_create_symlink(link, clnt->cl_debugfs, name); @@ -235,8 +235,6 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt) /* make tasks file */ debugfs_create_file("info", S_IFREG | 0400, xprt->debugfs, xprt, &xprt_info_fops); - - atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); } void @@ -246,56 +244,33 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt) xprt->debugfs = NULL; } -static int -fault_open(struct inode *inode, struct file *filp) -{ - filp->private_data = kmalloc(128, GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - return 0; -} +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +struct fail_sunrpc_attr fail_sunrpc = { + .attr = FAULT_ATTR_INITIALIZER, +}; +EXPORT_SYMBOL_GPL(fail_sunrpc); -static int -fault_release(struct inode *inode, struct file *filp) +static void fail_sunrpc_init(void) { - kfree(filp->private_data); - return 0; -} + struct dentry *dir; -static ssize_t -fault_disconnect_read(struct file *filp, char __user *user_buf, - size_t len, loff_t *offset) -{ - char *buffer = (char *)filp->private_data; - size_t size; + dir = fault_create_debugfs_attr("fail_sunrpc", NULL, + &fail_sunrpc.attr); - size = sprintf(buffer, "%u\n", rpc_inject_disconnect); - return simple_read_from_buffer(user_buf, len, offset, buffer, size); -} + debugfs_create_bool("ignore-client-disconnect", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_client_disconnect); -static ssize_t -fault_disconnect_write(struct file *filp, const char __user *user_buf, - size_t len, loff_t *offset) + debugfs_create_bool("ignore-server-disconnect", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_server_disconnect); + + debugfs_create_bool("ignore-cache-wait", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_cache_wait); +} +#else +static void fail_sunrpc_init(void) { - char buffer[16]; - - if (len >= sizeof(buffer)) - len = sizeof(buffer) - 1; - if (copy_from_user(buffer, user_buf, len)) - return -EFAULT; - buffer[len] = '\0'; - if (kstrtouint(buffer, 10, &rpc_inject_disconnect)) - return -EINVAL; - return len; } - -static const struct file_operations fault_disconnect_fops = { - .owner = THIS_MODULE, - .open = fault_open, - .read = fault_disconnect_read, - .write = fault_disconnect_write, - .release = fault_release, -}; +#endif void __exit sunrpc_debugfs_exit(void) @@ -309,16 +284,11 @@ sunrpc_debugfs_exit(void) void __init sunrpc_debugfs_init(void) { - struct dentry *rpc_fault_dir; - topdir = 
debugfs_create_dir("sunrpc", NULL); rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir); rpc_xprt_dir = debugfs_create_dir("rpc_xprt", topdir); - rpc_fault_dir = debugfs_create_dir("inject_fault", topdir); - - debugfs_create_file("disconnect", S_IFREG | 0400, rpc_fault_dir, NULL, - &fault_disconnect_fops); + fail_sunrpc_init(); } diff --git a/net/sunrpc/fail.h b/net/sunrpc/fail.h new file mode 100644 index 000000000000..4b4b500df428 --- /dev/null +++ b/net/sunrpc/fail.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021, Oracle. All rights reserved. + */ + +#ifndef _NET_SUNRPC_FAIL_H_ +#define _NET_SUNRPC_FAIL_H_ + +#include <linux/fault-inject.h> + +#if IS_ENABLED(CONFIG_FAULT_INJECTION) + +struct fail_sunrpc_attr { + struct fault_attr attr; + + bool ignore_client_disconnect; + bool ignore_server_disconnect; + bool ignore_cache_wait; +}; + +extern struct fail_sunrpc_attr fail_sunrpc; + +#endif /* CONFIG_FAULT_INJECTION */ + +#endif /* _NET_SUNRPC_FAIL_H_ */ diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 39e14d5edaf1..0b6034fab9ab 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -197,7 +197,7 @@ static struct inode * rpc_alloc_inode(struct super_block *sb) { struct rpc_inode *rpci; - rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL); + rpci = alloc_inode_sb(sb, rpc_inode_cachep, GFP_KERNEL); if (!rpci) return NULL; return &rpci->vfs_inode; @@ -423,7 +423,7 @@ rpc_info_open(struct inode *inode, struct file *file) spin_lock(&file->f_path.dentry->d_lock); if (!d_unhashed(file->f_path.dentry)) clnt = RPC_I(inode)->private; - if (clnt != NULL && atomic_inc_not_zero(&clnt->cl_count)) { + if (clnt != NULL && refcount_inc_not_zero(&clnt->cl_count)) { spin_unlock(&file->f_path.dentry->d_lock); m->private = clnt; } else { @@ -478,6 +478,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode) inode->i_fop = &simple_dir_operations; inode->i_op = &simple_dir_inode_operations; inc_nlink(inode); + break; default: break; } @@ -599,9 +600,9 @@ static int __rpc_rmdir(struct inode *dir, struct dentry *dentry) dget(dentry); ret = simple_rmdir(dir, dentry); + d_drop(dentry); if (!ret) fsnotify_rmdir(dir, dentry); - d_delete(dentry); dput(dentry); return ret; } @@ -612,9 +613,9 @@ static int __rpc_unlink(struct inode *dir, struct dentry *dentry) dget(dentry); ret = simple_unlink(dir, dentry); + d_drop(dentry); if (!ret) fsnotify_unlink(dir, dentry); - d_delete(dentry); dput(dentry); return ret; } @@ -781,7 +782,8 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, } /** - * rpc_mkpipe - make an rpc_pipefs file for kernel<->userspace communication + * rpc_mkpipe_dentry - make an rpc_pipefs file for kernel<->userspace + * communication * @parent: dentry of directory to create new "pipe" in * @name: name of pipe * @private: private data to associate with the pipe, for the caller's use @@ -1317,6 +1319,7 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) q.len = strlen(gssd_dummy_clnt_dir[0].name); clnt_dentry = d_hash_and_lookup(gssd_dentry, &q); if (!clnt_dentry) { + __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); pipe_dentry = ERR_PTR(-ENOENT); goto out; } @@ -1509,6 +1512,6 @@ err_notifier: void unregister_rpc_pipefs(void) { rpc_clients_notifier_unregister(); - kmem_cache_destroy(rpc_inode_cachep); unregister_filesystem(&rpc_pipe_fs_type); + kmem_cache_destroy(rpc_inode_cachep); } diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 4a020b688860..5a8e6d46809a 100644 --- 
a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -31,11 +31,9 @@ #include <linux/sunrpc/sched.h> #include <linux/sunrpc/xprtsock.h> -#include "netns.h" +#include <trace/events/sunrpc.h> -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_BIND -#endif +#include "netns.h" #define RPCBIND_SOCK_PATHNAME "/var/run/rpcbind.sock" @@ -216,10 +214,6 @@ static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt, sn->rpcb_is_af_local = is_af_local ? 1 : 0; smp_wmb(); sn->rpcb_users = 1; - dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: " - "%p, rpcb_local_clnt4: %p) for net %x%s\n", - sn->rpcb_local_clnt, sn->rpcb_local_clnt4, - net->ns.inum, (net == &init_net) ? " (init_net)" : ""); } /* @@ -261,19 +255,13 @@ static int rpcb_create_local_unix(struct net *net) */ clnt = rpc_create(&args); if (IS_ERR(clnt)) { - dprintk("RPC: failed to create AF_LOCAL rpcbind " - "client (errno %ld).\n", PTR_ERR(clnt)); result = PTR_ERR(clnt); goto out; } clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4); - if (IS_ERR(clnt4)) { - dprintk("RPC: failed to bind second program to " - "rpcbind v4 client (errno %ld).\n", - PTR_ERR(clnt4)); + if (IS_ERR(clnt4)) clnt4 = NULL; - } rpcb_set_local(net, clnt, clnt4, true); @@ -309,8 +297,6 @@ static int rpcb_create_local_net(struct net *net) clnt = rpc_create(&args); if (IS_ERR(clnt)) { - dprintk("RPC: failed to create local rpcbind " - "client (errno %ld).\n", PTR_ERR(clnt)); result = PTR_ERR(clnt); goto out; } @@ -321,12 +307,8 @@ static int rpcb_create_local_net(struct net *net) * v4 upcalls. */ clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4); - if (IS_ERR(clnt4)) { - dprintk("RPC: failed to bind second program to " - "rpcbind v4 client (errno %ld).\n", - PTR_ERR(clnt4)); + if (IS_ERR(clnt4)) clnt4 = NULL; - } rpcb_set_local(net, clnt, clnt4, false); @@ -362,13 +344,15 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename, const char *hostname, struct sockaddr *srvaddr, size_t salen, int proto, u32 version, - const struct cred *cred) + const struct cred *cred, + const struct rpc_timeout *timeo) { struct rpc_create_args args = { .net = net, .protocol = proto, .address = srvaddr, .addrsize = salen, + .timeout = timeo, .servername = hostname, .nodename = nodename, .program = &rpcb_program, @@ -403,11 +387,8 @@ static int rpcb_register_call(struct sunrpc_net *sn, struct rpc_clnt *clnt, stru msg->rpc_resp = &result; error = rpc_call_sync(clnt, msg, flags); - if (error < 0) { - dprintk("RPC: failed to contact local rpcbind " - "server (errno %d).\n", -error); + if (error < 0) return error; - } if (!result) return -EACCES; @@ -461,9 +442,7 @@ int rpcb_register(struct net *net, u32 prog, u32 vers, int prot, unsigned short struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); bool is_set = false; - dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " - "rpcbind\n", (port ? "" : "un"), - prog, vers, prot, port); + trace_pmap_register(prog, vers, prot, port); msg.rpc_proc = &rpcb_procedures2[RPCBPROC_UNSET]; if (port != 0) { @@ -489,11 +468,6 @@ static int rpcb_register_inet4(struct sunrpc_net *sn, map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL); - dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " - "local rpcbind\n", (port ? 
"" : "un"), - map->r_prog, map->r_vers, - map->r_addr, map->r_netid); - msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; if (port != 0) { msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; @@ -520,11 +494,6 @@ static int rpcb_register_inet6(struct sunrpc_net *sn, map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL); - dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " - "local rpcbind\n", (port ? "" : "un"), - map->r_prog, map->r_vers, - map->r_addr, map->r_netid); - msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; if (port != 0) { msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; @@ -541,9 +510,7 @@ static int rpcb_unregister_all_protofamilies(struct sunrpc_net *sn, { struct rpcbind_args *map = msg->rpc_argp; - dprintk("RPC: unregistering [%u, %u, '%s'] with " - "local rpcbind\n", - map->r_prog, map->r_vers, map->r_netid); + trace_rpcb_unregister(map->r_prog, map->r_vers, map->r_netid); map->r_addr = ""; msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; @@ -615,6 +582,8 @@ int rpcb_v4_register(struct net *net, const u32 program, const u32 version, if (address == NULL) return rpcb_unregister_all_protofamilies(sn, &msg); + trace_rpcb_register(map.r_prog, map.r_vers, map.r_addr, map.r_netid); + switch (address->sa_family) { case AF_INET: return rpcb_register_inet4(sn, address, &msg); @@ -693,18 +662,12 @@ void rpcb_getport_async(struct rpc_task *task) rcu_read_unlock(); xprt = xprt_get(task->tk_xprt); - dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", - task->tk_pid, __func__, - xprt->servername, clnt->cl_prog, clnt->cl_vers, xprt->prot); - /* Put self on the wait queue to ensure we get notified if * some other task is already attempting to bind the port */ rpc_sleep_on_timeout(&xprt->binding, task, NULL, jiffies + xprt->bind_timeout); if (xprt_test_and_set_binding(xprt)) { - dprintk("RPC: %5u %s: waiting for another binder\n", - task->tk_pid, __func__); xprt_put(xprt); return; } @@ -712,8 +675,6 @@ void rpcb_getport_async(struct rpc_task *task) /* Someone else may have bound if we slept */ if (xprt_bound(xprt)) { status = 0; - dprintk("RPC: %5u %s: already bound\n", - task->tk_pid, __func__); goto bailout_nofree; } @@ -732,38 +693,30 @@ void rpcb_getport_async(struct rpc_task *task) break; default: status = -EAFNOSUPPORT; - dprintk("RPC: %5u %s: bad address family\n", - task->tk_pid, __func__); goto bailout_nofree; } if (proc == NULL) { xprt->bind_index = 0; status = -EPFNOSUPPORT; - dprintk("RPC: %5u %s: no more getport versions available\n", - task->tk_pid, __func__); goto bailout_nofree; } - dprintk("RPC: %5u %s: trying rpcbind version %u\n", - task->tk_pid, __func__, bind_version); + trace_rpcb_getport(clnt, task, bind_version); rpcb_clnt = rpcb_create(xprt->xprt_net, clnt->cl_nodename, xprt->servername, sap, salen, xprt->prot, bind_version, - clnt->cl_cred); + clnt->cl_cred, + task->tk_client->cl_timeout); if (IS_ERR(rpcb_clnt)) { status = PTR_ERR(rpcb_clnt); - dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n", - task->tk_pid, __func__, PTR_ERR(rpcb_clnt)); goto bailout_nofree; } - map = kzalloc(sizeof(struct rpcbind_args), GFP_NOFS); + map = kzalloc(sizeof(struct rpcbind_args), rpc_task_gfp_mask()); if (!map) { status = -ENOMEM; - dprintk("RPC: %5u %s: no memory available\n", - task->tk_pid, __func__); goto bailout_release_client; } map->r_prog = clnt->cl_prog; @@ -777,11 +730,9 @@ void rpcb_getport_async(struct rpc_task *task) case RPCBVERS_4: case RPCBVERS_3: map->r_netid = xprt->address_strings[RPC_DISPLAY_NETID]; - map->r_addr = rpc_sockaddr2uaddr(sap, GFP_NOFS); + map->r_addr = 
rpc_sockaddr2uaddr(sap, rpc_task_gfp_mask()); if (!map->r_addr) { status = -ENOMEM; - dprintk("RPC: %5u %s: no memory available\n", - task->tk_pid, __func__); goto bailout_free_args; } map->r_owner = ""; @@ -795,12 +746,6 @@ void rpcb_getport_async(struct rpc_task *task) child = rpcb_call_async(rpcb_clnt, map, proc); rpc_release_client(rpcb_clnt); - if (IS_ERR(child)) { - /* rpcb_map_release() has freed the arguments */ - dprintk("RPC: %5u %s: rpc_run_task failed\n", - task->tk_pid, __func__); - return; - } xprt->stat.bind_count++; rpc_put_task(child); @@ -824,34 +769,33 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) { struct rpcbind_args *map = data; struct rpc_xprt *xprt = map->r_xprt; - int status = child->tk_status; + + map->r_status = child->tk_status; /* Garbage reply: retry with a lesser rpcbind version */ - if (status == -EIO) - status = -EPROTONOSUPPORT; + if (map->r_status == -EIO) + map->r_status = -EPROTONOSUPPORT; /* rpcbind server doesn't support this rpcbind protocol version */ - if (status == -EPROTONOSUPPORT) + if (map->r_status == -EPROTONOSUPPORT) xprt->bind_index++; - if (status < 0) { + if (map->r_status < 0) { /* rpcbind server not available on remote host? */ - xprt->ops->set_port(xprt, 0); + map->r_port = 0; + } else if (map->r_port == 0) { /* Requested RPC service wasn't registered on remote host */ - xprt->ops->set_port(xprt, 0); - status = -EACCES; + map->r_status = -EACCES; } else { /* Succeeded */ - xprt->ops->set_port(xprt, map->r_port); - xprt_set_bound(xprt); - status = 0; + map->r_status = 0; } - dprintk("RPC: %5u rpcb_getport_done(status %d, port %u)\n", - child->tk_pid, status, map->r_port); - - map->r_status = status; + trace_rpcb_setport(child, map->r_status, map->r_port); + xprt->ops->set_port(xprt, map->r_port); + if (map->r_port) + xprt_set_bound(xprt); } /* @@ -864,11 +808,6 @@ static void rpcb_enc_mapping(struct rpc_rqst *req, struct xdr_stream *xdr, const struct rpcbind_args *rpcb = data; __be32 *p; - dprintk("RPC: %5u encoding PMAP_%s call (%u, %u, %d, %u)\n", - req->rq_task->tk_pid, - req->rq_task->tk_msg.rpc_proc->p_name, - rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port); - p = xdr_reserve_space(xdr, RPCB_mappingargs_sz << 2); *p++ = cpu_to_be32(rpcb->r_prog); *p++ = cpu_to_be32(rpcb->r_vers); @@ -890,8 +829,6 @@ static int rpcb_dec_getport(struct rpc_rqst *req, struct xdr_stream *xdr, return -EIO; port = be32_to_cpup(p); - dprintk("RPC: %5u PMAP_%s result: %lu\n", req->rq_task->tk_pid, - req->rq_task->tk_msg.rpc_proc->p_name, port); if (unlikely(port > USHRT_MAX)) return -EIO; @@ -912,11 +849,6 @@ static int rpcb_dec_set(struct rpc_rqst *req, struct xdr_stream *xdr, *boolp = 0; if (*p != xdr_zero) *boolp = 1; - - dprintk("RPC: %5u RPCB_%s call %s\n", - req->rq_task->tk_pid, - req->rq_task->tk_msg.rpc_proc->p_name, - (*boolp ? 
"succeeded" : "failed")); return 0; } @@ -941,12 +873,6 @@ static void rpcb_enc_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, const struct rpcbind_args *rpcb = data; __be32 *p; - dprintk("RPC: %5u encoding RPCB_%s call (%u, %u, '%s', '%s')\n", - req->rq_task->tk_pid, - req->rq_task->tk_msg.rpc_proc->p_name, - rpcb->r_prog, rpcb->r_vers, - rpcb->r_netid, rpcb->r_addr); - p = xdr_reserve_space(xdr, (RPCB_program_sz + RPCB_version_sz) << 2); *p++ = cpu_to_be32(rpcb->r_prog); *p = cpu_to_be32(rpcb->r_vers); @@ -976,11 +902,8 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, * If the returned universal address is a null string, * the requested RPC service was not registered. */ - if (len == 0) { - dprintk("RPC: %5u RPCB reply: program not registered\n", - req->rq_task->tk_pid); + if (len == 0) return 0; - } if (unlikely(len > RPCBIND_MAXUADDRLEN)) goto out_fail; @@ -988,8 +911,6 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, p = xdr_inline_decode(xdr, len); if (unlikely(p == NULL)) goto out_fail; - dprintk("RPC: %5u RPCB_%s reply: %s\n", req->rq_task->tk_pid, - req->rq_task->tk_msg.rpc_proc->p_name, (char *)p); if (rpc_uaddr2sockaddr(req->rq_xprt->xprt_net, (char *)p, len, sap, sizeof(address)) == 0) @@ -999,9 +920,6 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, return 0; out_fail: - dprintk("RPC: %5u malformed RPCB_%s reply\n", - req->rq_task->tk_pid, - req->rq_task->tk_msg.rpc_proc->p_name); return -EIO; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 55e900255b0c..be587a308e05 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -27,10 +27,6 @@ #include "sunrpc.h" -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -#define RPCDBG_FACILITY RPCDBG_SCHED -#endif - #define CREATE_TRACE_POINTS #include <trace/events/sunrpc.h> @@ -61,6 +57,21 @@ struct workqueue_struct *rpciod_workqueue __read_mostly; struct workqueue_struct *xprtiod_workqueue __read_mostly; EXPORT_SYMBOL_GPL(xprtiod_workqueue); +gfp_t rpc_task_gfp_mask(void) +{ + if (current->flags & PF_WQ_WORKER) + return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + return GFP_KERNEL; +} +EXPORT_SYMBOL_GPL(rpc_task_gfp_mask); + +bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status) +{ + if (cmpxchg(&task->tk_rpc_status, 0, rpc_status) == 0) + return true; + return false; +} + unsigned long rpc_task_timeout(const struct rpc_task *task) { @@ -85,7 +96,6 @@ __rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task) { if (list_empty(&task->u.tk_wait.timer_list)) return; - dprintk("RPC: %5u disabling timer\n", task->tk_pid); task->tk_timeout = 0; list_del(&task->u.tk_wait.timer_list); if (list_empty(&queue->timer_list.list)) @@ -111,9 +121,6 @@ static void __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task, unsigned long timeout) { - dprintk("RPC: %5u setting alarm for %u ms\n", - task->tk_pid, jiffies_to_msecs(timeout - jiffies)); - task->tk_timeout = timeout; if (list_empty(&queue->timer_list.list) || time_before(timeout, queue->timer_list.expires)) rpc_set_queue_timer(queue, timeout); @@ -194,25 +201,14 @@ static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, /* * Add new request to wait queue. - * - * Swapper tasks always get inserted at the head of the queue. - * This should avoid many nasty memory deadlocks and hopefully - * improve overall performance. - * Everyone else gets appended to the queue to ensure proper FIFO behavior. 
*/ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task, unsigned char queue_priority) { - WARN_ON_ONCE(RPC_IS_QUEUED(task)); - if (RPC_IS_QUEUED(task)) - return; - INIT_LIST_HEAD(&task->u.tk_wait.timer_list); if (RPC_IS_PRIORITY(queue)) __rpc_add_wait_queue_priority(queue, task, queue_priority); - else if (RPC_IS_SWAPPER(task)) - list_add(&task->u.tk_wait.list, &queue->tasks[0]); else list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); task->tk_waitqueue = queue; @@ -220,9 +216,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, /* barrier matches the read in rpc_wake_up_task_queue_locked() */ smp_wmb(); rpc_set_queued(task); - - dprintk("RPC: %5u added to queue %p \"%s\"\n", - task->tk_pid, queue, rpc_qname(queue)); } /* @@ -245,8 +238,6 @@ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_tas else list_del(&task->u.tk_wait.list); queue->qlen--; - dprintk("RPC: %5u removed from queue %p \"%s\"\n", - task->tk_pid, queue, rpc_qname(queue)); } static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, unsigned char nr_queues) @@ -285,7 +276,7 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; @@ -294,9 +285,17 @@ static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) static void rpc_task_set_debuginfo(struct rpc_task *task) { - static atomic_t rpc_pid; + struct rpc_clnt *clnt = task->tk_client; + + /* Might be a task carrying a reverse-direction operation */ + if (!clnt) { + static atomic_t rpc_pid; - task->tk_pid = atomic_inc_return(&rpc_pid); + task->tk_pid = atomic_inc_return(&rpc_pid); + return; + } + + task->tk_pid = atomic_inc_return(&clnt->cl_pid); } #else static inline void rpc_task_set_debuginfo(struct rpc_task *task) @@ -341,14 +340,12 @@ static int rpc_complete_task(struct rpc_task *task) * to enforce taking of the wq->lock and hence avoid races with * rpc_complete_task(). */ -int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action) +int rpc_wait_for_completion_task(struct rpc_task *task) { - if (action == NULL) - action = rpc_wait_bit_killable; return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE, - action, TASK_KILLABLE); + rpc_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); } -EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); +EXPORT_SYMBOL_GPL(rpc_wait_for_completion_task); /* * Make an RPC task runnable. @@ -382,25 +379,32 @@ static void rpc_make_runnable(struct workqueue_struct *wq, * NB: An RPC task will only receive interrupt-driven events as long * as it's on a wait queue. 
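/* Illustrative sketch, not part of the patch: the sleep/wake pairing that
 * __rpc_add_wait_queue() and its callers implement. rpc_sleep_on() and
 * rpc_wake_up_queued_task() are the real entry points; demo_queue and the
 * demo_* functions are assumptions.
 */
static struct rpc_wait_queue demo_queue; /* rpc_init_wait_queue(&demo_queue, "demo") at setup */

static void demo_resume(struct rpc_task *task);

static void demo_wait_for_event(struct rpc_task *task)
{
	task->tk_action = demo_resume;		/* where the task continues */
	rpc_sleep_on(&demo_queue, task, NULL);	/* queued: eligible for wake-ups */
}

static void demo_event_arrived(struct rpc_task *task)
{
	rpc_wake_up_queued_task(&demo_queue, task);
}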
*/ -static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, +static void __rpc_do_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, unsigned char queue_priority) { - dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n", - task->tk_pid, rpc_qname(q), jiffies); - trace_rpc_task_sleep(task, q); __rpc_add_wait_queue(q, task, queue_priority); +} +static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, + struct rpc_task *task, + unsigned char queue_priority) +{ + if (WARN_ON_ONCE(RPC_IS_QUEUED(task))) + return; + __rpc_do_sleep_on_priority(q, task, queue_priority); } static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q, struct rpc_task *task, unsigned long timeout, unsigned char queue_priority) { + if (WARN_ON_ONCE(RPC_IS_QUEUED(task))) + return; if (time_is_after_jiffies(timeout)) { - __rpc_sleep_on_priority(q, task, queue_priority); + __rpc_do_sleep_on_priority(q, task, queue_priority); __rpc_add_timer(q, task, timeout); } else task->tk_status = -ETIMEDOUT; @@ -503,9 +507,6 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, struct rpc_wait_queue *queue, struct rpc_task *task) { - dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", - task->tk_pid, jiffies); - /* Has the task been executed yet? If not, we cannot wake it up! */ if (!RPC_IS_ACTIVATED(task)) { printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); @@ -517,8 +518,6 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, __rpc_remove_wait_queue(queue, task); rpc_make_runnable(wq, task); - - dprintk("RPC: __rpc_wake_up_task done\n"); } /* @@ -607,10 +606,20 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q struct rpc_task *task; /* + * Service the privileged queue. + */ + q = &queue->tasks[RPC_NR_PRIORITY - 1]; + if (queue->maxpriority > RPC_PRIORITY_PRIVILEGED && !list_empty(q)) { + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); + goto out; + } + + /* * Service a batch of tasks from a single owner. */ q = &queue->tasks[queue->priority]; - if (!list_empty(q) && --queue->nr) { + if (!list_empty(q) && queue->nr) { + queue->nr--; task = list_first_entry(q, struct rpc_task, u.tk_wait.list); goto out; } @@ -656,8 +665,6 @@ struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, { struct rpc_task *task = NULL; - dprintk("RPC: wake_up_first(%p \"%s\")\n", - queue, rpc_qname(queue)); spin_lock(&queue->lock); task = __rpc_find_next_queued(queue); if (task != NULL) @@ -693,6 +700,23 @@ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue) EXPORT_SYMBOL_GPL(rpc_wake_up_next); /** + * rpc_wake_up_locked - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * + */ +static void rpc_wake_up_locked(struct rpc_wait_queue *queue) +{ + struct rpc_task *task; + + for (;;) { + task = __rpc_find_next_queued(queue); + if (task == NULL) + break; + rpc_wake_up_task_queue_locked(queue, task); + } +} + +/** * rpc_wake_up - wake up all rpc_tasks * @queue: rpc_wait_queue on which the tasks are sleeping * @@ -700,25 +724,28 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_next); */ void rpc_wake_up(struct rpc_wait_queue *queue) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; + rpc_wake_up_locked(queue); + spin_unlock(&queue->lock); +} +EXPORT_SYMBOL_GPL(rpc_wake_up); + +/** + * rpc_wake_up_status_locked - wake up all rpc_tasks and set their status value. 
+ * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + */ +static void rpc_wake_up_status_locked(struct rpc_wait_queue *queue, int status) +{ + struct rpc_task *task; + for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - rpc_wake_up_task_queue_locked(queue, task); - } - if (head == &queue->tasks[0]) + task = __rpc_find_next_queued(queue); + if (task == NULL) break; - head--; + rpc_wake_up_task_queue_set_status_locked(queue, task, status); } - spin_unlock(&queue->lock); } -EXPORT_SYMBOL_GPL(rpc_wake_up); /** * rpc_wake_up_status - wake up all rpc_tasks and set their status value. @@ -729,23 +756,8 @@ EXPORT_SYMBOL_GPL(rpc_wake_up); */ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; - for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - task->tk_status = status; - rpc_wake_up_task_queue_locked(queue, task); - } - if (head == &queue->tasks[0]) - break; - head--; - } + rpc_wake_up_status_locked(queue, status); spin_unlock(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_status); @@ -763,7 +775,7 @@ static void __rpc_queue_timer_fn(struct work_struct *work) list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) { timeo = task->tk_timeout; if (time_after_eq(now, timeo)) { - dprintk("RPC: %5u timeout\n", task->tk_pid); + trace_rpc_task_timeout(task, task->tk_action); task->tk_status = -ETIMEDOUT; rpc_wake_up_task_queue_locked(queue, task); continue; @@ -831,6 +843,7 @@ void rpc_exit_task(struct rpc_task *task) else if (task->tk_client) rpc_count_iostats(task, task->tk_client->cl_metrics); if (task->tk_ops->rpc_call_done != NULL) { + trace_rpc_task_call_done(task, task->tk_ops->rpc_call_done); task->tk_ops->rpc_call_done(task, task->tk_calldata); if (task->tk_action != NULL) { /* Always release the RPC slot and buffer memory */ @@ -847,12 +860,25 @@ void rpc_signal_task(struct rpc_task *task) if (!RPC_IS_ACTIVATED(task)) return; + if (!rpc_task_set_rpc_status(task, -ERESTARTSYS)) + return; trace_rpc_task_signalled(task, task->tk_action); set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); smp_mb__after_atomic(); queue = READ_ONCE(task->tk_waitqueue); if (queue) - rpc_wake_up_queued_task_set_status(queue, task, -ERESTARTSYS); + rpc_wake_up_queued_task(queue, task); +} + +void rpc_task_try_cancel(struct rpc_task *task, int error) +{ + struct rpc_wait_queue *queue; + + if (!rpc_task_set_rpc_status(task, error)) + return; + queue = READ_ONCE(task->tk_waitqueue); + if (queue) + rpc_wake_up_queued_task(queue, task); } void rpc_exit(struct rpc_task *task, int status) @@ -869,6 +895,15 @@ void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata) ops->rpc_release(calldata); } +static bool xprt_needs_memalloc(struct rpc_xprt *xprt, struct rpc_task *tk) +{ + if (!xprt) + return false; + if (!atomic_read(&xprt->swapper)) + return false; + return test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == tk; +} + /* * This is the RPC `scheduler' (or rather, the finite state machine). 
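/* Illustrative sketch, not part of the patch: how tk_action drives the
 * state machine. __rpc_execute() keeps invoking tk_action until the task
 * either queues itself or leaves tk_action NULL; the call_demo* states
 * are assumptions.
 */
static void call_demo_done(struct rpc_task *task)
{
	task->tk_action = NULL;			/* no next state: loop ends */
	task->tk_status = 0;
}

static void call_demo(struct rpc_task *task)
{
	task->tk_action = call_demo_done;	/* next step on the next pass */
}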
*/ @@ -877,9 +912,7 @@ static void __rpc_execute(struct rpc_task *task) struct rpc_wait_queue *queue; int task_is_async = RPC_IS_ASYNC(task); int status = 0; - - dprintk("RPC: %5u __rpc_execute flags=0x%x\n", - task->tk_pid, task->tk_flags); + unsigned long pflags = current->flags; WARN_ON_ONCE(RPC_IS_QUEUED(task)); if (RPC_IS_QUEUED(task)) @@ -892,31 +925,35 @@ static void __rpc_execute(struct rpc_task *task) * Perform the next FSM step or a pending callback. * * tk_action may be NULL if the task has been killed. - * In particular, note that rpc_killall_tasks may - * do this at any time, so beware when dereferencing. */ do_action = task->tk_action; + /* Tasks with an RPC error status should exit */ + if (do_action != rpc_exit_task && + (status = READ_ONCE(task->tk_rpc_status)) != 0) { + task->tk_status = status; + if (do_action != NULL) + do_action = rpc_exit_task; + } + /* Callbacks override all actions */ if (task->tk_callback) { do_action = task->tk_callback; task->tk_callback = NULL; } if (!do_action) break; + if (RPC_IS_SWAPPER(task) || + xprt_needs_memalloc(task->tk_xprt, task)) + current->flags |= PF_MEMALLOC; + trace_rpc_task_run_action(task, do_action); do_action(task); /* * Lockless check for whether task is sleeping or not. */ - if (!RPC_IS_QUEUED(task)) + if (!RPC_IS_QUEUED(task)) { + cond_resched(); continue; - - /* - * Signalled tasks should exit rather than sleep. - */ - if (RPC_SIGNALLED(task)) { - task->tk_rpc_status = -ERESTARTSYS; - rpc_exit(task, -ERESTARTSYS); } /* @@ -934,16 +971,22 @@ static void __rpc_execute(struct rpc_task *task) spin_unlock(&queue->lock); continue; } + /* Wake up any task that has an exit status */ + if (READ_ONCE(task->tk_rpc_status) != 0) { + rpc_wake_up_task_queue_locked(queue, task); + spin_unlock(&queue->lock); + continue; + } rpc_clear_running(task); spin_unlock(&queue->lock); if (task_is_async) - return; + goto out; /* sync task: sleep here */ - dprintk("RPC: %5u sync task going to sleep\n", task->tk_pid); + trace_rpc_task_sync_sleep(task, task->tk_action); status = out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_QUEUED, rpc_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE); if (status < 0) { /* * When a sync task receives a signal, it exits with @@ -951,18 +994,15 @@ static void __rpc_execute(struct rpc_task *task) * clean up after sleeping on some queue, we don't * break the loop here, but go around once more. 
*/ - trace_rpc_task_signalled(task, task->tk_action); - set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); - task->tk_rpc_status = -ERESTARTSYS; - rpc_exit(task, -ERESTARTSYS); + rpc_signal_task(task); } - dprintk("RPC: %5u sync task resuming\n", task->tk_pid); + trace_rpc_task_sync_wake(task, task->tk_action); } - dprintk("RPC: %5u return %d, status %d\n", task->tk_pid, status, - task->tk_status); /* Release all resources associated with the task */ rpc_release_task(task); +out: + current_restore_flags(pflags, PF_MEMALLOC); } /* @@ -980,8 +1020,11 @@ void rpc_execute(struct rpc_task *task) rpc_set_active(task); rpc_make_runnable(rpciod_workqueue, task); - if (!is_async) + if (!is_async) { + unsigned int pflags = memalloc_nofs_save(); __rpc_execute(task); + memalloc_nofs_restore(pflags); + } } static void rpc_async_schedule(struct work_struct *work) @@ -1014,23 +1057,21 @@ int rpc_malloc(struct rpc_task *task) struct rpc_rqst *rqst = task->tk_rqstp; size_t size = rqst->rq_callsize + rqst->rq_rcvsize; struct rpc_buffer *buf; - gfp_t gfp = GFP_NOFS; - - if (RPC_IS_SWAPPER(task)) - gfp = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; + gfp_t gfp = rpc_task_gfp_mask(); size += sizeof(struct rpc_buffer); - if (size <= RPC_BUFFER_MAXSIZE) - buf = mempool_alloc(rpc_buffer_mempool, gfp); - else + if (size <= RPC_BUFFER_MAXSIZE) { + buf = kmem_cache_alloc(rpc_buffer_slabp, gfp); + /* Reach for the mempool if dynamic allocation fails */ + if (!buf && RPC_IS_ASYNC(task)) + buf = mempool_alloc(rpc_buffer_mempool, GFP_NOWAIT); + } else buf = kmalloc(size, gfp); if (!buf) return -ENOMEM; buf->len = size; - dprintk("RPC: %5u allocated buffer of size %zu at %p\n", - task->tk_pid, size, buf); rqst->rq_buffer = buf->data; rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; @@ -1051,9 +1092,6 @@ void rpc_free(struct rpc_task *task) buf = container_of(buffer, struct rpc_buffer, data); size = buf->len; - dprintk("RPC: freeing buffer of size %zu at %p\n", - size, buf); - if (size <= RPC_BUFFER_MAXSIZE) mempool_free(buf, rpc_buffer_mempool); else @@ -1088,15 +1126,16 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta task->tk_action = rpc_prepare_task; rpc_init_task_statistics(task); - - dprintk("RPC: new task initialized, procpid %u\n", - task_pid_nr(current)); } -static struct rpc_task * -rpc_alloc_task(void) +static struct rpc_task *rpc_alloc_task(void) { - return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); + struct rpc_task *task; + + task = kmem_cache_alloc(rpc_task_slabp, rpc_task_gfp_mask()); + if (task) + return task; + return mempool_alloc(rpc_task_mempool, GFP_NOWAIT); } /* @@ -1109,12 +1148,16 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data) if (task == NULL) { task = rpc_alloc_task(); + if (task == NULL) { + rpc_release_calldata(setup_data->callback_ops, + setup_data->callback_data); + return ERR_PTR(-ENOMEM); + } flags = RPC_TASK_DYNAMIC; } rpc_init_task(task, setup_data); task->tk_flags |= flags; - dprintk("RPC: allocated task %p\n", task); return task; } @@ -1144,10 +1187,8 @@ static void rpc_free_task(struct rpc_task *task) put_rpccred(task->tk_op_cred); rpc_release_calldata(task->tk_ops, task->tk_calldata); - if (tk_flags & RPC_TASK_DYNAMIC) { - dprintk("RPC: %5u freeing task\n", task->tk_pid); + if (tk_flags & RPC_TASK_DYNAMIC) mempool_free(task, rpc_task_mempool); - } } static void rpc_async_release(struct work_struct *work) @@ -1162,7 +1203,8 @@ static void rpc_release_resources_task(struct rpc_task 
*task) { xprt_release(task); if (task->tk_msg.rpc_cred) { - put_cred(task->tk_msg.rpc_cred); + if (!(task->tk_flags & RPC_TASK_CRED_NOREF)) + put_cred(task->tk_msg.rpc_cred); task->tk_msg.rpc_cred = NULL; } rpc_task_release_client(task); @@ -1200,8 +1242,6 @@ EXPORT_SYMBOL_GPL(rpc_put_task_async); static void rpc_release_task(struct rpc_task *task) { - dprintk("RPC: %5u release task\n", task->tk_pid); - WARN_ON_ONCE(RPC_IS_QUEUED(task)); rpc_release_resources_task(task); @@ -1242,13 +1282,11 @@ static int rpciod_start(void) /* * Create the rpciod thread and wait for it to start. */ - dprintk("RPC: creating workqueue rpciod\n"); wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!wq) goto out_failed; rpciod_workqueue = wq; - /* Note: highpri because network receive is latency sensitive */ - wq = alloc_workqueue("xprtiod", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_HIGHPRI, 0); + wq = alloc_workqueue("xprtiod", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!wq) goto free_rpciod; xprtiod_workqueue = wq; @@ -1267,7 +1305,6 @@ static void rpciod_stop(void) if (rpciod_workqueue == NULL) return; - dprintk("RPC: destroying workqueue rpciod\n"); wq = rpciod_workqueue; rpciod_workqueue = NULL; diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 1a864f1ed119..71ba4cf513bc 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -14,9 +14,25 @@ #include <linux/types.h> #include <linux/pagemap.h> #include <linux/udp.h> +#include <linux/sunrpc/msg_prot.h> +#include <linux/sunrpc/sched.h> #include <linux/sunrpc/xdr.h> #include <linux/export.h> +#include "socklib.h" + +/* + * Helper structure for copying from an sk_buff. + */ +struct xdr_skb_reader { + struct sk_buff *skb; + unsigned int offset; + size_t count; + __wsum csum; +}; + +typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to, + size_t len); /** * xdr_skb_read_bits - copy some data bits from skb to internal buffer @@ -55,7 +71,7 @@ static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, if (len > desc->count) len = desc->count; pos = desc->offset; - csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0); + csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len); desc->csum = csum_block_add(desc->csum, csum2, pos); desc->count -= len; desc->offset += len; @@ -186,3 +202,123 @@ no_checksum: return 0; } EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr); + +static inline int xprt_sendmsg(struct socket *sock, struct msghdr *msg, + size_t seek) +{ + if (seek) + iov_iter_advance(&msg->msg_iter, seek); + return sock_sendmsg(sock, msg); +} + +static int xprt_send_kvec(struct socket *sock, struct msghdr *msg, + struct kvec *vec, size_t seek) +{ + iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len); + return xprt_sendmsg(sock, msg, seek); +} + +static int xprt_send_pagedata(struct socket *sock, struct msghdr *msg, + struct xdr_buf *xdr, size_t base) +{ + iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec, xdr_buf_pagecount(xdr), + xdr->page_len + xdr->page_base); + return xprt_sendmsg(sock, msg, base + xdr->page_base); +} + +/* Common case: + * - stream transport + * - sending from byte 0 of the message + * - the message is wholly contained in @xdr's head iovec + */ +static int xprt_send_rm_and_kvec(struct socket *sock, struct msghdr *msg, + rpc_fraghdr marker, struct kvec *vec, + size_t base) +{ + struct kvec iov[2] = { + [0] = { + .iov_base = &marker, + .iov_len = sizeof(marker) + }, + [1] = *vec, + }; + size_t len = iov[0].iov_len + iov[1].iov_len; + + iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, 
len); + return xprt_sendmsg(sock, msg, base); +} + +/** + * xprt_sock_sendmsg - write an xdr_buf directly to a socket + * @sock: open socket to send on + * @msg: socket message metadata + * @xdr: xdr_buf containing this request + * @base: starting position in the buffer + * @marker: stream record marker field + * @sent_p: return the total number of bytes successfully queued for sending + * + * Return values: + * On success, returns zero and fills in @sent_p. + * %-ENOTSOCK if @sock is not a struct socket. + */ +int xprt_sock_sendmsg(struct socket *sock, struct msghdr *msg, + struct xdr_buf *xdr, unsigned int base, + rpc_fraghdr marker, unsigned int *sent_p) +{ + unsigned int rmsize = marker ? sizeof(marker) : 0; + unsigned int remainder = rmsize + xdr->len - base; + unsigned int want; + int err = 0; + + *sent_p = 0; + + if (unlikely(!sock)) + return -ENOTSOCK; + + msg->msg_flags |= MSG_MORE; + want = xdr->head[0].iov_len + rmsize; + if (base < want) { + unsigned int len = want - base; + + remainder -= len; + if (remainder == 0) + msg->msg_flags &= ~MSG_MORE; + if (rmsize) + err = xprt_send_rm_and_kvec(sock, msg, marker, + &xdr->head[0], base); + else + err = xprt_send_kvec(sock, msg, &xdr->head[0], base); + if (remainder == 0 || err != len) + goto out; + *sent_p += err; + base = 0; + } else { + base -= want; + } + + if (base < xdr->page_len) { + unsigned int len = xdr->page_len - base; + + remainder -= len; + if (remainder == 0) + msg->msg_flags &= ~MSG_MORE; + err = xprt_send_pagedata(sock, msg, xdr, base); + if (remainder == 0 || err != len) + goto out; + *sent_p += err; + base = 0; + } else { + base -= xdr->page_len; + } + + if (base >= xdr->tail[0].iov_len) + return 0; + msg->msg_flags &= ~MSG_MORE; + err = xprt_send_kvec(sock, msg, &xdr->tail[0], base); +out: + if (err > 0) { + *sent_p += err; + err = 0; + } + return err; +} diff --git a/net/sunrpc/socklib.h b/net/sunrpc/socklib.h new file mode 100644 index 000000000000..c48114ad6f00 --- /dev/null +++ b/net/sunrpc/socklib.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + * Copyright (C) 2020, Oracle. + */ + +#ifndef _NET_SUNRPC_SOCKLIB_H_ +#define _NET_SUNRPC_SOCKLIB_H_ + +int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb); +int xprt_sock_sendmsg(struct socket *sock, struct msghdr *msg, + struct xdr_buf *xdr, unsigned int base, + rpc_fraghdr marker, unsigned int *sent_p); + +#endif /* _NET_SUNRPC_SOCKLIB_H_ */ diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index c964b48eaaba..52908f9e6eab 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -66,7 +66,7 @@ static int rpc_proc_show(struct seq_file *seq, void *v) { static int rpc_proc_open(struct inode *inode, struct file *file) { - return single_open(file, rpc_proc_show, PDE_DATA(inode)); + return single_open(file, rpc_proc_show, pde_data(inode)); } static const struct proc_ops rpc_proc_ops = { diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index c9bacb3c930f..d4a362c9e4b3 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -1,22 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /****************************************************************************** (c) 2008 NetApp. All Rights Reserved. -NetApp provides this source code under the GPL v2 License. -The GPL v2 license is available at -http://opensource.org/licenses/gpl-license.php. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ @@ -50,10 +36,7 @@ static inline int sock_is_loopback(struct sock *sk) return loopback; } -int svc_send_common(struct socket *sock, struct xdr_buf *xdr, - struct page *headpage, unsigned long headoffset, - struct page *tailpage, unsigned long tailoffset); - int rpc_clients_notifier_register(void); void rpc_clients_notifier_unregister(void); +void auth_domain_cleanup(void); #endif /* _NET_SUNRPC_SUNRPC_H */ diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index f9edaa9174a4..691c0000e9ea 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -23,6 +23,8 @@ #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/xprtsock.h> +#include "sunrpc.h" +#include "sysfs.h" #include "netns.h" unsigned int sunrpc_net_id; @@ -102,6 +104,10 @@ init_sunrpc(void) if (err) goto out4; + err = rpc_sysfs_init(); + if (err) + goto out5; + sunrpc_debugfs_init(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_register_sysctl(); @@ -110,6 +116,8 @@ init_sunrpc(void) init_socket_xprt(); /* clnt sock transport */ return 0; +out5: + unregister_rpc_pipefs(); out4: unregister_pernet_subsys(&sunrpc_net_ops); out3: @@ -123,7 +131,10 @@ out: static void __exit cleanup_sunrpc(void) { + rpc_sysfs_exit(); rpc_cleanup_clids(); + xprt_cleanup_ids(); + xprt_multipath_cleanup_ids(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); @@ -131,6 +142,7 @@ cleanup_sunrpc(void) unregister_rpc_pipefs(); rpc_destroy_mempool(); unregister_pernet_subsys(&sunrpc_net_ops); + auth_domain_cleanup(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_unregister_sysctl(); #endif diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 187dd4e73d64..149171774bc6 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -31,22 +31,43 @@ #include <trace/events/sunrpc.h> +#include "fail.h" + #define RPCDBG_FACILITY RPCDBG_SVCDSP static void svc_unregister(const struct svc_serv *serv, struct net *net); -#define svc_serv_is_pooled(serv) ((serv)->sv_ops->svo_function) - #define SVC_POOL_DEFAULT SVC_POOL_GLOBAL /* + * Mode for mapping cpus to pools. + */ +enum { + SVC_POOL_AUTO = -1, /* choose one of the others */ + SVC_POOL_GLOBAL, /* no mapping, just a single global pool + * (legacy & UP mode) */ + SVC_POOL_PERCPU, /* one pool per cpu */ + SVC_POOL_PERNODE /* one pool per numa node */ +}; + +/* * Structure for mapping cpus to pools and vice versa. * Setup once during sunrpc initialisation. 
*/ -struct svc_pool_map svc_pool_map = { + +struct svc_pool_map { + int count; /* How many svc_servs use us */ + int mode; /* Note: int not enum to avoid + * warnings about "enumeration value + * not handled in switch" */ + unsigned int npools; + unsigned int *pool_to; /* maps pool id to cpu or node */ + unsigned int *to_pool; /* maps cpu or node to pool id */ +}; + +static struct svc_pool_map svc_pool_map = { .mode = SVC_POOL_DEFAULT }; -EXPORT_SYMBOL_GPL(svc_pool_map); static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ @@ -88,15 +109,15 @@ param_get_pool_mode(char *buf, const struct kernel_param *kp) switch (*ip) { case SVC_POOL_AUTO: - return strlcpy(buf, "auto", 20); + return strlcpy(buf, "auto\n", 20); case SVC_POOL_GLOBAL: - return strlcpy(buf, "global", 20); + return strlcpy(buf, "global\n", 20); case SVC_POOL_PERCPU: - return strlcpy(buf, "percpu", 20); + return strlcpy(buf, "percpu\n", 20); case SVC_POOL_PERNODE: - return strlcpy(buf, "pernode", 20); + return strlcpy(buf, "pernode\n", 20); default: - return sprintf(buf, "%d", *ip); + return sprintf(buf, "%d\n", *ip); } } @@ -217,10 +238,12 @@ svc_pool_map_init_pernode(struct svc_pool_map *m) /* * Add a reference to the global map of cpus to pools (and - * vice versa). Initialise the map if we're the first user. - * Returns the number of pools. + * vice versa) if pools are in use. + * Initialise the map if we're the first user. + * Returns the number of pools. If this is '1', no reference + * was taken. */ -unsigned int +static unsigned int svc_pool_map_get(void) { struct svc_pool_map *m = &svc_pool_map; @@ -230,6 +253,7 @@ svc_pool_map_get(void) if (m->count++) { mutex_unlock(&svc_pool_map_mutex); + WARN_ON_ONCE(m->npools <= 1); return m->npools; } @@ -245,30 +269,36 @@ svc_pool_map_get(void) break; } - if (npools < 0) { + if (npools <= 0) { /* default, or memory allocation failure */ npools = 1; m->mode = SVC_POOL_GLOBAL; } m->npools = npools; + if (npools == 1) + /* service is unpooled, so doesn't hold a reference */ + m->count--; + mutex_unlock(&svc_pool_map_mutex); - return m->npools; + return npools; } -EXPORT_SYMBOL_GPL(svc_pool_map_get); /* - * Drop a reference to the global map of cpus to pools. + * Drop a reference to the global map of cpus to pools, if + * pools were in use, i.e. if npools > 1. * When the last reference is dropped, the map data is * freed; this allows the sysadmin to change the pool * mode using the pool_mode module option without * rebooting or re-loading sunrpc.ko. */ -void -svc_pool_map_put(void) +static void +svc_pool_map_put(int npools) { struct svc_pool_map *m = &svc_pool_map; + if (npools <= 1) + return; mutex_lock(&svc_pool_map_mutex); if (!--m->count) { @@ -281,7 +311,6 @@ svc_pool_map_put(void) mutex_unlock(&svc_pool_map_mutex); } -EXPORT_SYMBOL_GPL(svc_pool_map_put); static int svc_pool_map_get_node(unsigned int pidx) { @@ -327,32 +356,35 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) } } -/* - * Use the mapping mode to choose a pool for a given CPU. - * Used when enqueueing an incoming RPC. Always returns - * a non-NULL pool pointer. +/** + * svc_pool_for_cpu - Select pool to run a thread on this cpu + * @serv: An RPC service + * + * Use the active CPU and the svc_pool_map's mode setting to + * select the svc thread pool to use. Once initialized, the + * svc_pool_map does not change. 
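A gloss on the two arrays in the now-private struct svc_pool_map: to_pool[] maps a CPU (or NUMA node) to a pool index, and pool_to[] is its inverse. A hedged sketch of how SVC_POOL_PERCPU mode would populate them, paraphrasing svc_pool_map_init_percpu(), which lies outside this hunk:

    unsigned int cpu, pidx = 0;

    for_each_online_cpu(cpu) {
            m->to_pool[cpu] = pidx;         /* cpu -> pool index */
            m->pool_to[pidx] = cpu;         /* pool index -> cpu */
            pidx++;
    }
    m->npools = pidx;

With these tables in place, svc_pool_for_cpu() below reduces to a single array lookup per request.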
+ * + * Return value: + * A pointer to an svc_pool */ -struct svc_pool * -svc_pool_for_cpu(struct svc_serv *serv, int cpu) +struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv) { struct svc_pool_map *m = &svc_pool_map; + int cpu = raw_smp_processor_id(); unsigned int pidx = 0; - /* - * An uninitialised map happens in a pure client when - * lockd is brought up, so silently treat it the - * same as SVC_POOL_GLOBAL. - */ - if (svc_serv_is_pooled(serv)) { - switch (m->mode) { - case SVC_POOL_PERCPU: - pidx = m->to_pool[cpu]; - break; - case SVC_POOL_PERNODE: - pidx = m->to_pool[cpu_to_node(cpu)]; - break; - } + if (serv->sv_nrpools <= 1) + return serv->sv_pools; + + switch (m->mode) { + case SVC_POOL_PERCPU: + pidx = m->to_pool[cpu]; + break; + case SVC_POOL_PERNODE: + pidx = m->to_pool[cpu_to_node(cpu)]; + break; } + return &serv->sv_pools[pidx % serv->sv_nrpools]; } @@ -422,7 +454,7 @@ __svc_init_bc(struct svc_serv *serv) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - const struct svc_serv_ops *ops) + int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int vers; @@ -433,13 +465,13 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, return NULL; serv->sv_name = prog->pg_name; serv->sv_program = prog; - serv->sv_nrthreads = 1; + kref_init(&serv->sv_refcnt); serv->sv_stats = prog->pg_stats; if (bufsize > RPCSVC_MAXPAYLOAD) bufsize = RPCSVC_MAXPAYLOAD; serv->sv_max_payload = bufsize? bufsize : 4096; serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE); - serv->sv_ops = ops; + serv->sv_threadfn = threadfn; xdrsize = 0; while (prog) { prog->pg_lovers = prog->pg_nvers-1; @@ -485,59 +517,56 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, return serv; } -struct svc_serv * -svc_create(struct svc_program *prog, unsigned int bufsize, - const struct svc_serv_ops *ops) +/** + * svc_create - Create an RPC service + * @prog: the RPC program the new service will handle + * @bufsize: maximum message size for @prog + * @threadfn: a function to service RPC requests for @prog + * + * Returns an instantiated struct svc_serv object or NULL. + */ +struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize, + int (*threadfn)(void *data)) { - return __svc_create(prog, bufsize, /*npools*/1, ops); + return __svc_create(prog, bufsize, 1, threadfn); } EXPORT_SYMBOL_GPL(svc_create); -struct svc_serv * -svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - const struct svc_serv_ops *ops) +/** + * svc_create_pooled - Create an RPC service with pooled threads + * @prog: the RPC program the new service will handle + * @bufsize: maximum message size for @prog + * @threadfn: a function to service RPC requests for @prog + * + * Returns an instantiated struct svc_serv object or NULL. + */ +struct svc_serv *svc_create_pooled(struct svc_program *prog, + unsigned int bufsize, + int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, ops); + serv = __svc_create(prog, bufsize, npools, threadfn); if (!serv) goto out_err; return serv; out_err: - svc_pool_map_put(); + svc_pool_map_put(npools); return NULL; } EXPORT_SYMBOL_GPL(svc_create_pooled); -void svc_shutdown_net(struct svc_serv *serv, struct net *net) -{ - svc_close_net(serv, net); - - if (serv->sv_ops->svo_shutdown) - serv->sv_ops->svo_shutdown(serv, net); -} -EXPORT_SYMBOL_GPL(svc_shutdown_net); - /* * Destroy an RPC service. 
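With struct svc_serv_ops gone, a service owner now passes its thread function directly to svc_create_pooled(). A hedged sketch of a caller, loosely modelled on how nfsd uses these interfaces; my_threadfn and my_program are placeholders, not names from this patch:

    static int my_threadfn(void *data)
    {
            struct svc_rqst *rqstp = data;
            int err;

            while (!kthread_should_stop()) {
                    /* wait for a request, then process and reply */
                    err = svc_recv(rqstp, 60 * 60 * HZ);
                    if (err == -EAGAIN || err == -EINTR)
                            continue;
                    svc_process(rqstp);
            }
            svc_exit_thread(rqstp);
            return 0;
    }

    /* at service start-up: */
    serv = svc_create_pooled(&my_program, 64 * 1024, my_threadfn);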
Should be called with appropriate locking to - * protect the sv_nrthreads, sv_permsocks and sv_tempsocks. + * protect sv_permsocks and sv_tempsocks. */ void -svc_destroy(struct svc_serv *serv) +svc_destroy(struct kref *ref) { - dprintk("svc: svc_destroy(%s, %d)\n", - serv->sv_program->pg_name, - serv->sv_nrthreads); - - if (serv->sv_nrthreads) { - if (--(serv->sv_nrthreads) != 0) { - svc_sock_update_bufs(serv); - return; - } - } else - printk("svc_destroy: no threads for serv=%p!\n", serv); + struct svc_serv *serv = container_of(ref, struct svc_serv, sv_refcnt); + dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name); del_timer_sync(&serv->sv_temptimer); /* @@ -549,8 +578,7 @@ svc_destroy(struct svc_serv *serv) cache_clean_deferred(serv); - if (svc_serv_is_pooled(serv)) - svc_pool_map_put(); + svc_pool_map_put(serv->sv_nrpools); kfree(serv->sv_pools); kfree(serv); @@ -559,7 +587,7 @@ EXPORT_SYMBOL_GPL(svc_destroy); /* * Allocate an RPC server's buffer space. - * We allocate pages and place them in rq_argpages. + * We allocate pages and place them in rq_pages. */ static int svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) @@ -614,6 +642,10 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) rqstp->rq_server = serv; rqstp->rq_pool = pool; + rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); + if (!rqstp->rq_scratch_page) + goto out_enomem; + rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_argp) goto out_enomem; @@ -632,7 +664,7 @@ out_enomem: } EXPORT_SYMBOL_GPL(svc_rqst_alloc); -struct svc_rqst * +static struct svc_rqst * svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) { struct svc_rqst *rqstp; @@ -641,14 +673,17 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp) return ERR_PTR(-ENOMEM); - serv->sv_nrthreads++; + svc_get(serv); + spin_lock_bh(&serv->sv_lock); + serv->sv_nrthreads += 1; + spin_unlock_bh(&serv->sv_lock); + spin_lock_bh(&pool->sp_lock); pool->sp_nrthreads++; list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); spin_unlock_bh(&pool->sp_lock); return rqstp; } -EXPORT_SYMBOL_GPL(svc_prepare_thread); /* * Choose a pool in which to create a new thread, for svc_set_num_threads @@ -722,11 +757,9 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) if (IS_ERR(rqstp)) return PTR_ERR(rqstp); - __module_get(serv->sv_ops->svo_module); - task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp, + task = kthread_create_on_node(serv->sv_threadfn, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { - module_put(serv->sv_ops->svo_module); svc_exit_thread(rqstp); return PTR_ERR(task); } @@ -742,59 +775,13 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) return 0; } - -/* destroy old threads */ -static int -svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) -{ - struct task_struct *task; - unsigned int state = serv->sv_nrthreads-1; - - /* destroy old threads */ - do { - task = choose_victim(serv, pool, &state); - if (task == NULL) - break; - send_sig(SIGINT, task, 1); - nrservs++; - } while (nrservs < 0); - - return 0; -} - /* * Create or destroy enough new threads to make the number * of threads the given number. If `pool' is non-NULL, applies * only to threads in that pool, otherwise round-robins between * all pools. Caller must ensure that mutual exclusion between this and * server startup or shutdown. 
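svc_destroy() now takes a struct kref and recovers the svc_serv with container_of(), which implies the lifetime helpers are thin kref wrappers. A sketch of what svc_get() and svc_put() presumably look like; the real declarations live in the svc.h header, not in this diff:

    static inline void svc_get(struct svc_serv *serv)
    {
            kref_get(&serv->sv_refcnt);
    }

    static inline void svc_put(struct svc_serv *serv)
    {
            /* invokes svc_destroy() when the last reference drops */
            kref_put(&serv->sv_refcnt, svc_destroy);
    }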
- * - * Destroying threads relies on the service threads filling in - * rqstp->rq_task, which only the nfs ones do. Assumes the serv - * has been created using svc_create_pooled(). - * - * Based on code that used to be in nfsd_svc() but tweaked - * to be pool-aware. */ -int -svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) -{ - if (pool == NULL) { - /* The -1 assumes caller has done a svc_get() */ - nrservs -= (serv->sv_nrthreads-1); - } else { - spin_lock_bh(&pool->sp_lock); - nrservs -= pool->sp_nrthreads; - spin_unlock_bh(&pool->sp_lock); - } - - if (nrservs > 0) - return svc_start_kthreads(serv, pool, nrservs); - if (nrservs < 0) - return svc_signal_kthreads(serv, pool, nrservs); - return 0; -} -EXPORT_SYMBOL_GPL(svc_set_num_threads); /* destroy old threads */ static int @@ -815,11 +802,10 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) } int -svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { if (pool == NULL) { - /* The -1 assumes caller has done a svc_get() */ - nrservs -= (serv->sv_nrthreads-1); + nrservs -= serv->sv_nrthreads; } else { spin_lock_bh(&pool->sp_lock); nrservs -= pool->sp_nrthreads; @@ -832,7 +818,28 @@ svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrser return svc_stop_kthreads(serv, pool, nrservs); return 0; } -EXPORT_SYMBOL_GPL(svc_set_num_threads_sync); +EXPORT_SYMBOL_GPL(svc_set_num_threads); + +/** + * svc_rqst_replace_page - Replace one page in rq_pages[] + * @rqstp: svc_rqst with pages to replace + * @page: replacement page + * + * When replacing a page in rq_pages, batch the release of the + * replaced pages to avoid hammering the page allocator. + */ +void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) +{ + if (*rqstp->rq_next_page) { + if (!pagevec_space(&rqstp->rq_pvec)) + __pagevec_release(&rqstp->rq_pvec); + pagevec_add(&rqstp->rq_pvec, *rqstp->rq_next_page); + } + + get_page(page); + *(rqstp->rq_next_page++) = page; +} +EXPORT_SYMBOL_GPL(svc_rqst_replace_page); /* * Called from a server thread as it's exiting. Caller must hold the "service @@ -842,6 +849,8 @@ void svc_rqst_free(struct svc_rqst *rqstp) { svc_release_buffer(rqstp); + if (rqstp->rq_scratch_page) + put_page(rqstp->rq_scratch_page); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); @@ -861,11 +870,14 @@ svc_exit_thread(struct svc_rqst *rqstp) list_del_rcu(&rqstp->rq_all); spin_unlock_bh(&pool->sp_lock); + spin_lock_bh(&serv->sv_lock); + serv->sv_nrthreads -= 1; + spin_unlock_bh(&serv->sv_lock); + svc_sock_update_bufs(serv); + svc_rqst_free(rqstp); - /* Release the server */ - if (serv) - svc_destroy(serv); + svc_put(serv); } EXPORT_SYMBOL_GPL(svc_exit_thread); @@ -991,6 +1003,7 @@ static int __svc_register(struct net *net, const char *progname, #endif } + trace_svc_register(progname, version, protocol, port, family, error); return error; } @@ -1000,11 +1013,6 @@ int svc_rpcbind_set_version(struct net *net, unsigned short proto, unsigned short port) { - dprintk("svc: svc_register(%sv%d, %s, %u, %u)\n", - progp->pg_name, version, - proto == IPPROTO_UDP? 
"udp" : "tcp", - port, family); - return __svc_register(net, progp->pg_name, progp->pg_prog, version, family, proto, port); @@ -1024,11 +1032,8 @@ int svc_generic_rpcbind_set(struct net *net, return 0; if (vers->vs_hidden) { - dprintk("svc: svc_register(%sv%d, %s, %u, %u)" - " (but not telling portmap)\n", - progp->pg_name, version, - proto == IPPROTO_UDP? "udp" : "tcp", - port, family); + trace_svc_noregister(progp->pg_name, version, proto, + port, family, 0); return 0; } @@ -1106,8 +1111,7 @@ static void __svc_unregister(struct net *net, const u32 program, const u32 versi if (error == -EPROTONOSUPPORT) error = rpcb_register(net, program, version, 0, 0); - dprintk("svc: %s(%sv%u), error %d\n", - __func__, progname, version, error); + trace_svc_unregister(progname, version, error); } /* @@ -1132,9 +1136,6 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net) continue; if (progp->pg_vers[i]->vs_hidden) continue; - - dprintk("svc: attempting to unregister %sv%u\n", - progp->pg_name, i); __svc_unregister(net, progp->pg_prog, i, progp->pg_name); } } @@ -1169,61 +1170,6 @@ static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, .. #endif __be32 -svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err) -{ - set_bit(RQ_AUTHERR, &rqstp->rq_flags); - return auth_err; -} -EXPORT_SYMBOL_GPL(svc_return_autherr); - -static __be32 -svc_get_autherr(struct svc_rqst *rqstp, __be32 *statp) -{ - if (test_and_clear_bit(RQ_AUTHERR, &rqstp->rq_flags)) - return *statp; - return rpc_auth_ok; -} - -static int -svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp) -{ - struct kvec *argv = &rqstp->rq_arg.head[0]; - struct kvec *resv = &rqstp->rq_res.head[0]; - const struct svc_procedure *procp = rqstp->rq_procinfo; - - /* - * Decode arguments - * XXX: why do we ignore the return value? 
- */ - if (procp->pc_decode && - !procp->pc_decode(rqstp, argv->iov_base)) { - *statp = rpc_garbage_args; - return 1; - } - - *statp = procp->pc_func(rqstp); - - if (*statp == rpc_drop_reply || - test_bit(RQ_DROPME, &rqstp->rq_flags)) - return 0; - - if (test_bit(RQ_AUTHERR, &rqstp->rq_flags)) - return 1; - - if (*statp != rpc_success) - return 1; - - /* Encode reply */ - if (procp->pc_encode && - !procp->pc_encode(rqstp, resv->iov_base + resv->iov_len)) { - dprintk("svc: failed to encode reply\n"); - /* serv->sv_stats->rpcsystemerr++; */ - *statp = rpc_system_err; - } - return 1; -} - -__be32 svc_generic_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *ret) @@ -1259,7 +1205,7 @@ svc_generic_init_request(struct svc_rqst *rqstp, goto err_bad_proc; /* Initialize storage for argp and resp */ - memset(rqstp->rq_argp, 0, procp->pc_argsize); + memset(rqstp->rq_argp, 0, procp->pc_argzero); memset(rqstp->rq_resp, 0, procp->pc_ressize); /* Bump per-procedure stats counter */ @@ -1288,8 +1234,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) struct svc_process_info process; __be32 *statp; u32 prog, vers; - __be32 auth_stat, rpc_stat; - int auth_res; + __be32 rpc_stat; + int auth_res, rc; __be32 *reply_statp; rpc_stat = rpc_success; @@ -1298,10 +1244,10 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto err_short_len; /* Will be turned off by GSS integrity and privacy services */ - set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + __set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Will be turned off only when NFSv4 Sessions are used */ - set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); - clear_bit(RQ_DROPME, &rqstp->rq_flags); + __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + __clear_bit(RQ_DROPME, &rqstp->rq_flags); svc_putu32(resv, rqstp->rq_xid); @@ -1331,14 +1277,12 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) * We do this before anything else in order to get a decent * auth verifier. */ - auth_res = svc_authenticate(rqstp, &auth_stat); + auth_res = svc_authenticate(rqstp); /* Also give the program a chance to reject this call: */ - if (auth_res == SVC_OK && progp) { - auth_stat = rpc_autherr_badcred; + if (auth_res == SVC_OK && progp) auth_res = progp->pg_authenticate(rqstp); - } if (auth_res != SVC_OK) - trace_svc_authenticate(rqstp, auth_res, auth_stat); + trace_svc_authenticate(rqstp, auth_res); switch (auth_res) { case SVC_OK: break; @@ -1392,54 +1336,43 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) svc_reserve_auth(rqstp, procp->pc_xdrressize<<2); /* Call the function that processes the request. 
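The deleted svc_generic_dispatch() above spells out the contract every procedure entry must satisfy: pc_decode fills rq_argp, pc_func executes the call, pc_encode writes the reply. A hypothetical table entry makes those hooks, plus the pc_argzero field used in the memset above, concrete; all my_* names are illustrative:

    static const struct svc_procedure my_proc = {
            .pc_func       = my_proc_handler,       /* returns an accept_stat */
            .pc_decode     = my_proc_decode,        /* XDR args -> rq_argp */
            .pc_encode     = my_proc_encode,        /* rq_resp -> XDR reply */
            .pc_argsize    = sizeof(struct my_args),
            .pc_argzero    = sizeof(struct my_args), /* bytes zeroed pre-decode */
            .pc_ressize    = sizeof(struct my_res),
            .pc_xdrressize = 1,
            .pc_name       = "MY_PROC",     /* surfaced by svc_proc_name() */
    };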
*/ - if (!process.dispatch) { - if (!svc_generic_dispatch(rqstp, statp)) - goto release_dropit; - if (*statp == rpc_garbage_args) - goto err_garbage; - auth_stat = svc_get_autherr(rqstp, statp); - if (auth_stat != rpc_auth_ok) - goto err_release_bad_auth; - } else { - dprintk("svc: calling dispatcher\n"); - if (!process.dispatch(rqstp, statp)) - goto release_dropit; /* Release reply info */ - } + rc = process.dispatch(rqstp, statp); + if (procp->pc_release) + procp->pc_release(rqstp); + if (!rc) + goto dropit; + if (rqstp->rq_auth_stat != rpc_auth_ok) + goto err_bad_auth; /* Check RPC status result */ if (*statp != rpc_success) resv->iov_len = ((void*)statp) - resv->iov_base + 4; - /* Release reply info */ - if (procp->pc_release) - procp->pc_release(rqstp); - if (procp->pc_encode == NULL) goto dropit; sendit: if (svc_authorise(rqstp)) - goto close; + goto close_xprt; return 1; /* Caller can now send it */ -release_dropit: - if (procp->pc_release) - procp->pc_release(rqstp); dropit: svc_authorise(rqstp); /* doesn't hurt to call this twice */ dprintk("svc: svc_process dropit\n"); return 0; close: + svc_authorise(rqstp); +close_xprt: if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) - svc_close_xprt(rqstp->rq_xprt); + svc_xprt_close(rqstp->rq_xprt); dprintk("svc: svc_process close\n"); return 0; err_short_len: svc_printk(rqstp, "short len %zd, dropping request\n", argv->iov_len); - goto close; + goto close_xprt; err_bad_rpc: serv->sv_stats->rpcbadfmt++; @@ -1449,17 +1382,15 @@ err_bad_rpc: svc_putnl(resv, 2); goto sendit; -err_release_bad_auth: - if (procp->pc_release) - procp->pc_release(rqstp); err_bad_auth: - dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); + dprintk("svc: authentication failed (%d)\n", + be32_to_cpu(rqstp->rq_auth_stat)); serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of accept status: */ xdr_ressize_check(rqstp, reply_statp); svc_putnl(resv, 1); /* REJECT */ svc_putnl(resv, 1); /* AUTH_ERROR */ - svc_putnl(resv, ntohl(auth_stat)); /* status */ + svc_putu32(resv, rqstp->rq_auth_stat); /* status */ goto sendit; err_bad_prog: @@ -1503,8 +1434,13 @@ svc_process(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; - struct svc_serv *serv = rqstp->rq_server; - u32 dir; + __be32 dir; + +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) + if (!fail_sunrpc.ignore_server_disconnect && + should_fail(&fail_sunrpc.attr, 1)) + svc_xprt_deferred_close(rqstp->rq_xprt); +#endif /* * Setup response xdr_buf. 
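The CONFIG_FAIL_SUNRPC hunk above plugs into the kernel's generic fault-injection framework: should_fail() consults the tunables behind a struct fault_attr and returns true when a failure should be simulated, here by scheduling a transport close. The skeleton of that pattern, with a hypothetical attribute (fail_sunrpc itself is defined in the new fail.h, which is outside this section):

    #include <linux/fault-inject.h>

    static DECLARE_FAULT_ATTR(my_fail);     /* hypothetical attribute */

    static int my_operation(void)
    {
            if (should_fail(&my_fail, 1))   /* second arg: size of the op */
                    return -EIO;            /* simulated failure */
            /* ... normal path ... */
            return 0;
    }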
@@ -1513,7 +1449,7 @@ svc_process(struct svc_rqst *rqstp) rqstp->rq_next_page = &rqstp->rq_respages[1]; resv->iov_base = page_address(rqstp->rq_respages[0]); resv->iov_len = 0; - rqstp->rq_res.pages = rqstp->rq_respages + 1; + rqstp->rq_res.pages = rqstp->rq_next_page; rqstp->rq_res.len = 0; rqstp->rq_res.page_base = 0; rqstp->rq_res.page_len = 0; @@ -1521,22 +1457,17 @@ svc_process(struct svc_rqst *rqstp) rqstp->rq_res.tail[0].iov_base = NULL; rqstp->rq_res.tail[0].iov_len = 0; - dir = svc_getnl(argv); - if (dir != 0) { - /* direction != CALL */ - svc_printk(rqstp, "bad direction %d, dropping request\n", dir); - serv->sv_stats->rpcbadfmt++; + dir = svc_getu32(argv); + if (dir != rpc_call) + goto out_baddir; + if (!svc_process_common(rqstp, argv, resv)) goto out_drop; - } - - /* Reserve space for the record marker */ - if (rqstp->rq_prot == IPPROTO_TCP) - svc_putnl(resv, 0); - - /* Returns 1 for send, 0 for drop */ - if (likely(svc_process_common(rqstp, argv, resv))) - return svc_send(rqstp); + return svc_send(rqstp); +out_baddir: + svc_printk(rqstp, "bad direction 0x%08x, dropping request\n", + be32_to_cpu(dir)); + rqstp->rq_server->sv_stats->rpcbadfmt++; out_drop: svc_drop(rqstp); return 0; @@ -1623,8 +1554,12 @@ out: EXPORT_SYMBOL_GPL(bc_svc_process); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* - * Return (transport-specific) limit on the rpc payload. +/** + * svc_max_payload - Return transport-specific limit on the RPC payload + * @rqstp: RPC transaction context + * + * Returns the maximum number of payload bytes the current transport + * allows. */ u32 svc_max_payload(const struct svc_rqst *rqstp) { @@ -1637,18 +1572,51 @@ u32 svc_max_payload(const struct svc_rqst *rqstp) EXPORT_SYMBOL_GPL(svc_max_payload); /** + * svc_proc_name - Return RPC procedure name in string form + * @rqstp: svc_rqst to operate on + * + * Return value: + * Pointer to a NUL-terminated string + */ +const char *svc_proc_name(const struct svc_rqst *rqstp) +{ + if (rqstp && rqstp->rq_procinfo) + return rqstp->rq_procinfo->pc_name; + return "unknown"; +} + + +/** + * svc_encode_result_payload - mark a range of bytes as a result payload + * @rqstp: svc_rqst to operate on + * @offset: payload's byte offset in rqstp->rq_res + * @length: size of payload, in bytes + * + * Returns zero on success, or a negative errno if a permanent + * error occurred. + */ +int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) +{ + return rqstp->rq_xprt->xpt_ops->xpo_result_payload(rqstp, offset, + length); +} +EXPORT_SYMBOL_GPL(svc_encode_result_payload); + +/** * svc_fill_write_vector - Construct data argument for VFS write call * @rqstp: svc_rqst to operate on - * @pages: list of pages containing data payload - * @first: buffer containing first section of write payload - * @total: total number of bytes of write payload + * @payload: xdr_buf containing only the write data payload * * Fills in rqstp::rq_vec, and returns the number of elements. 
*/ -unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages, - struct kvec *first, size_t total) +unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, + struct xdr_buf *payload) { + struct page **pages = payload->pages; + struct kvec *first = payload->head; struct kvec *vec = rqstp->rq_vec; + size_t total = payload->len; unsigned int i; /* Some types of transport can present the write payload diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index de3c077733a7..2106003645a7 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -6,6 +6,7 @@ */ #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/errno.h> #include <linux/freezer.h> #include <linux/kthread.h> @@ -104,8 +105,17 @@ void svc_unreg_xprt_class(struct svc_xprt_class *xcl) } EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); -/* - * Format the transport list for printing +/** + * svc_print_xprts - Format the transport list for printing + * @buf: target buffer for formatted address + * @maxlen: length of target buffer + * + * Fills in @buf with a string containing a list of transport names, each name + * terminated with '\n'. If the buffer is too small, some entries may be + * missing, but it is guaranteed that all lines in the output buffer are + * complete. + * + * Returns positive length of the filled-in string. */ int svc_print_xprts(char *buf, int maxlen) { @@ -118,9 +128,9 @@ int svc_print_xprts(char *buf, int maxlen) list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { int slen; - sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); - slen = strlen(tmpstr); - if (len + slen > maxlen) + slen = snprintf(tmpstr, sizeof(tmpstr), "%s %d\n", + xcl->xcl_name, xcl->xcl_max_payload); + if (slen >= sizeof(tmpstr) || len + slen >= maxlen) break; len += slen; strcat(buf, tmpstr); @@ -130,6 +140,20 @@ int svc_print_xprts(char *buf, int maxlen) return len; } +/** + * svc_xprt_deferred_close - Close a transport + * @xprt: transport instance + * + * Used in contexts that need to defer the work of shutting down + * the transport to an nfsd thread. 
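Under the signature shown above, a WRITE-style handler hands the decoded payload xdr_buf straight to svc_fill_write_vector() instead of unpacking pages, first kvec, and total length itself. A hedged caller sketch; argp is a placeholder for the handler's decoded arguments:

    unsigned int nvecs;

    /* the XDR decoder left the raw write data in argp->payload */
    nvecs = svc_fill_write_vector(rqstp, &argp->payload);
    /* rq_vec[0..nvecs-1] now describes the payload, e.g. for vfs_iter_write() */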
+ */ +void svc_xprt_deferred_close(struct svc_xprt *xprt) +{ + if (!test_and_set_bit(XPT_CLOSE, &xprt->xpt_flags)) + svc_xprt_enqueue(xprt); +} +EXPORT_SYMBOL_GPL(svc_xprt_deferred_close); + static void svc_xprt_free(struct kref *kref) { struct svc_xprt *xprt = @@ -138,12 +162,13 @@ static void svc_xprt_free(struct kref *kref) if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) svcauth_unix_info_release(xprt); put_cred(xprt->xpt_cred); - put_net(xprt->xpt_net); + put_net_track(xprt->xpt_net, &xprt->ns_tracker); /* See comment on corresponding get in xs_setup_bc_tcp(): */ if (xprt->xpt_bc_xprt) xprt_put(xprt->xpt_bc_xprt); if (xprt->xpt_bc_xps) xprt_switch_put(xprt->xpt_bc_xps); + trace_svc_xprt_free(xprt); xprt->xpt_ops->xpo_free(xprt); module_put(owner); } @@ -173,7 +198,7 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl, mutex_init(&xprt->xpt_mutex); spin_lock_init(&xprt->xpt_lock); set_bit(XPT_BUSY, &xprt->xpt_flags); - xprt->xpt_net = get_net(net); + xprt->xpt_net = get_net_track(net, &xprt->ns_tracker, GFP_ATOMIC); strcpy(xprt->xpt_remotebuf, "uninitialized"); } EXPORT_SYMBOL_GPL(svc_xprt_init); @@ -197,6 +222,7 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, .sin6_port = htons(port), }; #endif + struct svc_xprt *xprt; struct sockaddr *sap; size_t len; @@ -215,18 +241,24 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return ERR_PTR(-EAFNOSUPPORT); } - return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); + xprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); + if (IS_ERR(xprt)) + trace_svc_xprt_create_err(serv->sv_program->pg_name, + xcl->xcl_name, sap, len, xprt); + return xprt; } -/* - * svc_xprt_received conditionally queues the transport for processing - * by another thread. The caller must hold the XPT_BUSY bit and must +/** + * svc_xprt_received - start next receiver thread + * @xprt: controlling transport + * + * The caller must hold the XPT_BUSY bit and must * not thereafter touch transport data. * * Note: XPT_DATA only gets cleared when a read-attempt finds no (or * insufficient) data. 
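svc_xprt_create() is the renamed listener factory documented just above. A hedged sketch of a service adding a TCP listener at startup; the port is illustrative, and SVC_SOCK_DEFAULTS is the usual svcsock flag set:

    err = svc_xprt_create(serv, "tcp", net, PF_INET, 2049,
                          SVC_SOCK_DEFAULTS, current_cred());
    if (err < 0)
            goto out_shutdown;      /* hypothetical unwind label */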
*/ -static void svc_xprt_received(struct svc_xprt *xprt) +void svc_xprt_received(struct svc_xprt *xprt) { if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) { WARN_ONCE(1, "xprt=0x%p already busy!", xprt); @@ -234,14 +266,15 @@ static void svc_xprt_received(struct svc_xprt *xprt) } /* As soon as we clear busy, the xprt could be closed and - * 'put', so we need a reference to call svc_enqueue_xprt with: + * 'put', so we need a reference to call svc_xprt_enqueue with: */ svc_xprt_get(xprt); smp_mb__before_atomic(); clear_bit(XPT_BUSY, &xprt->xpt_flags); - xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt); + svc_xprt_enqueue(xprt); svc_xprt_put(xprt); } +EXPORT_SYMBOL_GPL(svc_xprt_received); void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new) { @@ -252,7 +285,7 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new) svc_xprt_received(new); } -static int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name, +static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred) @@ -288,25 +321,35 @@ static int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name, return -EPROTONOSUPPORT; } -int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, +/** + * svc_xprt_create - Add a new listener to @serv + * @serv: target RPC service + * @xprt_name: transport class name + * @net: network namespace + * @family: network address family + * @port: listener port + * @flags: SVC_SOCK flags + * @cred: credential to bind to this transport + * + * Return values: + * %0: New listener added successfully + * %-EPROTONOSUPPORT: Requested transport type not supported + */ +int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred) { int err; - dprintk("svc: creating transport %s[%d]\n", xprt_name, port); - err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred); + err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred); if (err == -EPROTONOSUPPORT) { request_module("svc%s", xprt_name); - err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred); + err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred); } - if (err < 0) - dprintk("svc: transport %s not found, err %d\n", - xprt_name, -err); return err; } -EXPORT_SYMBOL_GPL(svc_create_xprt); +EXPORT_SYMBOL_GPL(svc_xprt_create); /* * Copy the local and remote xprt addresses to the rqstp structure @@ -382,6 +425,8 @@ static bool svc_xprt_ready(struct svc_xprt *xprt) smp_rmb(); xpt_flags = READ_ONCE(xprt->xpt_flags); + if (xpt_flags & BIT(XPT_BUSY)) + return false; if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE))) return true; if (xpt_flags & (BIT(XPT_DATA) | BIT(XPT_DEFERRED))) { @@ -394,11 +439,15 @@ static bool svc_xprt_ready(struct svc_xprt *xprt) return false; } -void svc_xprt_do_enqueue(struct svc_xprt *xprt) +/** + * svc_xprt_enqueue - Queue a transport on an idle nfsd thread + * @xprt: transport with data pending + * + */ +void svc_xprt_enqueue(struct svc_xprt *xprt) { struct svc_pool *pool; struct svc_rqst *rqstp = NULL; - int cpu; if (!svc_xprt_ready(xprt)) return; @@ -411,8 +460,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) return; - cpu = get_cpu(); - pool = svc_pool_for_cpu(xprt->xpt_server, cpu); + pool = svc_pool_for_cpu(xprt->xpt_server); 
atomic_long_inc(&pool->sp_stats.packets); @@ -435,21 +483,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) rqstp = NULL; out_unlock: rcu_read_unlock(); - put_cpu(); - trace_svc_xprt_do_enqueue(xprt, rqstp); -} -EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); - -/* - * Queue up a transport with data pending. If there are idle nfsd - * processes, wake 'em up. - * - */ -void svc_xprt_enqueue(struct svc_xprt *xprt) -{ - if (test_bit(XPT_BUSY, &xprt->xpt_flags)) - return; - xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt); + trace_svc_xprt_enqueue(xprt, rqstp); } EXPORT_SYMBOL_GPL(svc_xprt_enqueue); @@ -509,6 +543,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp) kfree(rqstp->rq_deferred); rqstp->rq_deferred = NULL; + pagevec_release(&rqstp->rq_pvec); svc_free_res_pages(rqstp); rqstp->rq_res.page_len = 0; rqstp->rq_res.page_base = 0; @@ -631,36 +666,38 @@ static void svc_check_conn_limits(struct svc_serv *serv) static int svc_alloc_arg(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; - struct xdr_buf *arg; - int pages; - int i; + struct xdr_buf *arg = &rqstp->rq_arg; + unsigned long pages, filled, ret; + + pagevec_init(&rqstp->rq_pvec); - /* now allocate needed pages. If we get a failure, sleep briefly */ pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; if (pages > RPCSVC_MAXPAGES) { - pr_warn_once("svc: warning: pages=%u > RPCSVC_MAXPAGES=%lu\n", + pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n", pages, RPCSVC_MAXPAGES); /* use as many pages as possible */ pages = RPCSVC_MAXPAGES; } - for (i = 0; i < pages ; i++) - while (rqstp->rq_pages[i] == NULL) { - struct page *p = alloc_page(GFP_KERNEL); - if (!p) { - set_current_state(TASK_INTERRUPTIBLE); - if (signalled() || kthread_should_stop()) { - set_current_state(TASK_RUNNING); - return -EINTR; - } - schedule_timeout(msecs_to_jiffies(500)); - } - rqstp->rq_pages[i] = p; + + for (filled = 0; filled < pages; filled = ret) { + ret = alloc_pages_bulk_array(GFP_KERNEL, pages, + rqstp->rq_pages); + if (ret > filled) + /* Made progress, don't sleep yet */ + continue; + + set_current_state(TASK_INTERRUPTIBLE); + if (signalled() || kthread_should_stop()) { + set_current_state(TASK_RUNNING); + return -EINTR; } - rqstp->rq_page_end = &rqstp->rq_pages[i]; - rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ + trace_svc_alloc_arg_err(pages, ret); + memalloc_retry_wait(GFP_KERNEL); + } + rqstp->rq_page_end = &rqstp->rq_pages[pages]; + rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */ /* Make arg->head point to first page and arg->pages point to rest */ - arg = &rqstp->rq_arg; arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); arg->head[0].iov_len = PAGE_SIZE; arg->pages = rqstp->rq_pages + 1; @@ -771,7 +808,6 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) int len = 0; if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { - dprintk("svc_recv: found XPT_CLOSE\n"); if (test_and_clear_bit(XPT_KILL_TEMP, &xprt->xpt_flags)) xprt->xpt_ops->xpo_kill_temp_xprt(xprt); svc_delete_xprt(xprt); @@ -790,8 +826,11 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) if (newxpt) { newxpt->xpt_cred = get_cred(xprt->xpt_cred); svc_add_new_temp_xprt(serv, newxpt); - } else + trace_svc_xprt_accept(newxpt, serv->sv_name); + } else { module_put(xprt->xpt_class->xcl_owner); + } + svc_xprt_received(xprt); } else if (svc_xprt_reserve_slot(rqstp, xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, 
inuse=%d\n", @@ -805,11 +844,10 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) rqstp->rq_stime = ktime_get(); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); - } - /* clear XPT_BUSY: */ - svc_xprt_received(xprt); + } else + svc_xprt_received(xprt); + out: - trace_svc_handle_xprt(xprt, len); return len; } @@ -824,14 +862,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) struct svc_serv *serv = rqstp->rq_server; int len, err; - dprintk("svc: server %p waiting for data (to = %ld)\n", - rqstp, timeout); - - if (rqstp->rq_xprt) - printk(KERN_ERR - "svc_recv: service %p, transport not NULL!\n", - rqstp); - err = svc_alloc_arg(rqstp); if (err) goto out; @@ -854,6 +884,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) err = -EAGAIN; if (len <= 0) goto out_release; + trace_svc_xdr_recvfrom(&rqstp->rq_arg); clear_bit(XPT_OLD, &xprt->xpt_flags); @@ -863,7 +894,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; - trace_svc_recv(rqstp, len); return len; out_release: rqstp->rq_res.len = 0; @@ -879,7 +909,6 @@ EXPORT_SYMBOL_GPL(svc_recv); void svc_drop(struct svc_rqst *rqstp) { trace_svc_drop(rqstp); - dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); svc_xprt_release(rqstp); } EXPORT_SYMBOL_GPL(svc_drop); @@ -897,24 +926,16 @@ int svc_send(struct svc_rqst *rqstp) if (!xprt) goto out; - /* release the receive skb before sending the reply */ - xprt->xpt_ops->xpo_release_rqst(rqstp); - /* calculate over-all length */ xb = &rqstp->rq_res; xb->len = xb->head[0].iov_len + xb->page_len + xb->tail[0].iov_len; - - /* Grab mutex to serialize outgoing data. */ - mutex_lock(&xprt->xpt_mutex); + trace_svc_xdr_sendto(rqstp->rq_xid, xb); trace_svc_stats_latency(rqstp); - if (test_bit(XPT_DEAD, &xprt->xpt_flags) - || test_bit(XPT_CLOSE, &xprt->xpt_flags)) - len = -ENOTCONN; - else - len = xprt->xpt_ops->xpo_sendto(rqstp); - mutex_unlock(&xprt->xpt_mutex); + + len = xprt->xpt_ops->xpo_sendto(rqstp); + trace_svc_send(rqstp, len); svc_xprt_release(rqstp); @@ -1022,12 +1043,13 @@ static void svc_delete_xprt(struct svc_xprt *xprt) struct svc_serv *serv = xprt->xpt_server; struct svc_deferred_req *dr; - /* Only do this once */ if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) - BUG(); + return; - dprintk("svc: svc_delete_xprt(%p)\n", xprt); + trace_svc_xprt_detach(xprt); xprt->xpt_ops->xpo_detach(xprt); + if (xprt->xpt_bc_xprt) + xprt->xpt_bc_xprt->ops->close(xprt->xpt_bc_xprt); spin_lock_bh(&serv->sv_lock); list_del_init(&xprt->xpt_list); @@ -1043,8 +1065,14 @@ static void svc_delete_xprt(struct svc_xprt *xprt) svc_xprt_put(xprt); } -void svc_close_xprt(struct svc_xprt *xprt) +/** + * svc_xprt_close - Close a client connection + * @xprt: transport to disconnect + * + */ +void svc_xprt_close(struct svc_xprt *xprt) { + trace_svc_xprt_close(xprt); set_bit(XPT_CLOSE, &xprt->xpt_flags); if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) /* someone else will have to effect the close */ @@ -1057,14 +1085,14 @@ void svc_close_xprt(struct svc_xprt *xprt) */ svc_delete_xprt(xprt); } -EXPORT_SYMBOL_GPL(svc_close_xprt); +EXPORT_SYMBOL_GPL(svc_xprt_close); static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net) { struct svc_xprt *xprt; int ret = 0; - spin_lock(&serv->sv_lock); + spin_lock_bh(&serv->sv_lock); list_for_each_entry(xprt, xprt_list, xpt_list) { if (xprt->xpt_net != net) continue; @@ -1072,7 +1100,7 @@ static int svc_close_list(struct svc_serv 
*serv, struct list_head *xprt_list, st set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); } - spin_unlock(&serv->sv_lock); + spin_unlock_bh(&serv->sv_lock); return ret; } @@ -1109,7 +1137,11 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) } } -/* +/** + * svc_xprt_destroy_all - Destroy transports associated with @serv + * @serv: RPC service to be shut down + * @net: target network namespace + * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). * @@ -1121,7 +1153,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. */ -void svc_close_net(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) { int delay = 0; @@ -1132,6 +1164,7 @@ void svc_close_net(struct svc_serv *serv, struct net *net) msleep(delay++); } } +EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); /* * Handle defer and revisit of requests @@ -1147,16 +1180,15 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) set_bit(XPT_DEFERRED, &xprt->xpt_flags); if (too_many || test_bit(XPT_DEAD, &xprt->xpt_flags)) { spin_unlock(&xprt->xpt_lock); - dprintk("revisit canceled\n"); + trace_svc_defer_drop(dr); svc_xprt_put(xprt); - trace_svc_drop_deferred(dr); kfree(dr); return; } - dprintk("revisit queued\n"); dr->xprt = NULL; list_add(&dr->handle.recent, &xprt->xpt_deferred); spin_unlock(&xprt->xpt_lock); + trace_svc_defer_queue(dr); svc_xprt_enqueue(xprt); svc_xprt_put(xprt); } @@ -1195,44 +1227,48 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) dr->addrlen = rqstp->rq_addrlen; dr->daddr = rqstp->rq_daddr; dr->argslen = rqstp->rq_arg.len >> 2; - dr->xprt_hlen = rqstp->rq_xprt_hlen; + dr->xprt_ctxt = rqstp->rq_xprt_ctxt; + rqstp->rq_xprt_ctxt = NULL; /* back up head to the start of the buffer and copy */ skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, dr->argslen << 2); } + trace_svc_defer(rqstp); svc_xprt_get(rqstp->rq_xprt); dr->xprt = rqstp->rq_xprt; - set_bit(RQ_DROPME, &rqstp->rq_flags); + __set_bit(RQ_DROPME, &rqstp->rq_flags); dr->handle.revisit = svc_revisit; - trace_svc_defer(rqstp); return &dr->handle; } /* * recv data from a deferred request into an active one */ -static int svc_deferred_recv(struct svc_rqst *rqstp) +static noinline int svc_deferred_recv(struct svc_rqst *rqstp) { struct svc_deferred_req *dr = rqstp->rq_deferred; + trace_svc_defer_recv(dr); + /* setup iov_base past transport header */ - rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2); + rqstp->rq_arg.head[0].iov_base = dr->args; /* The iov_len does not include the transport header bytes */ - rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen; + rqstp->rq_arg.head[0].iov_len = dr->argslen << 2; rqstp->rq_arg.page_len = 0; /* The rq_arg.len includes the transport header bytes */ - rqstp->rq_arg.len = dr->argslen<<2; + rqstp->rq_arg.len = dr->argslen << 2; rqstp->rq_prot = dr->prot; memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); rqstp->rq_addrlen = dr->addrlen; /* Save off transport header len in case we get deferred again */ - rqstp->rq_xprt_hlen = dr->xprt_hlen; rqstp->rq_daddr = dr->daddr; rqstp->rq_respages = rqstp->rq_pages; - return (dr->argslen<<2) - dr->xprt_hlen; + rqstp->rq_xprt_ctxt = dr->xprt_ctxt; + svc_xprt_received(rqstp->rq_xprt); + return 
dr->argslen << 2; } @@ -1248,7 +1284,6 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) struct svc_deferred_req, handle.recent); list_del_init(&dr->handle.recent); - trace_svc_revisit_deferred(dr); } else clear_bit(XPT_DEFERRED, &xprt->xpt_flags); spin_unlock(&xprt->xpt_lock); diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 552617e3467b..e72ba2f13f6c 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -21,6 +21,8 @@ #include <trace/events/sunrpc.h> +#include "sunrpc.h" + #define RPCDBG_FACILITY RPCDBG_AUTH @@ -29,10 +31,12 @@ */ extern struct auth_ops svcauth_null; extern struct auth_ops svcauth_unix; +extern struct auth_ops svcauth_tls; static struct auth_ops __rcu *authtab[RPC_AUTH_MAXFLAVOR] = { [RPC_AUTH_NULL] = (struct auth_ops __force __rcu *)&svcauth_null, [RPC_AUTH_UNIX] = (struct auth_ops __force __rcu *)&svcauth_unix, + [RPC_AUTH_TLS] = (struct auth_ops __force __rcu *)&svcauth_tls, }; static struct auth_ops * @@ -57,12 +61,12 @@ svc_put_auth_ops(struct auth_ops *aops) } int -svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) +svc_authenticate(struct svc_rqst *rqstp) { rpc_authflavor_t flavor; struct auth_ops *aops; - *authp = rpc_auth_ok; + rqstp->rq_auth_stat = rpc_auth_ok; flavor = svc_getnl(&rqstp->rq_arg.head[0]); @@ -70,7 +74,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) aops = svc_get_auth_ops(flavor); if (aops == NULL) { - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } @@ -78,7 +82,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) init_svc_cred(&rqstp->rq_cred); rqstp->rq_authop = aops; - return aops->accept(rqstp, authp); + return aops->accept(rqstp); } EXPORT_SYMBOL_GPL(svc_authenticate); @@ -205,3 +209,26 @@ struct auth_domain *auth_domain_find(char *name) return NULL; } EXPORT_SYMBOL_GPL(auth_domain_find); + +/** + * auth_domain_cleanup - check that the auth_domain table is empty + * + * On module unload the auth_domain_table must be empty. To make it + * easier to catch bugs which don't clean up domains properly, we + * warn if anything remains in the table at cleanup time. + * + * Note that we cannot proactively remove the domains at this stage. + * The ->release() function might be in a module that has already been + * unloaded. 
+ */ + +void auth_domain_cleanup(void) +{ + int h; + struct auth_domain *hp; + + for (h = 0; h < DN_HASHMAX; h++) + hlist_for_each_entry(hp, &auth_domain_table[h], hash) + pr_warn("svc: domain %s still present at module unload.\n", + hp->name); +} diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 04aa80a2d752..b1efc34db6ed 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -37,6 +37,7 @@ struct unix_domain { extern struct auth_ops svcauth_null; extern struct auth_ops svcauth_unix; +extern struct auth_ops svcauth_tls; static void svcauth_unix_domain_release_rcu(struct rcu_head *head) { @@ -148,6 +149,11 @@ static struct cache_head *ip_map_alloc(void) return NULL; } +static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); +} + static void ip_map_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -298,15 +304,6 @@ static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, return NULL; } -static inline struct ip_map *ip_map_lookup(struct net *net, char *class, - struct in6_addr *addr) -{ - struct sunrpc_net *sn; - - sn = net_generic(net, sunrpc_net_id); - return __ip_map_lookup(sn->ip_map_cache, class, addr); -} - static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time64_t expiry) { @@ -327,15 +324,6 @@ static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, return 0; } -static inline int ip_map_update(struct net *net, struct ip_map *ipm, - struct unix_domain *udom, time64_t expiry) -{ - struct sunrpc_net *sn; - - sn = net_generic(net, sunrpc_net_id); - return __ip_map_update(sn->ip_map_cache, ipm, udom, expiry); -} - void svcauth_unix_purge(struct net *net) { struct sunrpc_net *sn; @@ -467,6 +455,11 @@ static struct cache_head *unix_gid_alloc(void) return NULL; } +static int unix_gid_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + static void unix_gid_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -584,6 +577,7 @@ static const struct cache_detail unix_gid_cache_template = { .hash_size = GID_HASHMAX, .name = "auth.unix.gid", .cache_put = unix_gid_put, + .cache_upcall = unix_gid_upcall, .cache_request = unix_gid_request, .cache_parse = unix_gid_parse, .cache_show = unix_gid_show, @@ -688,8 +682,9 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) rqstp->rq_client = NULL; if (rqstp->rq_proc == 0) - return SVC_OK; + goto out; + rqstp->rq_auth_stat = rpc_autherr_badcred; ipm = ip_map_cached_get(xprt); if (ipm == NULL) ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class, @@ -726,13 +721,16 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) put_group_info(cred->cr_group_info); cred->cr_group_info = gi; } + +out: + rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } EXPORT_SYMBOL_GPL(svcauth_unix_set_client); static int -svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_null_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -743,12 +741,12 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) if (svc_getu32(argv) != 0) { dprintk("svc: bad null cred\n"); - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { dprintk("svc: bad null verf\n"); - *authp = 
rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } @@ -792,7 +790,66 @@ struct auth_ops svcauth_null = { static int -svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_tls_accept(struct svc_rqst *rqstp) +{ + struct svc_cred *cred = &rqstp->rq_cred; + struct kvec *argv = rqstp->rq_arg.head; + struct kvec *resv = rqstp->rq_res.head; + + if (argv->iov_len < XDR_UNIT * 3) + return SVC_GARBAGE; + + /* Call's cred length */ + if (svc_getu32(argv) != xdr_zero) { + rqstp->rq_auth_stat = rpc_autherr_badcred; + return SVC_DENIED; + } + + /* Call's verifier flavor and its length */ + if (svc_getu32(argv) != rpc_auth_null || + svc_getu32(argv) != xdr_zero) { + rqstp->rq_auth_stat = rpc_autherr_badverf; + return SVC_DENIED; + } + + /* AUTH_TLS is not valid on non-NULL procedures */ + if (rqstp->rq_proc != 0) { + rqstp->rq_auth_stat = rpc_autherr_badcred; + return SVC_DENIED; + } + + /* Mapping to nobody uid/gid is required */ + cred->cr_uid = INVALID_UID; + cred->cr_gid = INVALID_GID; + cred->cr_group_info = groups_alloc(0); + if (cred->cr_group_info == NULL) + return SVC_CLOSE; /* kmalloc failure - client must retry */ + + /* Reply's verifier */ + svc_putnl(resv, RPC_AUTH_NULL); + if (rqstp->rq_xprt->xpt_ops->xpo_start_tls) { + svc_putnl(resv, 8); + memcpy(resv->iov_base + resv->iov_len, "STARTTLS", 8); + resv->iov_len += 8; + } else + svc_putnl(resv, 0); + + rqstp->rq_cred.cr_flavor = RPC_AUTH_TLS; + return SVC_OK; +} + +struct auth_ops svcauth_tls = { + .name = "tls", + .owner = THIS_MODULE, + .flavour = RPC_AUTH_TLS, + .accept = svcauth_tls_accept, + .release = svcauth_null_release, + .set_client = svcauth_unix_set_client, +}; + + +static int +svcauth_unix_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -834,7 +891,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) } groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } @@ -846,7 +903,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) return SVC_OK; badcred: - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } @@ -881,6 +938,7 @@ static const struct cache_detail ip_map_cache_template = { .hash_size = IP_HASHMAX, .name = "auth.unix.ip", .cache_put = ip_map_put, + .cache_upcall = ip_map_upcall, .cache_request = ip_map_request, .cache_parse = ip_map_parse, .cache_show = ip_map_show, diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2934dd711715..2fc98fea59b4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -44,8 +44,8 @@ #include <net/tcp.h> #include <net/tcp_states.h> #include <linux/uaccess.h> +#include <linux/highmem.h> #include <asm/ioctls.h> -#include <trace/events/skb.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/clnt.h> @@ -55,6 +55,9 @@ #include <linux/sunrpc/stats.h> #include <linux/sunrpc/xprt.h> +#include <trace/events/sunrpc.h> + +#include "socklib.h" #include "sunrpc.h" #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -107,31 +110,26 @@ static void svc_reclassify_socket(struct socket *sock) } #endif -/* - * Release an skbuff after use +/** + * svc_tcp_release_rqst - Release transport-related resources + * @rqstp: request structure with resources to be released + * */ -static void svc_release_skb(struct svc_rqst *rqstp) +static void svc_tcp_release_rqst(struct 
svc_rqst *rqstp) { - struct sk_buff *skb = rqstp->rq_xprt_ctxt; - - if (skb) { - struct svc_sock *svsk = - container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - rqstp->rq_xprt_ctxt = NULL; - - dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); - skb_free_datagram_locked(svsk->sk_sk, skb); - } } -static void svc_release_udp_skb(struct svc_rqst *rqstp) +/** + * svc_udp_release_rqst - Release transport-related resources + * @rqstp: request structure with resources to be released + * + */ +static void svc_udp_release_rqst(struct svc_rqst *rqstp) { struct sk_buff *skb = rqstp->rq_xprt_ctxt; if (skb) { rqstp->rq_xprt_ctxt = NULL; - - dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); consume_skb(skb); } } @@ -174,109 +172,10 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) } } -/* - * send routine intended to be shared by the fore- and back-channel - */ -int svc_send_common(struct socket *sock, struct xdr_buf *xdr, - struct page *headpage, unsigned long headoffset, - struct page *tailpage, unsigned long tailoffset) +static int svc_sock_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) { - int result; - int size; - struct page **ppage = xdr->pages; - size_t base = xdr->page_base; - unsigned int pglen = xdr->page_len; - unsigned int flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; - int slen; - int len = 0; - - slen = xdr->len; - - /* send head */ - if (slen == xdr->head[0].iov_len) - flags = 0; - len = kernel_sendpage(sock, headpage, headoffset, - xdr->head[0].iov_len, flags); - if (len != xdr->head[0].iov_len) - goto out; - slen -= xdr->head[0].iov_len; - if (slen == 0) - goto out; - - /* send page data */ - size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen; - while (pglen > 0) { - if (slen == size) - flags = 0; - result = kernel_sendpage(sock, *ppage, base, size, flags); - if (result > 0) - len += result; - if (result != size) - goto out; - slen -= size; - pglen -= size; - size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen; - base = 0; - ppage++; - } - - /* send tail */ - if (xdr->tail[0].iov_len) { - result = kernel_sendpage(sock, tailpage, tailoffset, - xdr->tail[0].iov_len, 0); - if (result > 0) - len += result; - } - -out: - return len; -} - - -/* - * Generic sendto routine - */ -static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) -{ - struct svc_sock *svsk = - container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - struct socket *sock = svsk->sk_sock; - union { - struct cmsghdr hdr; - long all[SVC_PKTINFO_SPACE / sizeof(long)]; - } buffer; - struct cmsghdr *cmh = &buffer.hdr; - int len = 0; - unsigned long tailoff; - unsigned long headoff; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - - if (rqstp->rq_prot == IPPROTO_UDP) { - struct msghdr msg = { - .msg_name = &rqstp->rq_addr, - .msg_namelen = rqstp->rq_addrlen, - .msg_control = cmh, - .msg_controllen = sizeof(buffer), - .msg_flags = MSG_MORE, - }; - - svc_set_cmsg_data(rqstp, cmh); - - if (sock_sendmsg(sock, &msg) < 0) - goto out; - } - - tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1); - headoff = 0; - len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff, - rqstp->rq_respages[0], tailoff); - -out: - dprintk("svc: socket %p sendto([%p %zu... 
], %d) = %d (addr %s)\n", - svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, - xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); - - return len; + return 0; } /* @@ -316,34 +215,66 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) return len; } +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +static void svc_flush_bvec(const struct bio_vec *bvec, size_t size, size_t seek) +{ + struct bvec_iter bi = { + .bi_size = size + seek, + }; + struct bio_vec bv; + + bvec_iter_advance(bvec, &bi, seek & PAGE_MASK); + for_each_bvec(bv, bvec, bi, bi) + flush_dcache_page(bv.bv_page); +} +#else +static inline void svc_flush_bvec(const struct bio_vec *bvec, size_t size, + size_t seek) +{ +} +#endif + /* - * Generic recvfrom routine. + * Read from @rqstp's transport socket. The incoming message fills whole + * pages in @rqstp's rq_pages array until the last page of the message + * has been received into a partial page. */ -static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, - unsigned int nr, size_t buflen, unsigned int base) +static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, + size_t seek) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct bio_vec *bvec = rqstp->rq_bvec; struct msghdr msg = { NULL }; + unsigned int i; ssize_t len; - - rqstp->rq_xprt_hlen = 0; + size_t t; clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - iov_iter_kvec(&msg.msg_iter, READ, iov, nr, buflen); - if (base != 0) { - iov_iter_advance(&msg.msg_iter, base); - buflen -= base; + + for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) { + bvec[i].bv_page = rqstp->rq_pages[i]; + bvec[i].bv_len = PAGE_SIZE; + bvec[i].bv_offset = 0; + } + rqstp->rq_respages = &rqstp->rq_pages[i]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + iov_iter_bvec(&msg.msg_iter, READ, bvec, i, buflen); + if (seek) { + iov_iter_advance(&msg.msg_iter, seek); + buflen -= seek; } len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); + if (len > 0) + svc_flush_bvec(bvec, len, seek); + /* If we read a full record, then assume there may be more * data to read (stream based sockets only!) */ if (len == buflen) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - dprintk("svc: socket %p recvfrom(%p, %zu) = %zd\n", - svsk, iov[0].iov_base, iov[0].iov_len, len); return len; } @@ -367,9 +298,9 @@ static void svc_sock_setbufsize(struct svc_sock *svsk, unsigned int nreqs) static void svc_sock_secure_port(struct svc_rqst *rqstp) { if (svc_port_is_privileged(svc_addr(rqstp))) - set_bit(RQ_SECURE, &rqstp->rq_flags); + __set_bit(RQ_SECURE, &rqstp->rq_flags); else - clear_bit(RQ_SECURE, &rqstp->rq_flags); + __clear_bit(RQ_SECURE, &rqstp->rq_flags); } /* @@ -380,13 +311,10 @@ static void svc_data_ready(struct sock *sk) struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; if (svsk) { - dprintk("svc: socket %p(inet %p), busy=%d\n", - svsk, sk, - test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); - /* Refer to svc_setup_socket() for details. */ rmb(); svsk->sk_odata(sk); + trace_svcsock_data_ready(&svsk->sk_xprt, 0); if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags)) svc_xprt_enqueue(&svsk->sk_xprt); } @@ -400,11 +328,9 @@ static void svc_write_space(struct sock *sk) struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); if (svsk) { - dprintk("svc: socket %p(inet %p), write_space busy=%d\n", - svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); - /* Refer to svc_setup_socket() for details. 
*/ rmb(); + trace_svcsock_write_space(&svsk->sk_xprt, 0); svsk->sk_owspace(sk); svc_xprt_enqueue(&svsk->sk_xprt); } @@ -421,17 +347,9 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) { - struct svc_sock *svsk; - struct socket *sock; - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - svsk = container_of(xprt, struct svc_sock, sk_xprt); - sock = svsk->sk_sock; - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); + sock_no_linger(svsk->sk_sock->sk); } /* @@ -489,8 +407,15 @@ static int svc_udp_get_dest_address(struct svc_rqst *rqstp, return 0; } -/* - * Receive a datagram from a UDP socket. +/** + * svc_udp_recvfrom - Receive a datagram from a UDP socket. + * @rqstp: request structure into which to receive an RPC Call + * + * Called in a loop when XPT_DATA has been set. + * + * Returns: + * On success, the number of bytes in a received RPC Call, or + * %0 if a complete RPC Call message was not ready to return */ static int svc_udp_recvfrom(struct svc_rqst *rqstp) { @@ -524,20 +449,14 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) svc_sock_setbufsize(svsk, serv->sv_nrthreads + 3); clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - skb = NULL; err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 0, 0, MSG_PEEK | MSG_DONTWAIT); - if (err >= 0) - skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err); - - if (skb == NULL) { - if (err != -EAGAIN) { - /* possibly an icmp error */ - dprintk("svc: recvfrom returned error %d\n", -err); - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - } - return 0; - } + if (err < 0) + goto out_recv_err; + skb = skb_recv_udp(svsk->sk_sk, MSG_DONTWAIT, &err); + if (!skb) + goto out_recv_err; + len = svc_addr_len(svc_addr(rqstp)); rqstp->rq_addrlen = len; if (skb->tstamp == 0) { @@ -548,26 +467,21 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) sock_write_timestamp(svsk->sk_sk, skb->tstamp); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... 
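The reworked svc_udp_recvfrom() above separates the two steps of datagram pickup that the old combined error path obscured: peek first, so the source address and control message reach @rqstp, then consume the skb. In isolation:

	/* step 1: peek fills in msg_name/msg_control without consuming */
	err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 0, 0,
			     MSG_PEEK | MSG_DONTWAIT);
	if (err < 0)
		goto out_recv_err;

	/* step 2: take the datagram itself off the receive queue */
	skb = skb_recv_udp(svsk->sk_sk, MSG_DONTWAIT, &err);
	if (!skb)
		goto out_recv_err;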
*/ - len = skb->len; + len = skb->len; rqstp->rq_arg.len = len; + trace_svcsock_udp_recv(&svsk->sk_xprt, len); rqstp->rq_prot = IPPROTO_UDP; - if (!svc_udp_get_dest_address(rqstp, cmh)) { - net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n", - cmh->cmsg_level, cmh->cmsg_type); - goto out_free; - } + if (!svc_udp_get_dest_address(rqstp, cmh)) + goto out_cmsg_err; rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp)); if (skb_is_nonlinear(skb)) { /* we have to copy */ local_bh_disable(); - if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { - local_bh_enable(); - /* checksum error */ - goto out_free; - } + if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) + goto out_bh_enable; local_bh_enable(); consume_skb(skb); } else { @@ -594,23 +508,86 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (serv->sv_stats) serv->sv_stats->netudpcnt++; + svc_xprt_received(rqstp->rq_xprt); return len; + +out_recv_err: + if (err != -EAGAIN) { + /* possibly an icmp error */ + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + } + trace_svcsock_udp_recv_err(&svsk->sk_xprt, err); + goto out_clear_busy; +out_cmsg_err: + net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n", + cmh->cmsg_level, cmh->cmsg_type); + goto out_free; +out_bh_enable: + local_bh_enable(); out_free: kfree_skb(skb); +out_clear_busy: + svc_xprt_received(rqstp->rq_xprt); return 0; } -static int -svc_udp_sendto(struct svc_rqst *rqstp) +/** + * svc_udp_sendto - Send out a reply on a UDP socket + * @rqstp: completed svc_rqst + * + * xpt_mutex ensures @rqstp's whole message is written to the socket + * without interruption. + * + * Returns the number of bytes sent, or a negative errno. + */ +static int svc_udp_sendto(struct svc_rqst *rqstp) { - int error; + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct xdr_buf *xdr = &rqstp->rq_res; + union { + struct cmsghdr hdr; + long all[SVC_PKTINFO_SPACE / sizeof(long)]; + } buffer; + struct cmsghdr *cmh = &buffer.hdr; + struct msghdr msg = { + .msg_name = &rqstp->rq_addr, + .msg_namelen = rqstp->rq_addrlen, + .msg_control = cmh, + .msg_controllen = sizeof(buffer), + }; + unsigned int sent; + int err; + + svc_udp_release_rqst(rqstp); - error = svc_sendto(rqstp, &rqstp->rq_res); - if (error == -ECONNREFUSED) + svc_set_cmsg_data(rqstp, cmh); + + mutex_lock(&xprt->xpt_mutex); + + if (svc_xprt_is_dead(xprt)) + goto out_notconn; + + err = xdr_alloc_bvec(xdr, GFP_KERNEL); + if (err < 0) + goto out_unlock; + + err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); + if (err == -ECONNREFUSED) { /* ICMP error on earlier request. 
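This retry handles a UDP quirk: an ICMP error triggered by an earlier datagram is reported to the next send as -ECONNREFUSED, so svc_udp_sendto() issues the send once more to get past it. The idiom on its own:

	err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent);
	if (err == -ECONNREFUSED) {
		/* the first send only consumed a queued ICMP error */
		err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent);
	}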
*/ - error = svc_sendto(rqstp, &rqstp->rq_res); + err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); + } + xdr_free_bvec(xdr); + trace_svcsock_udp_send(xprt, err); +out_unlock: + mutex_unlock(&xprt->xpt_mutex); + if (err < 0) + return err; + return sent; - return error; +out_notconn: + mutex_unlock(&xprt->xpt_mutex); + return -ENOTCONN; } static int svc_udp_has_wspace(struct svc_xprt *xprt) @@ -653,7 +630,8 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_create = svc_udp_create, .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, - .xpo_release_rqst = svc_release_udp_skb, + .xpo_result_payload = svc_sock_result_payload, + .xpo_release_rqst = svc_udp_release_rqst, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_udp_has_wspace, @@ -672,8 +650,6 @@ static struct svc_xprt_class svc_udp_class = { static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { - int err, level, optname, one = 1; - svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class, &svsk->sk_xprt, serv); clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); @@ -693,19 +669,14 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { case AF_INET: - level = SOL_IP; - optname = IP_PKTINFO; + ip_sock_set_pktinfo(svsk->sk_sock->sk); break; case AF_INET6: - level = SOL_IPV6; - optname = IPV6_RECVPKTINFO; + ip6_sock_set_recvpktinfo(svsk->sk_sock->sk); break; default: BUG(); } - err = kernel_setsockopt(svsk->sk_sock, level, optname, - (char *)&one, sizeof(one)); - dprintk("svc: kernel_setsockopt returned %d\n", err); } /* @@ -716,9 +687,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - dprintk("svc: socket %p TCP (listen) state change %d\n", - sk, sk->sk_state); - if (svsk) { /* Refer to svc_setup_socket() for details. */ rmb(); @@ -739,8 +707,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk) if (svsk) { set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); - } else - printk("svc: socket %p: no user data\n", sk); + } } } @@ -751,19 +718,13 @@ static void svc_tcp_state_change(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", - sk, sk->sk_state, sk->sk_user_data); - - if (!svsk) - printk("svc: socket %p: no user data\n", sk); - else { + if (svsk) { /* Refer to svc_setup_socket() for details. 
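svc_tcp_state_change() above touches svsk only when sk_user_data is non-NULL, and the rmb() it issues pairs with a write barrier in svc_setup_socket(), which saves the socket's original callbacks before the svc wrappers are installed. A sketch of that setup side (condensed from the pattern, not quoted from this patch):

	sk->sk_user_data = svsk;
	svsk->sk_ostate = sk->sk_state_change;	/* save the originals */
	svsk->sk_odata = sk->sk_data_ready;
	svsk->sk_owspace = sk->sk_write_space;
	wmb();		/* pairs with the rmb() in the svc callbacks */
	sk->sk_state_change = svc_tcp_state_change;	/* install wrappers */
	sk->sk_data_ready = svc_data_ready;
	sk->sk_write_space = svc_write_space;

svc_sock_detach(), further down, restores sk_ostate and friends when the transport is torn down.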
*/ rmb(); svsk->sk_ostate(sk); - if (sk->sk_state != TCP_ESTABLISHED) { - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - } + trace_svcsock_tcp_state(&svsk->sk_xprt, svsk->sk_sock); + if (sk->sk_state != TCP_ESTABLISHED) + svc_xprt_deferred_close(&svsk->sk_xprt); } } @@ -780,9 +741,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) struct socket *newsock; struct svc_sock *newsvsk; int err, slen; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); if (!sock) return NULL; @@ -795,30 +754,18 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) else if (err != -EAGAIN) net_warn_ratelimited("%s: accept failed (err %d)!\n", serv->sv_name, -err); + trace_svcsock_accept_err(xprt, serv->sv_name, err); return NULL; } set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_getpeername(newsock, sin); if (err < 0) { - net_warn_ratelimited("%s: peername failed (err %d)!\n", - serv->sv_name, -err); + trace_svcsock_getpeername_err(xprt, serv->sv_name, err); goto failed; /* aborted connection or whatever */ } slen = err; - /* Ideally, we would want to reject connections from unauthorized - * hosts here, but when we get encryption, the IP of the host won't - * tell us anything. For now just warn about unpriv connections. - */ - if (!svc_port_is_privileged(sin)) { - dprintk("%s: connect from unprivileged port: %s\n", - serv->sv_name, - __svc_print_addr(sin, buf, sizeof(buf))); - } - dprintk("%s: connect from %s\n", serv->sv_name, - __svc_print_addr(sin, buf, sizeof(buf))); - /* Reset the inherited callbacks before calling svc_setup_socket */ newsock->sk->sk_state_change = svsk->sk_ostate; newsock->sk->sk_data_ready = svsk->sk_odata; @@ -836,10 +783,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen); err = kernel_getsockname(newsock, sin); slen = err; - if (unlikely(err < 0)) { - dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); + if (unlikely(err < 0)) slen = offsetof(struct sockaddr, sa_data); - } svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen); if (sock_is_loopback(newsock->sk)) @@ -856,13 +801,14 @@ failed: return NULL; } -static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) +static size_t svc_tcp_restore_pages(struct svc_sock *svsk, + struct svc_rqst *rqstp) { - unsigned int i, len, npages; + size_t len = svsk->sk_datalen; + unsigned int i, npages; - if (svsk->sk_datalen == 0) + if (!len) return 0; - len = svsk->sk_datalen; npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < npages; i++) { if (rqstp->rq_pages[i] != NULL) @@ -911,47 +857,45 @@ out: } /* - * Receive fragment record header. - * If we haven't gotten the record length yet, get the next four bytes. + * Receive fragment record header into sk_marker. */ -static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) +static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, + struct svc_rqst *rqstp) { - struct svc_serv *serv = svsk->sk_xprt.xpt_server; - unsigned int want; - int len; + ssize_t want, len; + /* If we haven't gotten the record length yet, + * get the next four bytes. 
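Those four bytes are the RPC record marker used on stream transports: the high bit flags the final fragment of a record and the low 31 bits carry the fragment length. The svc_sock_reclen() and svc_sock_final_rec() helpers used below amount to the following (a sketch, reflecting this patch's sk_reclen to sk_marker rename):

static inline u32 svc_sock_reclen(struct svc_sock *svsk)
{
	return be32_to_cpu(svsk->sk_marker) & RPC_FRAGMENT_SIZE_MASK;
}

static inline u32 svc_sock_final_rec(struct svc_sock *svsk)
{
	return be32_to_cpu(svsk->sk_marker) & RPC_LAST_STREAM_FRAGMENT;
}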
+ */ if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) { + struct msghdr msg = { NULL }; struct kvec iov; want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; - iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; + iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen; iov.iov_len = want; - len = svc_recvfrom(rqstp, &iov, 1, want, 0); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, want); + len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); if (len < 0) - goto error; + return len; svsk->sk_tcplen += len; - if (len < want) { - dprintk("svc: short recvfrom while reading record " - "length (%d of %d)\n", len, want); - return -EAGAIN; + /* call again to read the remaining bytes */ + goto err_short; } - - dprintk("svc: TCP record, %d bytes\n", svc_sock_reclen(svsk)); + trace_svcsock_marker(&svsk->sk_xprt, svsk->sk_marker); if (svc_sock_reclen(svsk) + svsk->sk_datalen > - serv->sv_max_mesg) { - net_notice_ratelimited("RPC: fragment too large: %d\n", - svc_sock_reclen(svsk)); - goto err_delete; - } + svsk->sk_xprt.xpt_server->sv_max_mesg) + goto err_too_large; } - return svc_sock_reclen(svsk); -error: - dprintk("RPC: TCP recv_record got %d\n", len); - return len; -err_delete: - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + +err_too_large: + net_notice_ratelimited("svc: %s %s RPC fragment too large: %d\n", + __func__, svsk->sk_xprt.xpt_server->sv_name, + svc_sock_reclen(svsk)); + svc_xprt_deferred_close(&svsk->sk_xprt); +err_short: return -EAGAIN; } @@ -1000,87 +944,58 @@ unlock_eagain: return -EAGAIN; } -static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len) -{ - int i = 0; - int t = 0; - - while (t < len) { - vec[i].iov_base = page_address(pages[i]); - vec[i].iov_len = PAGE_SIZE; - i++; - t += PAGE_SIZE; - } - return i; -} - static void svc_tcp_fragment_received(struct svc_sock *svsk) { /* If we have more data, signal svc_xprt_enqueue() to try again */ - dprintk("svc: TCP %s record (%d bytes)\n", - svc_sock_final_rec(svsk) ? "final" : "nonfinal", - svc_sock_reclen(svsk)); svsk->sk_tcplen = 0; - svsk->sk_reclen = 0; + svsk->sk_marker = xdr_zero; } -/* - * Receive data from a TCP socket. +/** + * svc_tcp_recvfrom - Receive data from a TCP socket + * @rqstp: request structure into which to receive an RPC Call + * + * Called in a loop when XPT_DATA has been set. + * + * Read the 4-byte stream record marker, then use the record length + * in that marker to set up exactly the resources needed to receive + * the next RPC message into @rqstp. + * + * Returns: + * On success, the number of bytes in a received RPC Call, or + * %0 if a complete RPC Call message was not ready to return + * + * The zero return case handles partial receives and callback Replies. + * The state of a partial receive is preserved in the svc_sock for + * the next call to svc_tcp_recvfrom. 
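In this accounting, sk_tcplen counts marker plus body bytes received for the current fragment, while sk_datalen accumulates only completed body bytes, so the number of bytes still wanted works out as:

	want = svc_sock_reclen(svsk) - (svsk->sk_tcplen - sizeof(rpc_fraghdr));

	/* e.g. an 8000-byte fragment after the 4-byte marker and 1000
	 * body bytes have arrived: sk_tcplen == 1004, want == 7000 */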
*/ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct svc_serv *serv = svsk->sk_xprt.xpt_server; - int len; - struct kvec *vec; - unsigned int want, base; + size_t want, base; + ssize_t len; __be32 *p; __be32 calldir; - int pnum; - - dprintk("svc: tcp_recv %p data %d conn %d close %d\n", - svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), - test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), - test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - len = svc_tcp_recv_record(svsk, rqstp); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + len = svc_tcp_read_marker(svsk, rqstp); if (len < 0) goto error; base = svc_tcp_restore_pages(svsk, rqstp); - want = svc_sock_reclen(svsk) - (svsk->sk_tcplen - sizeof(rpc_fraghdr)); - - vec = rqstp->rq_vec; - - pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], base + want); - - rqstp->rq_respages = &rqstp->rq_pages[pnum]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* Now receive data */ - len = svc_recvfrom(rqstp, vec, pnum, base + want, base); + want = len - (svsk->sk_tcplen - sizeof(rpc_fraghdr)); + len = svc_tcp_read_msg(rqstp, base + want, base); if (len >= 0) { + trace_svcsock_tcp_recv(&svsk->sk_xprt, len); svsk->sk_tcplen += len; svsk->sk_datalen += len; } - if (len != want || !svc_sock_final_rec(svsk)) { - svc_tcp_save_pages(svsk, rqstp); - if (len < 0 && len != -EAGAIN) - goto err_delete; - if (len == want) - svc_tcp_fragment_received(svsk); - else - dprintk("svc: incomplete TCP record (%d of %d)\n", - (int)(svsk->sk_tcplen - sizeof(rpc_fraghdr)), - svc_sock_reclen(svsk)); - goto err_noclose; - } - - if (svsk->sk_datalen < 8) { - svsk->sk_datalen = 0; - goto err_delete; /* client is nuts. */ - } + if (len != want || !svc_sock_final_rec(svsk)) + goto err_incomplete; + if (svsk->sk_datalen < 8) + goto err_nuts; rqstp->rq_arg.len = svsk->sk_datalen; rqstp->rq_arg.page_base = 0; @@ -1093,9 +1008,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; if (test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags)) - set_bit(RQ_LOCAL, &rqstp->rq_flags); + __set_bit(RQ_LOCAL, &rqstp->rq_flags); else - clear_bit(RQ_LOCAL, &rqstp->rq_flags); + __clear_bit(RQ_LOCAL, &rqstp->rq_flags); p = (__be32 *)rqstp->rq_arg.head[0].iov_base; calldir = p[1]; @@ -1113,50 +1028,167 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) if (serv->sv_stats) serv->sv_stats->nettcpcnt++; + svc_xprt_received(rqstp->rq_xprt); return rqstp->rq_arg.len; +err_incomplete: + svc_tcp_save_pages(svsk, rqstp); + if (len < 0 && len != -EAGAIN) + goto err_delete; + if (len == want) + svc_tcp_fragment_received(svsk); + else + trace_svcsock_tcp_recv_short(&svsk->sk_xprt, + svc_sock_reclen(svsk), + svsk->sk_tcplen - sizeof(rpc_fraghdr)); + goto err_noclose; error: if (len != -EAGAIN) goto err_delete; - dprintk("RPC: TCP recvfrom got EAGAIN\n"); - return 0; + trace_svcsock_tcp_recv_eagain(&svsk->sk_xprt, 0); + goto err_noclose; +err_nuts: + svsk->sk_datalen = 0; err_delete: - printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", - svsk->sk_xprt.xpt_server->sv_name, -len); - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + trace_svcsock_tcp_recv_err(&svsk->sk_xprt, len); + svc_xprt_deferred_close(&svsk->sk_xprt); err_noclose: + svc_xprt_received(rqstp->rq_xprt); return 0; /* record not complete */ } +static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, + int flags) +{ + return kernel_sendpage(sock, virt_to_page(vec->iov_base), + 
offset_in_page(vec->iov_base), + vec->iov_len, flags); +} + /* - * Send out data on TCP socket. + * kernel_sendpage() is used exclusively to reduce the number of + * copy operations in this path. Therefore the caller must ensure + * that the pages backing @xdr are unchanging. + * + * In addition, the logic assumes that * .bv_len is never larger + * than PAGE_SIZE. */ -static int svc_tcp_sendto(struct svc_rqst *rqstp) +static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr, + rpc_fraghdr marker, unsigned int *sentp) { - struct xdr_buf *xbufp = &rqstp->rq_res; - int sent; - __be32 reclen; + const struct kvec *head = xdr->head; + const struct kvec *tail = xdr->tail; + struct kvec rm = { + .iov_base = &marker, + .iov_len = sizeof(marker), + }; + struct msghdr msg = { + .msg_flags = 0, + }; + int ret; + + *sentp = 0; + ret = xdr_alloc_bvec(xdr, GFP_KERNEL); + if (ret < 0) + return ret; + + ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != rm.iov_len) + return -EAGAIN; - /* Set up the first element of the reply kvec. - * Any other kvecs that may be in use have been taken - * care of by the server implementation itself. - */ - reclen = htonl(0x80000000|((xbufp->len ) - 4)); - memcpy(xbufp->head[0].iov_base, &reclen, 4); - - sent = svc_sendto(rqstp, &rqstp->rq_res); - if (sent != xbufp->len) { - printk(KERN_NOTICE - "rpc-srv/tcp: %s: %s %d when sending %d bytes " - "- shutting down socket\n", - rqstp->rq_xprt->xpt_server->sv_name, - (sent<0)?"got error":"sent only", - sent, xbufp->len); - set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags); - svc_xprt_enqueue(rqstp->rq_xprt); - sent = -EAGAIN; + ret = svc_tcp_send_kvec(sock, head, 0); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != head->iov_len) + goto out; + + if (xdr->page_len) { + unsigned int offset, len, remaining; + struct bio_vec *bvec; + + bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT); + offset = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining > 0) { + len = min(remaining, bvec->bv_len - offset); + ret = kernel_sendpage(sock, bvec->bv_page, + bvec->bv_offset + offset, + len, 0); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != len) + goto out; + remaining -= len; + offset = 0; + bvec++; + } + } + + if (tail->iov_len) { + ret = svc_tcp_send_kvec(sock, tail, 0); + if (ret < 0) + return ret; + *sentp += ret; } + +out: + return 0; +} + +/** + * svc_tcp_sendto - Send out a reply on a TCP socket + * @rqstp: completed svc_rqst + * + * xpt_mutex ensures @rqstp's whole message is written to the socket + * without interruption. + * + * Returns the number of bytes sent, or a negative errno. + */ +static int svc_tcp_sendto(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct xdr_buf *xdr = &rqstp->rq_res; + rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | + (u32)xdr->len); + unsigned int sent; + int err; + + svc_tcp_release_rqst(rqstp); + + atomic_inc(&svsk->sk_sendqlen); + mutex_lock(&xprt->xpt_mutex); + if (svc_xprt_is_dead(xprt)) + goto out_notconn; + tcp_sock_set_cork(svsk->sk_sk, true); + err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent); + xdr_free_bvec(xdr); + trace_svcsock_tcp_send(xprt, err < 0 ? 
(long)err : sent); + if (err < 0 || sent != (xdr->len + sizeof(marker))) + goto out_close; + if (atomic_dec_and_test(&svsk->sk_sendqlen)) + tcp_sock_set_cork(svsk->sk_sk, false); + mutex_unlock(&xprt->xpt_mutex); return sent; + +out_notconn: + atomic_dec(&svsk->sk_sendqlen); + mutex_unlock(&xprt->xpt_mutex); + return -ENOTCONN; +out_close: + pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + xprt->xpt_server->sv_name, + (err < 0) ? "got error" : "sent", + (err < 0) ? err : sent, xdr->len); + svc_xprt_deferred_close(xprt); + atomic_dec(&svsk->sk_sendqlen); + mutex_unlock(&xprt->xpt_mutex); + return -EAGAIN; } static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, @@ -1171,7 +1203,8 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, - .xpo_release_rqst = svc_release_skb, + .xpo_result_payload = svc_sock_result_payload, + .xpo_release_rqst = svc_tcp_release_rqst, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_tcp_has_wspace, @@ -1209,23 +1242,21 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { - dprintk("setting up TCP socket for listening\n"); strcpy(svsk->sk_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { - dprintk("setting up TCP socket for reading\n"); sk->sk_state_change = svc_tcp_state_change; sk->sk_data_ready = svc_data_ready; sk->sk_write_space = svc_write_space; - svsk->sk_reclen = 0; + svsk->sk_marker = xdr_zero; svsk->sk_tcplen = 0; svsk->sk_datalen = 0; memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages)); - tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; + tcp_sock_set_nodelay(sk); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); switch (sk->sk_state) { @@ -1233,7 +1264,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) case TCP_ESTABLISHED: break; default: - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + svc_xprt_deferred_close(&svsk->sk_xprt); } } } @@ -1265,7 +1296,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); int err = 0; - dprintk("svc: svc_setup_socket %p\n", sock); svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); if (!svsk) return ERR_PTR(-ENOMEM); @@ -1302,12 +1332,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); - dprintk("svc: svc_setup_socket created %p (inet %p), " - "listen %d close %d\n", - svsk, svsk->sk_sk, - test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags), - test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - + trace_svcsock_new_socket(sock); return svsk; } @@ -1399,12 +1424,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; int family; - int val; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - - dprintk("svc: svc_create_socket(%s, %d, %s)\n", - serv->sv_program->pg_name, protocol, - __svc_print_addr(sin, buf, sizeof(buf))); if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { printk(KERN_WARNING "svc: only UDP and TCP " @@ -1435,11 +1454,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, * getting requests from IPv4 remotes. 
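This hunk, like the svc_udp_init() and svc_tcp_init() changes above (ip_sock_set_pktinfo(), tcp_sock_set_nodelay(), sock_no_linger()), follows the removal of kernel_setsockopt() in favor of typed per-option helpers. Before and after for the v6only case:

	/* before: marshal the option through a char pointer */
	int val = 1;
	kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
			  (char *)&val, sizeof(val));

	/* after: a typed helper taking the struct sock directly */
	ip6_sock_set_v6only(sock->sk);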
Those should * be shunted to a PF_INET listener via rpcbind. */ - val = 1; if (family == PF_INET6) - kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, - (char *)&val, sizeof(val)); - + ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ error = kernel_bind(sock, sin, len); @@ -1464,7 +1480,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); return (struct svc_xprt *)svsk; bummer: - dprintk("svc: svc_create_socket error = %d\n", -error); sock_release(sock); return ERR_PTR(error); } @@ -1478,8 +1493,6 @@ static void svc_sock_detach(struct svc_xprt *xprt) struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sock *sk = svsk->sk_sk; - dprintk("svc: svc_sock_detach(%p)\n", svsk); - /* put back the old socket callbacks */ lock_sock(sk); sk->sk_state_change = svsk->sk_ostate; @@ -1496,8 +1509,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - dprintk("svc: svc_tcp_sock_detach(%p)\n", svsk); - svc_sock_detach(xprt); if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) { @@ -1512,7 +1523,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) static void svc_sock_free(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - dprintk("svc: svc_sock_free(%p)\n", svsk); if (svsk->sk_sock->file) sockfd_put(svsk->sk_sock); diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index d75f17b56f0e..3aad6ef18504 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -60,25 +60,32 @@ rpc_unregister_sysctl(void) } static int proc_do_xprt(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; - size_t len; + ssize_t len; - if ((*ppos && !write) || !*lenp) { + if (write || *ppos) { *lenp = 0; return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + len = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + + if (len < 0) { + *lenp = 0; + return -EINVAL; + } + *lenp = len; + return 0; } static int -proc_dodebug(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp, + loff_t *ppos) { - char tmpbuf[20], c, *s = NULL; - char __user *p; + char tmpbuf[20], *s = NULL; + char *p; unsigned int value; size_t left, len; @@ -90,18 +97,17 @@ proc_dodebug(struct ctl_table *table, int write, left = *lenp; if (write) { - if (!access_ok(buffer, left)) - return -EFAULT; p = buffer; - while (left && __get_user(c, p) >= 0 && isspace(c)) - left--, p++; + while (left && isspace(*p)) { + left--; + p++; + } if (!left) goto done; if (left > sizeof(tmpbuf) - 1) return -EINVAL; - if (copy_from_user(tmpbuf, p, left)) - return -EFAULT; + memcpy(tmpbuf, p, left); tmpbuf[left] = '\0'; value = simple_strtol(tmpbuf, &s, 0); @@ -109,8 +115,10 @@ proc_dodebug(struct ctl_table *table, int write, left -= (s - tmpbuf); if (left && !isspace(*s)) return -EINVAL; - while (left && isspace(*s)) - left--, s++; + while (left && isspace(*s)) { + left--; + s++; + } } else left = 0; *(unsigned int *) table->data = value; @@ -121,11 +129,9 @@ proc_dodebug(struct ctl_table *table, int write, len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data); if (len > left) len = left; - if 
(copy_to_user(buffer, tmpbuf, len)) - return -EFAULT; + memcpy(buffer, tmpbuf, len); if ((left -= len) > 0) { - if (put_user('\n', (char __user *)buffer + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; left--; } } diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c new file mode 100644 index 000000000000..c1f559892ae8 --- /dev/null +++ b/net/sunrpc/sysfs.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Anna Schumaker <Anna.Schumaker@Netapp.com> + */ +#include <linux/sunrpc/clnt.h> +#include <linux/kobject.h> +#include <linux/sunrpc/addr.h> +#include <linux/sunrpc/xprtsock.h> + +#include "sysfs.h" + +struct xprt_addr { + const char *addr; + struct rcu_head rcu; +}; + +static void free_xprt_addr(struct rcu_head *head) +{ + struct xprt_addr *addr = container_of(head, struct xprt_addr, rcu); + + kfree(addr->addr); + kfree(addr); +} + +static struct kset *rpc_sunrpc_kset; +static struct kobject *rpc_sunrpc_client_kobj, *rpc_sunrpc_xprt_switch_kobj; + +static void rpc_sysfs_object_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct kobj_ns_type_operations * +rpc_sysfs_object_child_ns_type(struct kobject *kobj) +{ + return &net_ns_type_operations; +} + +static struct kobj_type rpc_sysfs_object_type = { + .release = rpc_sysfs_object_release, + .sysfs_ops = &kobj_sysfs_ops, + .child_ns_type = rpc_sysfs_object_child_ns_type, +}; + +static struct kobject *rpc_sysfs_object_alloc(const char *name, + struct kset *kset, + struct kobject *parent) +{ + struct kobject *kobj; + + kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + if (kobj) { + kobj->kset = kset; + if (kobject_init_and_add(kobj, &rpc_sysfs_object_type, + parent, "%s", name) == 0) + return kobj; + kobject_put(kobj); + } + return NULL; +} + +static inline struct rpc_xprt * +rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj) +{ + struct rpc_sysfs_xprt *x = container_of(kobj, + struct rpc_sysfs_xprt, kobject); + + return xprt_get(x->xprt); +} + +static inline struct rpc_xprt_switch * +rpc_sysfs_xprt_kobj_get_xprt_switch(struct kobject *kobj) +{ + struct rpc_sysfs_xprt *x = container_of(kobj, + struct rpc_sysfs_xprt, kobject); + + return xprt_switch_get(x->xprt_switch); +} + +static inline struct rpc_xprt_switch * +rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj) +{ + struct rpc_sysfs_xprt_switch *x = container_of(kobj, + struct rpc_sysfs_xprt_switch, kobject); + + return xprt_switch_get(x->xprt_switch); +} + +static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + ssize_t ret; + + if (!xprt) { + ret = sprintf(buf, "<closed>\n"); + goto out; + } + ret = sprintf(buf, "%s\n", xprt->address_strings[RPC_DISPLAY_ADDR]); + xprt_put(xprt); +out: + return ret; +} + +static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + size_t buflen = PAGE_SIZE; + ssize_t ret; + + if (!xprt || !xprt_connected(xprt)) { + ret = sprintf(buf, "<closed>\n"); + } else if (xprt->ops->get_srcaddr) { + ret = xprt->ops->get_srcaddr(xprt, buf, buflen); + if (ret > 0) { + if (ret < buflen - 1) { + buf[ret] = '\n'; + ret++; + buf[ret] = '\0'; + } + } else + ret = sprintf(buf, "<closed>\n"); + } else + ret = sprintf(buf, "<not a socket>\n"); + xprt_put(xprt); + return ret; +} + +static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, + struct kobj_attribute *attr, char 
*buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + unsigned short srcport = 0; + size_t buflen = PAGE_SIZE; + ssize_t ret; + + if (!xprt || !xprt_connected(xprt)) { + ret = sprintf(buf, "<closed>\n"); + goto out; + } + + if (xprt->ops->get_srcport) + srcport = xprt->ops->get_srcport(xprt); + + ret = snprintf(buf, buflen, + "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" + "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" + "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" + "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n" + "tasks_queuelen=%ld\ndst_port=%s\n", + xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs, + xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen, + xprt->sending.qlen, xprt->pending.qlen, + xprt->backlog.qlen, xprt->main, srcport, + atomic_long_read(&xprt->queuelen), + xprt->address_strings[RPC_DISPLAY_PORT]); +out: + xprt_put(xprt); + return ret; +} + +static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + ssize_t ret; + int locked, connected, connecting, close_wait, bound, binding, + closing, congested, cwnd_wait, write_space, offline, remove; + + if (!(xprt && xprt->state)) { + ret = sprintf(buf, "state=CLOSED\n"); + } else { + locked = test_bit(XPRT_LOCKED, &xprt->state); + connected = test_bit(XPRT_CONNECTED, &xprt->state); + connecting = test_bit(XPRT_CONNECTING, &xprt->state); + close_wait = test_bit(XPRT_CLOSE_WAIT, &xprt->state); + bound = test_bit(XPRT_BOUND, &xprt->state); + binding = test_bit(XPRT_BINDING, &xprt->state); + closing = test_bit(XPRT_CLOSING, &xprt->state); + congested = test_bit(XPRT_CONGESTED, &xprt->state); + cwnd_wait = test_bit(XPRT_CWND_WAIT, &xprt->state); + write_space = test_bit(XPRT_WRITE_SPACE, &xprt->state); + offline = test_bit(XPRT_OFFLINE, &xprt->state); + remove = test_bit(XPRT_REMOVE, &xprt->state); + + ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s %s\n", + locked ? "LOCKED" : "", + connected ? "CONNECTED" : "", + connecting ? "CONNECTING" : "", + close_wait ? "CLOSE_WAIT" : "", + bound ? "BOUND" : "", + binding ? "BOUNDING" : "", + closing ? "CLOSING" : "", + congested ? "CONGESTED" : "", + cwnd_wait ? "CWND_WAIT" : "", + write_space ? "WRITE_SPACE" : "", + offline ? "OFFLINE" : "", + remove ? 
"REMOVE" : ""); + } + + xprt_put(xprt); + return ret; +} + +static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt_switch *xprt_switch = + rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); + ssize_t ret; + + if (!xprt_switch) + return 0; + ret = sprintf(buf, "num_xprts=%u\nnum_active=%u\n" + "num_unique_destaddr=%u\nqueue_len=%ld\n", + xprt_switch->xps_nxprts, xprt_switch->xps_nactive, + xprt_switch->xps_nunique_destaddr_xprts, + atomic_long_read(&xprt_switch->xps_queuelen)); + xprt_switch_put(xprt_switch); + return ret; +} + +static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + struct sockaddr *saddr; + char *dst_addr; + int port; + struct xprt_addr *saved_addr; + size_t buf_len; + + if (!xprt) + return 0; + if (!(xprt->xprt_class->ident == XPRT_TRANSPORT_TCP || + xprt->xprt_class->ident == XPRT_TRANSPORT_RDMA)) { + xprt_put(xprt); + return -EOPNOTSUPP; + } + + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { + count = -EINTR; + goto out_put; + } + saddr = (struct sockaddr *)&xprt->addr; + port = rpc_get_port(saddr); + + /* buf_len is the len until the first occurence of either + * '\n' or '\0' + */ + buf_len = strcspn(buf, "\n"); + + dst_addr = kstrndup(buf, buf_len, GFP_KERNEL); + if (!dst_addr) + goto out_err; + saved_addr = kzalloc(sizeof(*saved_addr), GFP_KERNEL); + if (!saved_addr) + goto out_err_free; + saved_addr->addr = + rcu_dereference_raw(xprt->address_strings[RPC_DISPLAY_ADDR]); + rcu_assign_pointer(xprt->address_strings[RPC_DISPLAY_ADDR], dst_addr); + call_rcu(&saved_addr->rcu, free_xprt_addr); + xprt->addrlen = rpc_pton(xprt->xprt_net, buf, buf_len, saddr, + sizeof(*saddr)); + rpc_set_port(saddr, port); + + xprt_force_disconnect(xprt); +out: + xprt_release_write(xprt, NULL); +out_put: + xprt_put(xprt); + return count; +out_err_free: + kfree(dst_addr); +out_err: + count = -ENOMEM; + goto out; +} + +static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + int offline = 0, online = 0, remove = 0; + struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); + + if (!xprt || !xps) { + count = 0; + goto out_put; + } + + if (!strncmp(buf, "offline", 7)) + offline = 1; + else if (!strncmp(buf, "online", 6)) + online = 1; + else if (!strncmp(buf, "remove", 6)) + remove = 1; + else { + count = -EINVAL; + goto out_put; + } + + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { + count = -EINTR; + goto out_put; + } + if (xprt->main) { + count = -EINVAL; + goto release_tasks; + } + if (offline) { + xprt_set_offline_locked(xprt, xps); + } else if (online) { + xprt_set_online_locked(xprt, xps); + } else if (remove) { + if (test_bit(XPRT_OFFLINE, &xprt->state)) + xprt_delete_locked(xprt, xps); + else + count = -EINVAL; + } + +release_tasks: + xprt_release_write(xprt, NULL); +out_put: + xprt_put(xprt); + xprt_switch_put(xps); + return count; +} + +int rpc_sysfs_init(void) +{ + rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); + if (!rpc_sunrpc_kset) + return -ENOMEM; + rpc_sunrpc_client_kobj = + rpc_sysfs_object_alloc("rpc-clients", rpc_sunrpc_kset, NULL); + if (!rpc_sunrpc_client_kobj) + goto err_client; + rpc_sunrpc_xprt_switch_kobj = + rpc_sysfs_object_alloc("xprt-switches", 
rpc_sunrpc_kset, NULL); + if (!rpc_sunrpc_xprt_switch_kobj) + goto err_switch; + return 0; +err_switch: + kobject_put(rpc_sunrpc_client_kobj); + rpc_sunrpc_client_kobj = NULL; +err_client: + kset_unregister(rpc_sunrpc_kset); + rpc_sunrpc_kset = NULL; + return -ENOMEM; +} + +static void rpc_sysfs_client_release(struct kobject *kobj) +{ + struct rpc_sysfs_client *c; + + c = container_of(kobj, struct rpc_sysfs_client, kobject); + kfree(c); +} + +static void rpc_sysfs_xprt_switch_release(struct kobject *kobj) +{ + struct rpc_sysfs_xprt_switch *xprt_switch; + + xprt_switch = container_of(kobj, struct rpc_sysfs_xprt_switch, kobject); + kfree(xprt_switch); +} + +static void rpc_sysfs_xprt_release(struct kobject *kobj) +{ + struct rpc_sysfs_xprt *xprt; + + xprt = container_of(kobj, struct rpc_sysfs_xprt, kobject); + kfree(xprt); +} + +static const void *rpc_sysfs_client_namespace(struct kobject *kobj) +{ + return container_of(kobj, struct rpc_sysfs_client, kobject)->net; +} + +static const void *rpc_sysfs_xprt_switch_namespace(struct kobject *kobj) +{ + return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net; +} + +static const void *rpc_sysfs_xprt_namespace(struct kobject *kobj) +{ + return container_of(kobj, struct rpc_sysfs_xprt, + kobject)->xprt->xprt_net; +} + +static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, + 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); + +static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr, + 0644, rpc_sysfs_xprt_srcaddr_show, NULL); + +static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, + 0444, rpc_sysfs_xprt_info_show, NULL); + +static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, + 0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change); + +static struct attribute *rpc_sysfs_xprt_attrs[] = { + &rpc_sysfs_xprt_dstaddr.attr, + &rpc_sysfs_xprt_srcaddr.attr, + &rpc_sysfs_xprt_info.attr, + &rpc_sysfs_xprt_change_state.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rpc_sysfs_xprt); + +static struct kobj_attribute rpc_sysfs_xprt_switch_info = + __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL); + +static struct attribute *rpc_sysfs_xprt_switch_attrs[] = { + &rpc_sysfs_xprt_switch_info.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rpc_sysfs_xprt_switch); + +static struct kobj_type rpc_sysfs_client_type = { + .release = rpc_sysfs_client_release, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = rpc_sysfs_client_namespace, +}; + +static struct kobj_type rpc_sysfs_xprt_switch_type = { + .release = rpc_sysfs_xprt_switch_release, + .default_groups = rpc_sysfs_xprt_switch_groups, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = rpc_sysfs_xprt_switch_namespace, +}; + +static struct kobj_type rpc_sysfs_xprt_type = { + .release = rpc_sysfs_xprt_release, + .default_groups = rpc_sysfs_xprt_groups, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = rpc_sysfs_xprt_namespace, +}; + +void rpc_sysfs_exit(void) +{ + kobject_put(rpc_sunrpc_client_kobj); + kobject_put(rpc_sunrpc_xprt_switch_kobj); + kset_unregister(rpc_sunrpc_kset); +} + +static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent, + struct net *net, + int clid) +{ + struct rpc_sysfs_client *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (p) { + p->net = net; + p->kobject.kset = rpc_sunrpc_kset; + if (kobject_init_and_add(&p->kobject, &rpc_sysfs_client_type, + parent, "clnt-%d", clid) == 0) + return p; + kobject_put(&p->kobject); + } + return NULL; +} + +static struct rpc_sysfs_xprt_switch * 
+rpc_sysfs_xprt_switch_alloc(struct kobject *parent, + struct rpc_xprt_switch *xprt_switch, + struct net *net, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt_switch *p; + + p = kzalloc(sizeof(*p), gfp_flags); + if (p) { + p->net = net; + p->kobject.kset = rpc_sunrpc_kset; + if (kobject_init_and_add(&p->kobject, + &rpc_sysfs_xprt_switch_type, + parent, "switch-%d", + xprt_switch->xps_id) == 0) + return p; + kobject_put(&p->kobject); + } + return NULL; +} + +static struct rpc_sysfs_xprt *rpc_sysfs_xprt_alloc(struct kobject *parent, + struct rpc_xprt *xprt, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt *p; + + p = kzalloc(sizeof(*p), gfp_flags); + if (!p) + goto out; + p->kobject.kset = rpc_sunrpc_kset; + if (kobject_init_and_add(&p->kobject, &rpc_sysfs_xprt_type, + parent, "xprt-%d-%s", xprt->id, + xprt->address_strings[RPC_DISPLAY_PROTO]) == 0) + return p; + kobject_put(&p->kobject); +out: + return NULL; +} + +void rpc_sysfs_client_setup(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xprt_switch, + struct net *net) +{ + struct rpc_sysfs_client *rpc_client; + struct rpc_sysfs_xprt_switch *xswitch = + (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; + + if (!xswitch) + return; + + rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj, + net, clnt->cl_clid); + if (rpc_client) { + char name[] = "switch"; + int ret; + + clnt->cl_sysfs = rpc_client; + rpc_client->clnt = clnt; + rpc_client->xprt_switch = xprt_switch; + kobject_uevent(&rpc_client->kobject, KOBJ_ADD); + ret = sysfs_create_link_nowarn(&rpc_client->kobject, + &xswitch->kobject, name); + if (ret) + pr_warn("can't create link to %s in sysfs (%d)\n", + name, ret); + } +} + +void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt_switch *rpc_xprt_switch; + struct net *net; + + if (xprt_switch->xps_net) + net = xprt_switch->xps_net; + else + net = xprt->xprt_net; + rpc_xprt_switch = + rpc_sysfs_xprt_switch_alloc(rpc_sunrpc_xprt_switch_kobj, + xprt_switch, net, gfp_flags); + if (rpc_xprt_switch) { + xprt_switch->xps_sysfs = rpc_xprt_switch; + rpc_xprt_switch->xprt_switch = xprt_switch; + rpc_xprt_switch->xprt = xprt; + kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_ADD); + } else { + xprt_switch->xps_sysfs = NULL; + } +} + +void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt *rpc_xprt; + struct rpc_sysfs_xprt_switch *switch_obj = + (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; + + if (!switch_obj) + return; + + rpc_xprt = rpc_sysfs_xprt_alloc(&switch_obj->kobject, xprt, gfp_flags); + if (rpc_xprt) { + xprt->xprt_sysfs = rpc_xprt; + rpc_xprt->xprt = xprt; + rpc_xprt->xprt_switch = xprt_switch; + kobject_uevent(&rpc_xprt->kobject, KOBJ_ADD); + } +} + +void rpc_sysfs_client_destroy(struct rpc_clnt *clnt) +{ + struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs; + + if (rpc_client) { + char name[] = "switch"; + + sysfs_remove_link(&rpc_client->kobject, name); + kobject_uevent(&rpc_client->kobject, KOBJ_REMOVE); + kobject_del(&rpc_client->kobject); + kobject_put(&rpc_client->kobject); + clnt->cl_sysfs = NULL; + } +} + +void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt_switch) +{ + struct rpc_sysfs_xprt_switch *rpc_xprt_switch = xprt_switch->xps_sysfs; + + if (rpc_xprt_switch) { + kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_REMOVE); + kobject_del(&rpc_xprt_switch->kobject); + kobject_put(&rpc_xprt_switch->kobject); + xprt_switch->xps_sysfs = 
NULL; + } +} + +void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt) +{ + struct rpc_sysfs_xprt *rpc_xprt = xprt->xprt_sysfs; + + if (rpc_xprt) { + kobject_uevent(&rpc_xprt->kobject, KOBJ_REMOVE); + kobject_del(&rpc_xprt->kobject); + kobject_put(&rpc_xprt->kobject); + xprt->xprt_sysfs = NULL; + } +} diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h new file mode 100644 index 000000000000..6620cebd1037 --- /dev/null +++ b/net/sunrpc/sysfs.h @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Anna Schumaker <Anna.Schumaker@Netapp.com> + */ +#ifndef __SUNRPC_SYSFS_H +#define __SUNRPC_SYSFS_H + +struct rpc_sysfs_client { + struct kobject kobject; + struct net *net; + struct rpc_clnt *clnt; + struct rpc_xprt_switch *xprt_switch; +}; + +struct rpc_sysfs_xprt_switch { + struct kobject kobject; + struct net *net; + struct rpc_xprt_switch *xprt_switch; + struct rpc_xprt *xprt; +}; + +struct rpc_sysfs_xprt { + struct kobject kobject; + struct rpc_xprt *xprt; + struct rpc_xprt_switch *xprt_switch; +}; + +int rpc_sysfs_init(void); +void rpc_sysfs_exit(void); + +void rpc_sysfs_client_setup(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xprt_switch, + struct net *net); +void rpc_sysfs_client_destroy(struct rpc_clnt *clnt); +void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, gfp_t gfp_flags); +void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt); +void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, gfp_t gfp_flags); +void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt); + +#endif diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index e5497dc2475b..336a7c7833e4 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -19,6 +19,9 @@ #include <linux/bvec.h> #include <trace/events/sunrpc.h> +static void _copy_to_pages(struct page **, size_t, const char *, size_t); + + /* * XDR functions for basic NFS types */ @@ -120,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); * @len: length of string, in bytes * */ -void -xdr_terminate_string(struct xdr_buf *buf, const u32 len) +void xdr_terminate_string(const struct xdr_buf *buf, const u32 len) { char *kaddr; @@ -131,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); -size_t -xdr_buf_pagecount(struct xdr_buf *buf) +size_t xdr_buf_pagecount(const struct xdr_buf *buf) { if (!buf->page_len) return 0; @@ -190,9 +191,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, tail->iov_base = buf + offset; tail->iov_len = buflen - offset; - if ((xdr->page_len & 3) == 0) - tail->iov_len -= sizeof(__be32); - xdr->buflen += len; } EXPORT_SYMBOL_GPL(xdr_inline_pages); @@ -202,6 +200,71 @@ EXPORT_SYMBOL_GPL(xdr_inline_pages); */ /** + * _shift_data_left_pages + * @pages: vector of pages containing both the source and dest memory area. + * @pgto_base: page vector address of destination + * @pgfrom_base: page vector address of source + * @len: number of bytes to copy + * + * Note: the addresses pgto_base and pgfrom_base are both calculated in + * the same way: + * if a memory area starts at byte 'base' in page 'pages[i]', + * then its address is given as (i << PAGE_CACHE_SHIFT) + base + * Alse note: pgto_base must be < pgfrom_base, but the memory areas + * they point to may overlap. 
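The addressing convention described here is shared by all the page-shift helpers in this file: a linear byte offset into the page vector splits into a page index and an in-page offset, and each copy chunk stops at the nearer page boundary on either side. In outline (off_to and off_from are illustrative names):

	/* split a linear offset into (page, byte-within-page) */
	struct page **pg = pages + (base >> PAGE_SHIFT);
	size_t off = base & ~PAGE_MASK;

	/* a chunk may not cross a page boundary on either side */
	copy = min3(len, PAGE_SIZE - off_to, PAGE_SIZE - off_from);

When source and destination land in the same page, the helper switches from memcpy() to memmove() on a single kmap_atomic() mapping, since the regions may overlap.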
+ */ +static void +_shift_data_left_pages(struct page **pages, size_t pgto_base, + size_t pgfrom_base, size_t len) +{ + struct page **pgfrom, **pgto; + char *vfrom, *vto; + size_t copy; + + BUG_ON(pgfrom_base <= pgto_base); + + if (!len) + return; + + pgto = pages + (pgto_base >> PAGE_SHIFT); + pgfrom = pages + (pgfrom_base >> PAGE_SHIFT); + + pgto_base &= ~PAGE_MASK; + pgfrom_base &= ~PAGE_MASK; + + do { + if (pgto_base >= PAGE_SIZE) { + pgto_base = 0; + pgto++; + } + if (pgfrom_base >= PAGE_SIZE){ + pgfrom_base = 0; + pgfrom++; + } + + copy = len; + if (copy > (PAGE_SIZE - pgto_base)) + copy = PAGE_SIZE - pgto_base; + if (copy > (PAGE_SIZE - pgfrom_base)) + copy = PAGE_SIZE - pgfrom_base; + + vto = kmap_atomic(*pgto); + if (*pgto != *pgfrom) { + vfrom = kmap_atomic(*pgfrom); + memcpy(vto + pgto_base, vfrom + pgfrom_base, copy); + kunmap_atomic(vfrom); + } else + memmove(vto + pgto_base, vto + pgfrom_base, copy); + flush_dcache_page(*pgto); + kunmap_atomic(vto); + + pgto_base += copy; + pgfrom_base += copy; + + } while ((len -= copy) != 0); +} + +/** * _shift_data_right_pages * @pages: vector of pages containing both the source and dest memory area. * @pgto_base: page vector address of destination @@ -225,6 +288,9 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, BUG_ON(pgto_base <= pgfrom_base); + if (!len) + return; + pgto_base += len; pgfrom_base += len; @@ -283,6 +349,9 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) char *vto; size_t copy; + if (!len) + return; + pgto = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -327,6 +396,9 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) char *vfrom; size_t copy; + if (!len) + return; + pgfrom = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -350,148 +422,451 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) } EXPORT_SYMBOL_GPL(_copy_from_pages); +static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base, + unsigned int len) +{ + if (base >= iov->iov_len) + return; + if (len > iov->iov_len - base) + len = iov->iov_len - base; + memset(iov->iov_base + base, 0, len); +} + /** - * xdr_shrink_bufhead + * xdr_buf_pages_zero * @buf: xdr_buf - * @len: bytes to remove from buf->head[0] - * - * Shrinks XDR buffer's header kvec buf->head[0] by - * 'len' bytes. The extra data is not lost, but is instead - * moved into the inlined pages and/or the tail. 
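All of the shrink and shift helpers here operate on the three-segment xdr_buf layout: a head kvec, then page_len bytes of page data starting at page_base in the page vector, then a tail kvec. buf->len is the portion in use and buf->buflen the allocated capacity, so the rough invariant these helpers preserve is:

	sum = buf->head[0].iov_len + buf->page_len + buf->tail[0].iov_len;
	/* bytes in use never exceed the segment lengths, which in
	 * turn never exceed the allocated capacity */
	WARN_ON(buf->len > sum || sum > buf->buflen);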
+ * @pgbase: beginning offset + * @len: length */ -static unsigned int -xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) +static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase, + unsigned int len) { - struct kvec *head, *tail; - size_t copy, offs; - unsigned int pglen = buf->page_len; - unsigned int result; + struct page **pages = buf->pages; + struct page **page; + char *vpage; + unsigned int zero; - result = 0; - tail = buf->tail; - head = buf->head; + if (!len) + return; + if (pgbase >= buf->page_len) { + xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len); + return; + } + if (pgbase + len > buf->page_len) { + xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len); + len = buf->page_len - pgbase; + } - WARN_ON_ONCE(len > head->iov_len); - if (len > head->iov_len) - len = head->iov_len; - - /* Shift the tail first */ - if (tail->iov_len != 0) { - if (tail->iov_len > len) { - copy = tail->iov_len - len; - memmove((char *)tail->iov_base + len, - tail->iov_base, copy); - result += copy; - } - /* Copy from the inlined pages into the tail */ - copy = len; - if (copy > pglen) - copy = pglen; - offs = len - copy; - if (offs >= tail->iov_len) - copy = 0; - else if (copy > tail->iov_len - offs) - copy = tail->iov_len - offs; - if (copy != 0) { - _copy_from_pages((char *)tail->iov_base + offs, - buf->pages, - buf->page_base + pglen + offs - len, - copy); - result += copy; + pgbase += buf->page_base; + + page = pages + (pgbase >> PAGE_SHIFT); + pgbase &= ~PAGE_MASK; + + do { + zero = PAGE_SIZE - pgbase; + if (zero > len) + zero = len; + + vpage = kmap_atomic(*page); + memset(vpage + pgbase, 0, zero); + kunmap_atomic(vpage); + + flush_dcache_page(*page); + pgbase = 0; + page++; + + } while ((len -= zero) != 0); +} + +static unsigned int xdr_buf_pages_fill_sparse(const struct xdr_buf *buf, + unsigned int buflen, gfp_t gfp) +{ + unsigned int i, npages, pagelen; + + if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + return buflen; + if (buflen <= buf->head->iov_len) + return buflen; + pagelen = buflen - buf->head->iov_len; + if (pagelen > buf->page_len) + pagelen = buf->page_len; + npages = (pagelen + buf->page_base + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + if (!buf->pages[i]) + continue; + buf->pages[i] = alloc_page(gfp); + if (likely(buf->pages[i])) + continue; + buflen -= pagelen; + pagelen = i << PAGE_SHIFT; + if (pagelen > buf->page_base) + buflen += pagelen - buf->page_base; + break; + } + return buflen; +} + +static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len) +{ + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + unsigned int sum = head->iov_len + buf->page_len + tail->iov_len; + unsigned int free_space, newlen; + + if (sum > buf->len) { + free_space = min_t(unsigned int, sum - buf->len, len); + newlen = xdr_buf_pages_fill_sparse(buf, buf->len + free_space, + GFP_KERNEL); + free_space = newlen - buf->len; + buf->len = newlen; + len -= free_space; + if (!len) + return; + } + + if (buf->buflen > sum) { + /* Expand the tail buffer */ + free_space = min_t(unsigned int, buf->buflen - sum, len); + tail->iov_len += free_space; + buf->len += free_space; + } +} + +static void xdr_buf_tail_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + + if (to >= tail->iov_len) + return; + if (len + to > tail->iov_len) + len = tail->iov_len - to; + memmove(tail->iov_base + to, tail->iov_base + base, len); 
+} + +static void xdr_buf_pages_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0; + unsigned int talen = 0, tato = 0; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + if (to >= buf->page_len) { + tato = to - buf->page_len; + if (tail->iov_len >= len + tato) + talen = len; + else if (tail->iov_len > tato) + talen = tail->iov_len - tato; + } else if (len + to >= buf->page_len) { + pglen = buf->page_len - to; + talen = len - pglen; + if (talen > tail->iov_len) + talen = tail->iov_len; + } else + pglen = len; + + _copy_from_pages(tail->iov_base + tato, buf->pages, + buf->page_base + base + pglen, talen); + _shift_data_right_pages(buf->pages, buf->page_base + to, + buf->page_base + base, pglen); +} + +static void xdr_buf_head_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0, pgto = 0; + unsigned int talen = 0, tato = 0; + + if (base >= head->iov_len) + return; + if (len > head->iov_len - base) + len = head->iov_len - base; + if (to >= buf->page_len + head->iov_len) { + tato = to - buf->page_len - head->iov_len; + talen = len; + } else if (to >= head->iov_len) { + pgto = to - head->iov_len; + pglen = len; + if (pgto + pglen > buf->page_len) { + talen = pgto + pglen - buf->page_len; + pglen -= talen; } - /* Do we also need to copy data from the head into the tail ? */ - if (len > pglen) { - offs = copy = len - pglen; - if (copy > tail->iov_len) - copy = tail->iov_len; - memcpy(tail->iov_base, - (char *)head->iov_base + - head->iov_len - offs, - copy); - result += copy; + } else { + pglen = len - to; + if (pglen > buf->page_len) { + talen = pglen - buf->page_len; + pglen = buf->page_len; } } - /* Now handle pages */ - if (pglen != 0) { - if (pglen > len) - _shift_data_right_pages(buf->pages, - buf->page_base + len, - buf->page_base, - pglen - len); - copy = len; - if (len > pglen) - copy = pglen; - _copy_to_pages(buf->pages, buf->page_base, - (char *)head->iov_base + head->iov_len - len, - copy); - result += copy; + + len -= talen; + base += len; + if (talen + tato > tail->iov_len) + talen = tail->iov_len > tato ? tail->iov_len - tato : 0; + memcpy(tail->iov_base + tato, head->iov_base + base, talen); + + len -= pglen; + base -= pglen; + _copy_to_pages(buf->pages, buf->page_base + pgto, head->iov_base + base, + pglen); + + base -= len; + memmove(head->iov_base + to, head->iov_base + base, len); +} + +static void xdr_buf_tail_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len || !shift || !len) + return; + xdr_buf_tail_copy_right(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift); + return; } - head->iov_len -= len; - buf->buflen -= len; - /* Have we truncated the message? 
*/ - if (buf->len > buf->buflen) - buf->len = buf->buflen; + if (base + len > buf->page_len) + xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len, + shift); + xdr_buf_pages_copy_right(buf, base, len, shift); +} + +static void xdr_buf_head_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; - return result; + if (!shift) + return; + if (base >= head->iov_len) { + xdr_buf_pages_shift_right(buf, head->iov_len - base, len, + shift); + return; + } + if (base + len > head->iov_len) + xdr_buf_pages_shift_right(buf, 0, base + len - head->iov_len, + shift); + xdr_buf_head_copy_right(buf, base, len, shift); +} + +static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, + unsigned int len, unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len) + return; + if (len > tail->iov_len - base) + len = tail->iov_len - base; + /* Shift data into head */ + if (shift > buf->page_len + base) { + const struct kvec *head = buf->head; + unsigned int hdto = + head->iov_len + buf->page_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + buf->page_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + /* Shift data into pages */ + if (shift > base) { + unsigned int pgto = buf->page_len + base - shift; + unsigned int pglen = len; + + if (pgto + pglen > buf->page_len) + pglen = buf->page_len - pgto; + _copy_to_pages(buf->pages, buf->page_base + pgto, + tail->iov_base + base, pglen); + base += pglen; + len -= pglen; + if (!len) + return; + } + memmove(tail->iov_base + base - shift, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + unsigned int pgto; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + /* Shift data into head */ + if (shift > base) { + const struct kvec *head = buf->head; + unsigned int hdto = head->iov_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + _copy_from_pages(head->iov_base + hdto, buf->pages, + buf->page_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + pgto = base - shift; + _shift_data_left_pages(buf->pages, buf->page_base + pgto, + buf->page_base + base, len); +} + +static void xdr_buf_tail_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + xdr_buf_tail_copy_left(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift); + return; + } + xdr_buf_pages_copy_left(buf, base, len, shift); + len += base; + if (len <= buf->page_len) + return; + xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift); +} + +static void xdr_buf_head_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec 
*head = buf->head; + unsigned int bytes; + + if (!shift || !len) + return; + + if (shift > base) { + bytes = (shift - base); + if (bytes >= len) + return; + base += bytes; + len -= bytes; + } + + if (base < head->iov_len) { + bytes = min_t(unsigned int, len, head->iov_len - base); + memmove(head->iov_base + (base - shift), + head->iov_base + base, bytes); + base += bytes; + len -= bytes; + } + xdr_buf_pages_shift_left(buf, base - head->iov_len, len, shift); +} + +/** + * xdr_shrink_bufhead + * @buf: xdr_buf + * @len: new length of buf->head[0] + * + * Shrinks XDR buffer's header kvec buf->head[0], setting it to + * 'len' bytes. The extra data is not lost, but is instead + * moved into the inlined pages and/or the tail. + */ +static unsigned int xdr_shrink_bufhead(struct xdr_buf *buf, unsigned int len) +{ + struct kvec *head = buf->head; + unsigned int shift, buflen = max(buf->len, len); + + WARN_ON_ONCE(len > head->iov_len); + if (head->iov_len > buflen) { + buf->buflen -= head->iov_len - buflen; + head->iov_len = buflen; + } + if (len >= head->iov_len) + return 0; + shift = head->iov_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_head_shift_right(buf, len, buflen - len, shift); + head->iov_len = len; + buf->buflen -= shift; + buf->len -= shift; + return shift; } /** - * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes + * xdr_shrink_pagelen - shrinks buf->pages to @len bytes * @buf: xdr_buf - * @len: bytes to remove from buf->pages + * @len: new page buffer length * * The extra data is not lost, but is instead moved into buf->tail. * Returns the actual number of bytes moved. */ -static unsigned int -xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len) { - struct kvec *tail; - size_t copy; - unsigned int pglen = buf->page_len; - unsigned int tailbuf_len; - unsigned int result; - - result = 0; - tail = buf->tail; - if (len > buf->page_len) - len = buf-> page_len; - tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; - - /* Shift the tail first */ - if (tailbuf_len != 0) { - unsigned int free_space = tailbuf_len - tail->iov_len; - - if (len < free_space) - free_space = len; - tail->iov_len += free_space; - - copy = len; - if (tail->iov_len > len) { - char *p = (char *)tail->iov_base + len; - memmove(p, tail->iov_base, tail->iov_len - len); - result += tail->iov_len - len; - } else - copy = tail->iov_len; - /* Copy from the inlined pages into the tail */ - _copy_from_pages((char *)tail->iov_base, - buf->pages, buf->page_base + pglen - len, - copy); - result += copy; + unsigned int shift, buflen = buf->len - buf->head->iov_len; + + WARN_ON_ONCE(len > buf->page_len); + if (buf->head->iov_len >= buf->len || len > buflen) + buflen = len; + if (buf->page_len > buflen) { + buf->buflen -= buf->page_len - buflen; + buf->page_len = buflen; } - buf->page_len -= len; - buf->buflen -= len; - /* Have we truncated the message? 
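
Note the changed calling convention: xdr_shrink_bufhead() (and xdr_shrink_pagelen() below) now take the desired final length rather than the number of bytes to remove. A self-contained model of the new semantics, with the displaced head bytes landing at the front of the page region; in the kernel the displaced data may continue into the tail, and all field names here are invented:

	#include <assert.h>
	#include <string.h>

	struct mini_buf {
		char head[16];
		unsigned int head_len;
		char pages[16];
		unsigned int page_len;
	};

	/* Shrink the head to 'len' bytes; the excess shifts right into the
	 * front of the page region.  Bytes pushed off the end of the page
	 * region would land in the tail in the kernel; this model drops
	 * them.  Returns the number of bytes moved, like the real helper. */
	static unsigned int mini_shrink_head(struct mini_buf *b, unsigned int len)
	{
		unsigned int shift = b->head_len - len;

		memmove(b->pages + shift, b->pages, b->page_len - shift);
		memcpy(b->pages, b->head + len, shift);
		b->head_len = len;
		return shift;
	}

	int main(void)
	{
		struct mini_buf b = {
			.head = "RPCHEADERpayload", .head_len = 16,
			.pages = "PAGEDATApagedata", .page_len = 16,
		};

		assert(mini_shrink_head(&b, 9) == 7);	/* keep "RPCHEADER" */
		assert(memcmp(b.pages, "payloadPAGEDATAp", 16) == 0);
		return 0;
	}
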
*/ - if (buf->len > buf->buflen) - buf->len = buf->buflen; - - return result; + if (len >= buf->page_len) + return 0; + shift = buf->page_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, len, buflen - len, shift); + buf->page_len = len; + buf->len -= shift; + buf->buflen -= shift; + return shift; } void xdr_shift_buf(struct xdr_buf *buf, size_t len) { - xdr_shrink_bufhead(buf, len); + xdr_shrink_bufhead(buf, buf->head->iov_len - len); } EXPORT_SYMBOL_GPL(xdr_shift_buf); @@ -505,6 +880,31 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr) } EXPORT_SYMBOL_GPL(xdr_stream_pos); +static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + unsigned int blen = xdr->buf->len; + + xdr->nwords = blen > pos ? XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0; +} + +static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len); +} + +/** + * xdr_page_pos - Return the current offset from the start of the xdr pages + * @xdr: pointer to struct xdr_stream + */ +unsigned int xdr_page_pos(const struct xdr_stream *xdr) +{ + unsigned int pos = xdr_stream_pos(xdr); + + WARN_ON(pos < xdr->buf->head[0].iov_len); + return pos - xdr->buf->head[0].iov_len; +} +EXPORT_SYMBOL_GPL(xdr_page_pos); + /** * xdr_init_encode - Initialize a struct xdr_stream for sending data. * @xdr: pointer to xdr_stream struct @@ -525,7 +925,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct kvec *iov = buf->head; int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len; - xdr_set_scratch_buffer(xdr, NULL, 0); + xdr_reset_scratch_buffer(xdr); BUG_ON(scratch_len < 0); xdr->buf = buf; xdr->iov = iov; @@ -547,7 +947,29 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, EXPORT_SYMBOL_GPL(xdr_init_encode); /** - * xdr_commit_encode - Ensure all data is written to buffer + * xdr_init_encode_pages - Initialize an xdr_stream for encoding into pages + * @xdr: pointer to xdr_stream struct + * @buf: pointer to XDR buffer into which to encode data + * @pages: list of pages to decode into + * @rqst: pointer to controlling rpc_rqst, for debugging + * + */ +void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, + struct page **pages, struct rpc_rqst *rqst) +{ + xdr_reset_scratch_buffer(xdr); + + xdr->buf = buf; + xdr->page_ptr = pages; + xdr->iov = NULL; + xdr->p = page_address(*pages); + xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); + xdr->rqst = rqst; +} +EXPORT_SYMBOL_GPL(xdr_init_encode_pages); + +/** + * __xdr_commit_encode - Ensure all data is written to buffer * @xdr: pointer to xdr_stream * * We handle encoding across page boundaries by giving the caller a @@ -559,26 +981,29 @@ EXPORT_SYMBOL_GPL(xdr_init_encode); * required at the end of encoding, or any other time when the xdr_buf * data might be read. 
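
xdr_init_encode_pages() above lets a caller encode straight into a page array with no head kvec. A hedged usage sketch; the page allocation, the opcode value, and the error code are assumptions for illustration, not taken from this patch:

	/* Sketch only: the caller is assumed to have allocated one page
	 * and zeroed 'buf' apart from the fields set here. */
	static int example_encode(struct xdr_buf *buf, struct page **pages)
	{
		struct xdr_stream xdr;
		__be32 *p;

		buf->pages = pages;
		buf->buflen = PAGE_SIZE;
		xdr_init_encode_pages(&xdr, buf, pages, NULL);

		p = xdr_reserve_space(&xdr, 2 * sizeof(__be32));
		if (!p)
			return -EMSGSIZE;
		*p++ = cpu_to_be32(1);		/* hypothetical opcode */
		*p = cpu_to_be32(0);		/* hypothetical argument */
		return 0;
	}
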
*/ -inline void xdr_commit_encode(struct xdr_stream *xdr) +void __xdr_commit_encode(struct xdr_stream *xdr) { - int shift = xdr->scratch.iov_len; + size_t shift = xdr->scratch.iov_len; void *page; - if (shift == 0) - return; page = page_address(*xdr->page_ptr); memcpy(xdr->scratch.iov_base, page, shift); memmove(page, page + shift, (void *)xdr->p - page); - xdr->scratch.iov_len = 0; + xdr_reset_scratch_buffer(xdr); } -EXPORT_SYMBOL_GPL(xdr_commit_encode); +EXPORT_SYMBOL_GPL(__xdr_commit_encode); -static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, - size_t nbytes) +/* + * The buffer space to be reserved crosses the boundary between + * xdr->buf->head and xdr->buf->pages, or between two pages + * in xdr->buf->pages. + */ +static noinline __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, + size_t nbytes) { - __be32 *p; int space_left; int frag1bytes, frag2bytes; + void *p; if (nbytes > PAGE_SIZE) goto out_overflow; /* Bigger buffers require special handling */ @@ -592,6 +1017,7 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, xdr->buf->page_len += frag1bytes; xdr->page_ptr++; xdr->iov = NULL; + /* * If the last encode didn't end exactly on a page boundary, the * next one will straddle boundaries. Encode into the next @@ -599,16 +1025,20 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, * the "scratch" iov to track any temporarily unused fragment of * space at the end of the previous buffer: */ - xdr->scratch.iov_base = xdr->p; - xdr->scratch.iov_len = frag1bytes; - p = page_address(*xdr->page_ptr); + xdr_set_scratch_buffer(xdr, xdr->p, frag1bytes); + /* - * Note this is where the next encode will start after we've - * shifted this one back: + * xdr->p is where the next encode will start after + * xdr_commit_encode() has shifted this one back: */ - xdr->p = (void *)p + frag2bytes; + p = page_address(*xdr->page_ptr); + xdr->p = p + frag2bytes; space_left = xdr->buf->buflen - xdr->buf->len; - xdr->end = (void *)p + min_t(int, space_left, PAGE_SIZE); + if (space_left - frag1bytes >= PAGE_SIZE) + xdr->end = p + PAGE_SIZE; + else + xdr->end = p + space_left - frag1bytes; + xdr->buf->page_len += frag2bytes; xdr->buf->len += nbytes; return p; @@ -648,6 +1078,51 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) } EXPORT_SYMBOL_GPL(xdr_reserve_space); + +/** + * xdr_reserve_space_vec - Reserves a large amount of buffer space for sending + * @xdr: pointer to xdr_stream + * @vec: pointer to a kvec array + * @nbytes: number of bytes to reserve + * + * Reserves enough buffer space to encode 'nbytes' of data and stores the + * pointers in 'vec'. The size argument passed to xdr_reserve_space() is + * determined based on the number of bytes remaining in the current page to + * avoid invalidating iov_base pointers when xdr_commit_encode() is called. + */ +int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes) +{ + int thislen; + int v = 0; + __be32 *p; + + /* + * svcrdma requires every READ payload to start somewhere + * in xdr->pages. 
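
A userspace model of the scratch-and-commit dance implemented by xdr_get_next_encode_buffer() and __xdr_commit_encode() above: an object that straddles a page boundary is first encoded wholly into the second page, and the commit step later slides the leading fragment back into the space recorded in xdr->scratch. Page sizes are shrunk to 8 bytes for readability:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char page0[8] = "AAAAAA\0\0";	/* 6 bytes encoded, 2 free */
		char page1[8] = { 0 };
		unsigned int frag1 = 2;		/* free space left in page0 */

		/* Reserve across the boundary: write all 8 bytes into page1,
		 * remembering that the first frag1 of them belong in page0. */
		memcpy(page1, "12345678", 8);

		/* Commit: move frag1 bytes back and close the gap, mirroring
		 * the memcpy()+memmove() pair in __xdr_commit_encode(). */
		memcpy(page0 + 6, page1, frag1);
		memmove(page1, page1 + frag1, 8 - frag1);

		printf("%.8s|%.6s\n", page0, page1);	/* AAAAAA12|345678 */
		return 0;
	}
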
+ */ + if (xdr->iov == xdr->buf->head) { + xdr->iov = NULL; + xdr->end = xdr->p; + } + + while (nbytes) { + thislen = xdr->buf->page_len % PAGE_SIZE; + thislen = min_t(size_t, nbytes, PAGE_SIZE - thislen); + + p = xdr_reserve_space(xdr, thislen); + if (!p) + return -EIO; + + vec[v].iov_base = p; + vec[v].iov_len = thislen; + v++; + nbytes -= thislen; + } + + return v; +} +EXPORT_SYMBOL_GPL(xdr_reserve_space_vec); + /** * xdr_truncate_encode - truncate an encode buffer * @xdr: pointer to xdr_stream @@ -658,7 +1133,7 @@ EXPORT_SYMBOL_GPL(xdr_reserve_space); * head, tail, and page lengths are adjusted to correspond. * * If this means moving xdr->p to a different buffer, we assume that - * that the end pointer should be set to the end of the current page, + * the end pointer should be set to the end of the current page, * except in the case of the head buffer when we assume the head * buffer's current length represents the end of the available buffer. * @@ -781,19 +1256,31 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b } EXPORT_SYMBOL_GPL(xdr_write_pages); -static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, - unsigned int len) +static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, + unsigned int base, unsigned int len) { if (len > iov->iov_len) len = iov->iov_len; - xdr->p = (__be32*)iov->iov_base; + if (unlikely(base > len)) + base = len; + xdr->p = (__be32*)(iov->iov_base + base); xdr->end = (__be32*)(iov->iov_base + len); xdr->iov = iov; xdr->page_ptr = NULL; + return len - base; } -static int xdr_set_page_base(struct xdr_stream *xdr, - unsigned int base, unsigned int len) +static unsigned int xdr_set_tail_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + + xdr_stream_set_pos(xdr, base + buf->page_len + buf->head->iov_len); + return xdr_set_iov(xdr, buf->tail, base, len); +} + +static unsigned int xdr_set_page_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) { unsigned int pgnr; unsigned int maxlen; @@ -803,11 +1290,13 @@ static int xdr_set_page_base(struct xdr_stream *xdr, maxlen = xdr->buf->page_len; if (base >= maxlen) - return -EINVAL; - maxlen -= base; + return 0; + else + maxlen -= base; if (len > maxlen) len = maxlen; + xdr_stream_page_set_pos(xdr, base); base += xdr->buf->page_base; pgnr = base >> PAGE_SHIFT; @@ -822,7 +1311,16 @@ static int xdr_set_page_base(struct xdr_stream *xdr, pgend = PAGE_SIZE; xdr->end = (__be32*)(kaddr + pgend); xdr->iov = NULL; - return 0; + return len; +} + +static void xdr_set_page(struct xdr_stream *xdr, unsigned int base, + unsigned int len) +{ + if (xdr_set_page_base(xdr, base, len) == 0) { + base -= xdr->buf->page_len; + xdr_set_tail_base(xdr, base, len); + } } static void xdr_set_next_page(struct xdr_stream *xdr) @@ -831,19 +1329,18 @@ static void xdr_set_next_page(struct xdr_stream *xdr) newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT; newbase -= xdr->buf->page_base; - - if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0) - xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2); + if (newbase < xdr->buf->page_len) + xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr)); + else + xdr_set_tail_base(xdr, 0, xdr_stream_remaining(xdr)); } static bool xdr_set_next_buffer(struct xdr_stream *xdr) { if (xdr->page_ptr != NULL) xdr_set_next_page(xdr); - else if (xdr->iov == xdr->buf->head) { - if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0) - xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2); - } + 
else if (xdr->iov == xdr->buf->head) + xdr_set_page(xdr, 0, xdr_stream_remaining(xdr)); return xdr->p != xdr->end; } @@ -858,15 +1355,11 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst) { xdr->buf = buf; - xdr->scratch.iov_base = NULL; - xdr->scratch.iov_len = 0; + xdr_reset_scratch_buffer(xdr); xdr->nwords = XDR_QUADLEN(buf->len); - if (buf->head[0].iov_len != 0) - xdr_set_iov(xdr, buf->head, buf->len); - else if (buf->page_len != 0) - xdr_set_page_base(xdr, 0, buf->len); - else - xdr_set_iov(xdr, buf->head, buf->len); + if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 && + xdr_set_page_base(xdr, 0, buf->len) == 0) + xdr_set_iov(xdr, buf->tail, 0, buf->len); if (p != NULL && p > xdr->p && xdr->end >= p) { xdr->nwords -= p - xdr->p; xdr->p = p; @@ -907,24 +1400,6 @@ static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) return p; } -/** - * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. - * @xdr: pointer to xdr_stream struct - * @buf: pointer to an empty buffer - * @buflen: size of 'buf' - * - * The scratch buffer is used when decoding from an array of pages. - * If an xdr_inline_decode() call spans across page boundaries, then - * we copy the data into the scratch buffer in order to allow linear - * access. - */ -void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) -{ - xdr->scratch.iov_base = buf; - xdr->scratch.iov_len = buflen; -} -EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer); - static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; @@ -979,26 +1454,31 @@ out_overflow: } EXPORT_SYMBOL_GPL(xdr_inline_decode); -static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) +static void xdr_realign_pages(struct xdr_stream *xdr) { struct xdr_buf *buf = xdr->buf; - struct kvec *iov; - unsigned int nwords = XDR_QUADLEN(len); + struct kvec *iov = buf->head; unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; - - if (xdr->nwords == 0) - return 0; + unsigned int copied; /* Realign pages to current pointer position */ - iov = buf->head; if (iov->iov_len > cur) { - offset = iov->iov_len - cur; - copied = xdr_shrink_bufhead(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); + copied = xdr_shrink_bufhead(buf, cur); + trace_rpc_xdr_alignment(xdr, cur, copied); + xdr_set_page(xdr, 0, buf->page_len); } +} + +static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + unsigned int nwords = XDR_QUADLEN(len); + unsigned int copied; + + if (xdr->nwords == 0) + return 0; + xdr_realign_pages(xdr); if (nwords > xdr->nwords) { nwords = xdr->nwords; len = nwords << 2; @@ -1007,57 +1487,72 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) len = buf->page_len; else if (nwords < xdr->nwords) { /* Truncate page data and move it into the tail */ - offset = buf->page_len - len; - copied = xdr_shrink_pagelen(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); + copied = xdr_shrink_pagelen(buf, len); + trace_rpc_xdr_alignment(xdr, len, copied); } return len; } /** - * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position + * xdr_read_pages - align page-based XDR data to current pointer position * @xdr: pointer to xdr_stream struct * @len: number of bytes of page data * * Moves data beyond the current 
pointer position from the XDR head[] buffer - * into the page list. Any data that lies beyond current position + "len" - * bytes is moved into the XDR tail[]. + * into the page list. Any data that lies beyond current position + @len + * bytes is moved into the XDR tail[]. The xdr_stream current position is + * then advanced past that data to align to the next XDR object in the tail. * * Returns the number of XDR encoded bytes now contained in the pages */ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) { - struct xdr_buf *buf = xdr->buf; - struct kvec *iov; - unsigned int nwords; - unsigned int end; - unsigned int padding; + unsigned int nwords = XDR_QUADLEN(len); + unsigned int base, end, pglen; - len = xdr_align_pages(xdr, len); - if (len == 0) + pglen = xdr_align_pages(xdr, nwords << 2); + if (pglen == 0) return 0; - nwords = XDR_QUADLEN(len); - padding = (nwords << 2) - len; - xdr->iov = iov = buf->tail; - /* Compute remaining message length. */ - end = ((xdr->nwords - nwords) << 2) + padding; - if (end > iov->iov_len) - end = iov->iov_len; - /* - * Position current pointer at beginning of tail, and - * set remaining message length. - */ - xdr->p = (__be32 *)((char *)iov->iov_base + padding); - xdr->end = (__be32 *)((char *)iov->iov_base + end); - xdr->page_ptr = NULL; - xdr->nwords = XDR_QUADLEN(end - padding); - return len; + base = (nwords << 2) - pglen; + end = xdr_stream_remaining(xdr) - pglen; + + xdr_set_tail_base(xdr, base, end); + return len <= pglen ? len : pglen; } EXPORT_SYMBOL_GPL(xdr_read_pages); /** + * xdr_set_pagelen - Sets the length of the XDR pages + * @xdr: pointer to xdr_stream struct + * @len: new length of the XDR page data + * + * Either grows or shrinks the length of the xdr pages by setting pagelen to + * @len bytes. When shrinking, any extra data is moved into buf->tail, whereas + * when growing any data beyond the current pointer is moved into the tail. + * + * Returns True if the operation was successful, and False otherwise. + */ +void xdr_set_pagelen(struct xdr_stream *xdr, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + size_t remaining = xdr_stream_remaining(xdr); + size_t base = 0; + + if (len < buf->page_len) { + base = buf->page_len - len; + xdr_shrink_pagelen(buf, len); + } else { + xdr_buf_head_shift_right(buf, xdr_stream_pos(xdr), + buf->page_len, remaining); + if (len > buf->page_len) + xdr_buf_try_expand(buf, len - buf->page_len); + } + xdr_set_tail_base(xdr, base, remaining); +} +EXPORT_SYMBOL_GPL(xdr_set_pagelen); + +/** * xdr_enter_page - decode data from the XDR page * @xdr: pointer to xdr_stream struct * @len: number of bytes of page data @@ -1081,8 +1576,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page); static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; -void -xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) +void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf) { buf->head[0] = *iov; buf->tail[0] = empty_iov; @@ -1103,11 +1597,10 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov); * * @buf and @subbuf may be pointers to the same struct xdr_buf. * - * Returns -1 if base of length are out of bounds. + * Returns -1 if base or length are out of bounds. 
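
A decode-side usage sketch for the reworked xdr_read_pages() above: after pulling a counted opaque's length from the stream, align the page data and verify nothing was cut short. The surrounding stream setup and the error codes are assumptions:

	unsigned int count, recvd;
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(__be32));
	if (!p)
		return -EIO;
	count = be32_to_cpup(p);

	/* Position the stream so the opaque body is page-aligned and
	 * decoding resumes in the tail just past it. */
	recvd = xdr_read_pages(xdr, count);
	if (recvd < count)
		return -EIO;		/* short or truncated reply */
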
*/ -int -xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, - unsigned int base, unsigned int len) +int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, + unsigned int base, unsigned int len) { subbuf->buflen = subbuf->len = len; if (base < buf->head[0].iov_len) { @@ -1118,6 +1611,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, base = 0; } else { base -= buf->head[0].iov_len; + subbuf->head[0].iov_base = buf->head[0].iov_base; subbuf->head[0].iov_len = 0; } @@ -1130,6 +1624,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, base = 0; } else { base -= buf->page_len; + subbuf->pages = buf->pages; + subbuf->page_base = 0; subbuf->page_len = 0; } @@ -1141,6 +1637,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, base = 0; } else { base -= buf->tail[0].iov_len; + subbuf->tail[0].iov_base = buf->tail[0].iov_base; subbuf->tail[0].iov_len = 0; } @@ -1150,7 +1647,150 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, } EXPORT_SYMBOL_GPL(xdr_buf_subsegment); -static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +/** + * xdr_stream_subsegment - set @subbuf to a portion of @xdr + * @xdr: an xdr_stream set up for decoding + * @subbuf: the result buffer + * @nbytes: length of @xdr to extract, in bytes + * + * Sets up @subbuf to represent a portion of @xdr. The portion + * starts at the current offset in @xdr, and extends for a length + * of @nbytes. If this is successful, @xdr is advanced to the next + * XDR data item following that portion. + * + * Return values: + * %true: @subbuf has been initialized, and @xdr has been advanced. + * %false: a bounds error has occurred + */ +bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, + unsigned int nbytes) +{ + unsigned int start = xdr_stream_pos(xdr); + unsigned int remaining, len; + + /* Extract @subbuf and bounds-check the fn arguments */ + if (xdr_buf_subsegment(xdr->buf, subbuf, start, nbytes)) + return false; + + /* Advance @xdr by @nbytes */ + for (remaining = nbytes; remaining;) { + if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr)) + return false; + + len = (char *)xdr->end - (char *)xdr->p; + if (remaining <= len) { + xdr->p = (__be32 *)((char *)xdr->p + + (remaining + xdr_pad_size(nbytes))); + break; + } + + xdr->p = (__be32 *)((char *)xdr->p + len); + xdr->end = xdr->p; + remaining -= len; + } + + xdr_stream_set_pos(xdr, start + nbytes); + return true; +} +EXPORT_SYMBOL_GPL(xdr_stream_subsegment); + +/** + * xdr_stream_move_subsegment - Move part of a stream to another position + * @xdr: the source xdr_stream + * @offset: the source offset of the segment + * @target: the target offset of the segment + * @length: the number of bytes to move + * + * Moves @length bytes from @offset to @target in the xdr_stream, overwriting + * anything in its space. Returns the number of bytes in the segment. 
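
A hedged sketch of the intended xdr_stream_subsegment() caller, e.g. a server extracting a write payload whose length was decoded just beforehand (the variable names and error value are assumptions):

	struct xdr_buf payload;

	if (!xdr_stream_subsegment(xdr, &payload, nbytes))
		return -EMSGSIZE;
	/* 'payload' now aliases head/pages/tail slices of the request
	 * buffer in place, and the stream has advanced to the next
	 * XDR item after the payload and its padding. */
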
+ */ +unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset, + unsigned int target, unsigned int length) +{ + struct xdr_buf buf; + unsigned int shift; + + if (offset < target) { + shift = target - offset; + if (xdr_buf_subsegment(xdr->buf, &buf, offset, shift + length) < 0) + return 0; + xdr_buf_head_shift_right(&buf, 0, length, shift); + } else if (offset > target) { + shift = offset - target; + if (xdr_buf_subsegment(xdr->buf, &buf, target, shift + length) < 0) + return 0; + xdr_buf_head_shift_left(&buf, shift, length, shift); + } + return length; +} +EXPORT_SYMBOL_GPL(xdr_stream_move_subsegment); + +/** + * xdr_stream_zero - zero out a portion of an xdr_stream + * @xdr: an xdr_stream to zero out + * @offset: the starting point in the stream + * @length: the number of bytes to zero + */ +unsigned int xdr_stream_zero(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) +{ + struct xdr_buf buf; + + if (xdr_buf_subsegment(xdr->buf, &buf, offset, length) < 0) + return 0; + if (buf.head[0].iov_len) + xdr_buf_iov_zero(buf.head, 0, buf.head[0].iov_len); + if (buf.page_len > 0) + xdr_buf_pages_zero(&buf, 0, buf.page_len); + if (buf.tail[0].iov_len) + xdr_buf_iov_zero(buf.tail, 0, buf.tail[0].iov_len); + return length; +} +EXPORT_SYMBOL_GPL(xdr_stream_zero); + +/** + * xdr_buf_trim - lop at most "len" bytes off the end of "buf" + * @buf: buf to be trimmed + * @len: number of bytes to reduce "buf" by + * + * Trim an xdr_buf by the given number of bytes by fixing up the lengths. Note + * that it's possible that we'll trim less than that amount if the xdr_buf is + * too small, or if (for instance) it's all in the head and the parser has + * already read too far into it. + */ +void xdr_buf_trim(struct xdr_buf *buf, unsigned int len) +{ + size_t cur; + unsigned int trim = len; + + if (buf->tail[0].iov_len) { + cur = min_t(size_t, buf->tail[0].iov_len, trim); + buf->tail[0].iov_len -= cur; + trim -= cur; + if (!trim) + goto fix_len; + } + + if (buf->page_len) { + cur = min_t(unsigned int, buf->page_len, trim); + buf->page_len -= cur; + trim -= cur; + if (!trim) + goto fix_len; + } + + if (buf->head[0].iov_len) { + cur = min_t(size_t, buf->head[0].iov_len, trim); + buf->head[0].iov_len -= cur; + trim -= cur; + } +fix_len: + buf->len -= (len - trim); +} +EXPORT_SYMBOL_GPL(xdr_buf_trim); + +static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1159,8 +1799,7 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); + _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); @@ -1168,7 +1807,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne } /* obj is assumed to point to allocated memory of size at least len: */ -int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1181,7 +1821,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u } EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf); -static void __write_bytes_to_xdr_buf(struct xdr_buf 
*subbuf, void *obj, unsigned int len) +static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1190,8 +1831,7 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); + _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); @@ -1199,7 +1839,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned } /* obj is assumed to point to allocated memory of size at least len: */ -int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1212,8 +1853,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un } EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); -int -xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj) { __be32 raw; int status; @@ -1226,8 +1866,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) } EXPORT_SYMBOL_GPL(xdr_decode_word); -int -xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) +int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj) { __be32 raw = cpu_to_be32(obj); @@ -1235,65 +1874,9 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) } EXPORT_SYMBOL_GPL(xdr_encode_word); -/** - * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf - * @buf: pointer to buffer containing a mic - * @mic: on success, returns the address of the mic - * @offset: the offset in buf where mic may be found - * - * This function may modify the xdr buf if the mic is found to be straddling - * a boundary between head, pages, and tail. On success the mic can be read - * from the address returned. There is no need to free the mic. - * - * Return: Success returns 0, otherwise an integer error. - */ -int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset) -{ - struct xdr_buf subbuf; - unsigned int boundary; - - if (xdr_decode_word(buf, offset, &mic->len)) - return -EFAULT; - offset += 4; - - /* Is the mic partially in the head? */ - boundary = buf->head[0].iov_len; - if (offset < boundary && (offset + mic->len) > boundary) - xdr_shift_buf(buf, boundary - offset); - - /* Is the mic partially in the pages? */ - boundary += buf->page_len; - if (offset < boundary && (offset + mic->len) > boundary) - xdr_shrink_pagelen(buf, boundary - offset); - - if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len)) - return -EFAULT; - - /* Is the mic contained entirely in the head? */ - mic->data = subbuf.head[0].iov_base; - if (subbuf.head[0].iov_len == mic->len) - return 0; - /* ..or is the mic contained entirely in the tail? 
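
The trim order of xdr_buf_trim() introduced above (tail first, then pages, then head) is easy to model in userspace; only the region lengths matter, since trimming is pure length fix-up:

	#include <stdio.h>

	struct lens { unsigned int head, pages, tail, total; };

	/* Mirror xdr_buf_trim(): consume 'len' bytes from the end, tail
	 * first, clamping at each region, and fix up the total length by
	 * however much was actually trimmed. */
	static void model_trim(struct lens *b, unsigned int len)
	{
		unsigned int trim = len, cur;

		cur = b->tail < trim ? b->tail : trim;
		b->tail -= cur;
		trim -= cur;
		cur = b->pages < trim ? b->pages : trim;
		b->pages -= cur;
		trim -= cur;
		cur = b->head < trim ? b->head : trim;
		b->head -= cur;
		trim -= cur;
		b->total -= len - trim;
	}

	int main(void)
	{
		struct lens b = { 10, 4096, 8, 4114 };

		model_trim(&b, 20);	/* 8 from the tail, then 12 from pages */
		printf("head=%u pages=%u tail=%u total=%u\n",
		       b.head, b.pages, b.tail, b.total);
		/* prints: head=10 pages=4084 tail=0 total=4094 */
		return 0;
	}
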
*/ - mic->data = subbuf.tail[0].iov_base; - if (subbuf.tail[0].iov_len == mic->len) - return 0; - - /* Find a contiguous area in @buf to hold all of @mic */ - if (mic->len > buf->buflen - buf->len) - return -ENOMEM; - if (buf->tail[0].iov_len != 0) - mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len; - else - mic->data = buf->head[0].iov_base + buf->head[0].iov_len; - __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len); - return 0; -} -EXPORT_SYMBOL_GPL(xdr_buf_read_mic); - /* Returns 0 on success, or else a negative error code. */ -static int -xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc, int encode) +static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc, int encode) { char *elem = NULL, *c; unsigned int copied = 0, todo, avail_here; @@ -1485,9 +2068,8 @@ out: return err; } -int -xdr_decode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if (base >= buf->len) return -EINVAL; @@ -1496,9 +2078,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_decode_array2); -int -xdr_encode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if ((unsigned long) base + 4 + desc->array_len * desc->elem_size > buf->head->iov_len + buf->page_len + buf->tail->iov_len) @@ -1508,9 +2089,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_encode_array2); -int -xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, - int (*actor)(struct scatterlist *, void *), void *data) +int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, + unsigned int len, + int (*actor)(struct scatterlist *, void *), void *data) { int i, ret = 0; unsigned int page_len, thislen, page_offset; @@ -1678,10 +2259,8 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); if (ret > 0) { - char *s = kmalloc(ret + 1, gfp_flags); + char *s = kmemdup_nul(p, ret, gfp_flags); if (s != NULL) { - memcpy(s, p, ret); - s[ret] = '\0'; *str = s; return strlen(s); } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1aafe8d3f3f4..656cec208371 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -55,6 +55,8 @@ #include <trace/events/sunrpc.h> #include "sunrpc.h" +#include "sysfs.h" +#include "fail.h" /* * Local variables @@ -67,9 +69,11 @@ /* * Local functions */ -static void xprt_init(struct rpc_xprt *xprt, struct net *net); +static void xprt_init(struct rpc_xprt *xprt, struct net *net); static __be32 xprt_alloc_xid(struct rpc_xprt *xprt); -static void xprt_destroy(struct rpc_xprt *xprt); +static void xprt_destroy(struct rpc_xprt *xprt); +static void xprt_request_init(struct rpc_task *task); +static int xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf); static DEFINE_SPINLOCK(xprt_list_lock); static LIST_HEAD(xprt_list); @@ -151,42 +155,101 @@ out: } EXPORT_SYMBOL_GPL(xprt_unregister_transport); -/** - * xprt_load_transport - load a transport implementation - * @transport_name: transport to load - * - * Returns: - * 0: transport successfully loaded - * -ENOENT: transport module not available - */ -int xprt_load_transport(const char *transport_name) +static void +xprt_class_release(const struct 
xprt_class *t) { - struct xprt_class *t; - int result; + module_put(t->owner); +} + +static const struct xprt_class * +xprt_class_find_by_ident_locked(int ident) +{ + const struct xprt_class *t; + + list_for_each_entry(t, &xprt_list, list) { + if (t->ident != ident) + continue; + if (!try_module_get(t->owner)) + continue; + return t; + } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_ident(int ident) +{ + const struct xprt_class *t; - result = 0; spin_lock(&xprt_list_lock); + t = xprt_class_find_by_ident_locked(ident); + spin_unlock(&xprt_list_lock); + return t; +} + +static const struct xprt_class * +xprt_class_find_by_netid_locked(const char *netid) +{ + const struct xprt_class *t; + unsigned int i; + list_for_each_entry(t, &xprt_list, list) { - if (strcmp(t->name, transport_name) == 0) { - spin_unlock(&xprt_list_lock); - goto out; + for (i = 0; t->netid[i][0] != '\0'; i++) { + if (strcmp(t->netid[i], netid) != 0) + continue; + if (!try_module_get(t->owner)) + continue; + return t; } } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_netid(const char *netid) +{ + const struct xprt_class *t; + + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + if (!t) { + spin_unlock(&xprt_list_lock); + request_module("rpc%s", netid); + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + } spin_unlock(&xprt_list_lock); - result = request_module("xprt%s", transport_name); -out: - return result; + return t; } -EXPORT_SYMBOL_GPL(xprt_load_transport); + +/** + * xprt_find_transport_ident - convert a netid into a transport identifier + * @netid: transport to load + * + * Returns: + * > 0: transport identifier + * -ENOENT: transport module not available + */ +int xprt_find_transport_ident(const char *netid) +{ + const struct xprt_class *t; + int ret; + + t = xprt_class_find_by_netid(netid); + if (!t) + return -ENOENT; + ret = t->ident; + xprt_class_release(t); + return ret; +} +EXPORT_SYMBOL_GPL(xprt_find_transport_ident); static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; - if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { - smp_mb__before_atomic(); - clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_atomic(); - } else + if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) + clear_bit_unlock(XPRT_LOCKED, &xprt->state); + else queue_work(xprtiod_workqueue, &xprt->task_cleanup); } @@ -381,7 +444,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); -static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task != task) return; @@ -607,6 +670,11 @@ static void xprt_reset_majortimeo(struct rpc_rqst *req) req->rq_majortimeo += xprt_calc_majortimeo(req); } +static void xprt_reset_minortimeo(struct rpc_rqst *req) +{ + req->rq_minortimeo += req->rq_timeout; +} + static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req) { unsigned long time_init; @@ -618,6 +686,7 @@ static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req) time_init = xprt_abs_ktime_to_jiffies(task->tk_start); req->rq_timeout = task->tk_client->cl_timeout->to_initval; req->rq_majortimeo = time_init + xprt_calc_majortimeo(req); + req->rq_minortimeo = time_init + req->rq_timeout; } /** @@ -632,6 +701,8 @@ int xprt_adjust_timeout(struct rpc_rqst *req) int status = 0; if (time_before(jiffies, req->rq_majortimeo)) 
{ + if (time_before(jiffies, req->rq_minortimeo)) + return status; if (to->to_exponential) req->rq_timeout <<= 1; else @@ -649,6 +720,7 @@ int xprt_adjust_timeout(struct rpc_rqst *req) spin_unlock(&xprt->transport_lock); status = -ETIMEDOUT; } + xprt_reset_minortimeo(req); if (req->rq_timeout == 0) { printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n"); @@ -663,6 +735,9 @@ static void xprt_autoclose(struct work_struct *work) container_of(work, struct rpc_xprt, task_cleanup); unsigned int pflags = memalloc_nofs_save(); + trace_xprt_disconnect_auto(xprt); + xprt->connect_cookie++; + smp_mb__before_atomic(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); xprt->ops->close(xprt); xprt_release_write(xprt, NULL); @@ -677,7 +752,7 @@ static void xprt_autoclose(struct work_struct *work) */ void xprt_disconnect_done(struct rpc_xprt *xprt) { - dprintk("RPC: disconnected transport %p\n", xprt); + trace_xprt_disconnect_done(xprt); spin_lock(&xprt->transport_lock); xprt_clear_connected(xprt); xprt_clear_write_space_locked(xprt); @@ -688,21 +763,32 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) EXPORT_SYMBOL_GPL(xprt_disconnect_done); /** + * xprt_schedule_autoclose_locked - Try to schedule an autoclose RPC call + * @xprt: transport to disconnect + */ +static void xprt_schedule_autoclose_locked(struct rpc_xprt *xprt) +{ + if (test_and_set_bit(XPRT_CLOSE_WAIT, &xprt->state)) + return; + if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) + queue_work(xprtiod_workqueue, &xprt->task_cleanup); + else if (xprt->snd_task && !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) + rpc_wake_up_queued_task_set_status(&xprt->pending, + xprt->snd_task, -ENOTCONN); +} + +/** * xprt_force_disconnect - force a transport to disconnect * @xprt: transport to disconnect * */ void xprt_force_disconnect(struct rpc_xprt *xprt) { + trace_xprt_disconnect_force(xprt); + /* Don't race with the test_bit() in xprt_clear_locked() */ spin_lock(&xprt->transport_lock); - set_bit(XPRT_CLOSE_WAIT, &xprt->state); - /* Try to schedule an autoclose RPC call */ - if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(xprtiod_workqueue, &xprt->task_cleanup); - else if (xprt->snd_task) - rpc_wake_up_queued_task_set_status(&xprt->pending, - xprt->snd_task, -ENOTCONN); + xprt_schedule_autoclose_locked(xprt); spin_unlock(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_force_disconnect); @@ -742,11 +828,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) goto out; if (test_bit(XPRT_CLOSING, &xprt->state)) goto out; - set_bit(XPRT_CLOSE_WAIT, &xprt->state); - /* Try to schedule an autoclose RPC call */ - if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(xprtiod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_schedule_autoclose_locked(xprt); out: spin_unlock(&xprt->transport_lock); } @@ -780,6 +862,19 @@ xprt_init_autodisconnect(struct timer_list *t) queue_work(xprtiod_workqueue, &xprt->task_cleanup); } +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +static void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ + if (!fail_sunrpc.ignore_client_disconnect && + should_fail(&fail_sunrpc.attr, 1)) + xprt->ops->inject_disconnect(xprt); +} +#else +static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ +} +#endif + bool xprt_lock_connect(struct rpc_xprt *xprt, struct rpc_task *task, void *cookie) @@ -791,12 +886,14 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; + set_bit(XPRT_SND_IS_COOKIE, &xprt->state); 
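
The rq_minortimeo/rq_majortimeo split above gives each request two clocks: minor timeouts drive retransmits with a growing wait, and the major timeout ends the attempt with -ETIMEDOUT. A userspace model of the resulting schedule; the 3s/60s/90s values are hypothetical, not defaults from this patch:

	#include <stdio.h>

	int main(void)
	{
		unsigned int wait = 3, maxval = 60, major = 90;	/* seconds */
		unsigned int now = 0;

		while (now + wait < major) {
			now += wait;
			printf("minor timeout at +%us: retransmit\n", now);
			/* to_exponential doubling, clamped at to_maxval */
			wait = wait * 2 > maxval ? maxval : wait * 2;
		}
		printf("major timeout at +%us: -ETIMEDOUT\n", major);
		return 0;
	}
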
xprt->snd_task = cookie; ret = true; out: spin_unlock(&xprt->transport_lock); return ret; } +EXPORT_SYMBOL_GPL(xprt_lock_connect); void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) { @@ -806,12 +903,14 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) if (!test_bit(XPRT_LOCKED, &xprt->state)) goto out; xprt->snd_task =NULL; + clear_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->ops->release_xprt(xprt, NULL); xprt_schedule_autodisconnect(xprt); out: spin_unlock(&xprt->transport_lock); wake_up_bit(&xprt->state, XPRT_LOCKED); } +EXPORT_SYMBOL_GPL(xprt_unlock_connect); /** * xprt_connect - schedule a transport connect operation @@ -822,8 +921,7 @@ void xprt_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - dprintk("RPC: %5u xprt_connect xprt %p %s connected\n", task->tk_pid, - xprt, (xprt_connected(xprt) ? "is" : "is not")); + trace_xprt_connect(xprt); if (!xprt_bound(xprt)) { task->tk_status = -EAGAIN; @@ -832,10 +930,7 @@ void xprt_connect(struct rpc_task *task) if (!xprt_lock_write(xprt, task)) return; - if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state)) - xprt->ops->close(xprt); - - if (!xprt_connected(xprt)) { + if (!xprt_connected(xprt) && !test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; rpc_sleep_on_timeout(&xprt->pending, task, NULL, xprt_request_timeout(task->tk_rqstp)); @@ -1044,16 +1139,19 @@ xprt_request_need_enqueue_receive(struct rpc_task *task, struct rpc_rqst *req) * @task: RPC task * */ -void +int xprt_request_enqueue_receive(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + int ret; if (!xprt_request_need_enqueue_receive(task, req)) - return; + return 0; - xprt_request_prepare(task->tk_rqstp); + ret = xprt_request_prepare(task->tk_rqstp, &req->rq_rcv_buf); + if (ret) + return ret; spin_lock(&xprt->queue_lock); /* Update the softirq receive buffer */ @@ -1067,6 +1165,7 @@ xprt_request_enqueue_receive(struct rpc_task *task) /* Turn off autodisconnect */ del_singleshot_timer_sync(&xprt->timer); + return 0; } /** @@ -1117,12 +1216,10 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - dprintk("RPC: %5u xid %08x complete (%d bytes received)\n", - task->tk_pid, ntohl(req->rq_xid), copied); - trace_xprt_complete_rqst(xprt, req->rq_xid, copied); - xprt->stat.recvs++; + xdr_free_bvec(&req->rq_rcv_buf); + req->rq_private_buf.bvec = NULL; req->rq_private_buf.len = copied; /* Ensure all writes are done before we update */ /* req->rq_reply_bytes_recvd */ @@ -1241,8 +1338,14 @@ xprt_request_enqueue_transmit(struct rpc_task *task) { struct rpc_rqst *pos, *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + int ret; if (xprt_request_need_enqueue_transmit(task, req)) { + ret = xprt_request_prepare(task->tk_rqstp, &req->rq_snd_buf); + if (ret) { + task->tk_status = ret; + return; + } req->rq_bytes_sent = 0; spin_lock(&xprt->queue_lock); /* @@ -1257,19 +1360,6 @@ xprt_request_enqueue_transmit(struct rpc_task *task) /* Note: req is added _before_ pos */ list_add_tail(&req->rq_xmit, &pos->rq_xmit); INIT_LIST_HEAD(&req->rq_xmit2); - trace_xprt_enq_xmit(task, 1); - goto out; - } - } else if (RPC_IS_SWAPPER(task)) { - list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { - if (pos->rq_cong || pos->rq_bytes_sent) - continue; - if (RPC_IS_SWAPPER(pos->rq_task)) - continue; - /* Note: req is added _before_ pos */ - 
list_add_tail(&req->rq_xmit, &pos->rq_xmit); - INIT_LIST_HEAD(&req->rq_xmit2); - trace_xprt_enq_xmit(task, 2); goto out; } } else if (!req->rq_seqno) { @@ -1278,14 +1368,13 @@ xprt_request_enqueue_transmit(struct rpc_task *task) continue; list_add_tail(&req->rq_xmit2, &pos->rq_xmit2); INIT_LIST_HEAD(&req->rq_xmit); - trace_xprt_enq_xmit(task, 3); goto out; } } list_add_tail(&req->rq_xmit, &xprt->xmit_queue); INIT_LIST_HEAD(&req->rq_xmit2); - trace_xprt_enq_xmit(task, 4); out: + atomic_long_inc(&xprt->xmit_queuelen); set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); spin_unlock(&xprt->queue_lock); } @@ -1315,6 +1404,8 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task) } } else list_del(&req->rq_xmit2); + atomic_long_dec(&req->rq_xprt->xmit_queuelen); + xdr_free_bvec(&req->rq_snd_buf); } /** @@ -1351,8 +1442,6 @@ xprt_request_dequeue_xprt(struct rpc_task *task) test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || xprt_is_pinned_rqst(req)) { spin_lock(&xprt->queue_lock); - xprt_request_dequeue_transmit_locked(task); - xprt_request_dequeue_receive_locked(task); while (xprt_is_pinned_rqst(req)) { set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); spin_unlock(&xprt->queue_lock); @@ -1360,24 +1449,30 @@ xprt_request_dequeue_xprt(struct rpc_task *task) spin_lock(&xprt->queue_lock); clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); } + xprt_request_dequeue_transmit_locked(task); + xprt_request_dequeue_receive_locked(task); spin_unlock(&xprt->queue_lock); + xdr_free_bvec(&req->rq_rcv_buf); } } /** * xprt_request_prepare - prepare an encoded request for transport * @req: pointer to rpc_rqst + * @buf: pointer to send/rcv xdr_buf * * Calls into the transport layer to do whatever is needed to prepare * the request for transmission or receive. + * Returns error, or zero. 
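
With xprt_request_prepare() now returning an error, a transport's ->prepare_request() op can fail cleanly before the request is queued. A hedged sketch of such an op; the function name is invented, though preallocating the bio_vec array is the kind of work this hook exists for:

	static int example_prepare_request(struct rpc_rqst *req,
					   struct xdr_buf *buf)
	{
		/* Allocate buf->bvec up front so the later transmit
		 * path cannot fail on memory allocation. */
		return xdr_alloc_bvec(buf, rpc_task_gfp_mask());
	}
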
*/ -void -xprt_request_prepare(struct rpc_rqst *req) +static int +xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf) { struct rpc_xprt *xprt = req->rq_xprt; if (xprt->ops->prepare_request) - xprt->ops->prepare_request(req); + return xprt->ops->prepare_request(req, buf); + return 0; } /** @@ -1402,8 +1497,6 @@ bool xprt_prepare_transmit(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid); - if (!xprt_lock_write(xprt, task)) { /* Race breaker: someone may have transmitted us */ if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) @@ -1412,12 +1505,18 @@ bool xprt_prepare_transmit(struct rpc_task *task) return false; } + if (atomic_read(&xprt->swapper)) + /* This will be clear in __rpc_execute */ + current->flags |= PF_MEMALLOC; return true; } void xprt_end_transmit(struct rpc_task *task) { - xprt_release_write(task->tk_rqstp->rq_xprt, task); + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; + + xprt_inject_disconnect(xprt); + xprt_release_write(xprt, task); } /** @@ -1462,6 +1561,7 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) */ req->rq_ntrans++; + trace_rpc_xdr_sendto(task, &req->rq_snd_buf); connect_cookie = xprt->connect_cookie; status = xprt->ops->send_request(req); if (status != 0) { @@ -1470,8 +1570,10 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) return status; } - if (is_retrans) + if (is_retrans) { task->tk_client->cl_stats->rpcretrans++; + trace_xprt_retransmit(req); + } xprt_inject_disconnect(xprt); @@ -1510,39 +1612,66 @@ xprt_transmit(struct rpc_task *task) int status; spin_lock(&xprt->queue_lock); - while (!list_empty(&xprt->xmit_queue)) { - next = list_first_entry(&xprt->xmit_queue, - struct rpc_rqst, rq_xmit); + for (;;) { + next = list_first_entry_or_null(&xprt->xmit_queue, + struct rpc_rqst, rq_xmit); + if (!next) + break; xprt_pin_rqst(next); spin_unlock(&xprt->queue_lock); status = xprt_request_transmit(next, task); if (status == -EBADMSG && next != req) status = 0; - cond_resched(); spin_lock(&xprt->queue_lock); xprt_unpin_rqst(next); - if (status == 0) { - if (!xprt_request_data_received(task) || - test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) - continue; - } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) - task->tk_status = status; - break; + if (status < 0) { + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + task->tk_status = status; + break; + } + /* Was @task transmitted, and has it received a reply? 
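
Condensed shape of the rewritten transmit loop above (not verbatim; the status and reply-received exits are elided): each request is pinned so it cannot vanish while queue_lock is dropped for the actual send, and cond_resched_lock() replaces the old unconditional cond_resched():

	struct rpc_rqst *next;

	spin_lock(&xprt->queue_lock);
	while ((next = list_first_entry_or_null(&xprt->xmit_queue,
						struct rpc_rqst, rq_xmit))) {
		xprt_pin_rqst(next);
		spin_unlock(&xprt->queue_lock);
		/* transmit 'next' without holding queue_lock */
		spin_lock(&xprt->queue_lock);
		xprt_unpin_rqst(next);
		cond_resched_lock(&xprt->queue_lock);
	}
	spin_unlock(&xprt->queue_lock);
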
*/ + if (xprt_request_data_received(task) && + !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + break; + cond_resched_lock(&xprt->queue_lock); } spin_unlock(&xprt->queue_lock); } -static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) +static void xprt_complete_request_init(struct rpc_task *task) +{ + if (task->tk_rqstp) + xprt_request_init(task); +} + +void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) { set_bit(XPRT_CONGESTED, &xprt->state); - rpc_sleep_on(&xprt->backlog, task, NULL); + rpc_sleep_on(&xprt->backlog, task, xprt_complete_request_init); +} +EXPORT_SYMBOL_GPL(xprt_add_backlog); + +static bool __xprt_set_rq(struct rpc_task *task, void *data) +{ + struct rpc_rqst *req = data; + + if (task->tk_rqstp == NULL) { + memset(req, 0, sizeof(*req)); /* mark unused */ + task->tk_rqstp = req; + return true; + } + return false; } -static void xprt_wake_up_backlog(struct rpc_xprt *xprt) +bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req) { - if (rpc_wake_up_next(&xprt->backlog) == NULL) + if (rpc_wake_up_first(&xprt->backlog, __xprt_set_rq, req) == NULL) { clear_bit(XPRT_CONGESTED, &xprt->state); + return false; + } + return true; } +EXPORT_SYMBOL_GPL(xprt_wake_up_backlog); static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -1552,7 +1681,7 @@ static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task goto out; spin_lock(&xprt->reserve_lock); if (test_bit(XPRT_CONGESTED, &xprt->state)) { - rpc_sleep_on(&xprt->backlog, task, NULL); + xprt_add_backlog(xprt, task); ret = true; } spin_unlock(&xprt->reserve_lock); @@ -1568,7 +1697,7 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt) goto out; ++xprt->num_reqs; spin_unlock(&xprt->reserve_lock); - req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS); + req = kzalloc(sizeof(*req), rpc_task_gfp_mask()); spin_lock(&xprt->reserve_lock); if (req != NULL) goto out; @@ -1610,7 +1739,7 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) case -EAGAIN: xprt_add_backlog(xprt, task); dprintk("RPC: waiting for request slot\n"); - /* fall through */ + fallthrough; default: task->tk_status = -EAGAIN; } @@ -1629,11 +1758,11 @@ EXPORT_SYMBOL_GPL(xprt_alloc_slot); void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { spin_lock(&xprt->reserve_lock); - if (!xprt_dynamic_free_slot(xprt, req)) { + if (!xprt_wake_up_backlog(xprt, req) && + !xprt_dynamic_free_slot(xprt, req)) { memset(req, 0, sizeof(*req)); /* mark unused */ list_add(&req->rq_list, &xprt->free); } - xprt_wake_up_backlog(xprt); spin_unlock(&xprt->reserve_lock); } EXPORT_SYMBOL_GPL(xprt_free_slot); @@ -1648,6 +1777,30 @@ static void xprt_free_all_slots(struct rpc_xprt *xprt) } } +static DEFINE_IDA(rpc_xprt_ids); + +void xprt_cleanup_ids(void) +{ + ida_destroy(&rpc_xprt_ids); +} + +static int xprt_alloc_id(struct rpc_xprt *xprt) +{ + int id; + + id = ida_alloc(&rpc_xprt_ids, GFP_KERNEL); + if (id < 0) + return id; + + xprt->id = id; + return 0; +} + +static void xprt_free_id(struct rpc_xprt *xprt) +{ + ida_free(&rpc_xprt_ids, xprt->id); +} + struct rpc_xprt *xprt_alloc(struct net *net, size_t size, unsigned int num_prealloc, unsigned int max_alloc) @@ -1660,6 +1813,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, if (xprt == NULL) goto out; + xprt_alloc_id(xprt); xprt_init(xprt, net); for (i = 0; i < num_prealloc; i++) { @@ -1668,10 +1822,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, goto out_free; 
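
xprt_alloc_id()/xprt_free_id() above follow the stock IDA lifecycle, the same pattern xprtmultipath.c adopts further down for switch ids. A minimal sketch of that lifecycle with a hypothetical object type:

	static DEFINE_IDA(example_ids);

	struct example { int id; };

	static int example_alloc_id(struct example *ex)
	{
		int id = ida_alloc(&example_ids, GFP_KERNEL);

		if (id < 0)
			return id;
		ex->id = id;
		return 0;
	}

	static void example_free_id(struct example *ex)
	{
		ida_free(&example_ids, ex->id);
	}
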
list_add(&req->rq_list, &xprt->free); } - if (max_alloc > num_prealloc) - xprt->max_reqs = max_alloc; - else - xprt->max_reqs = num_prealloc; + xprt->max_reqs = max_t(unsigned int, max_alloc, num_prealloc); xprt->min_reqs = num_prealloc; xprt->num_reqs = num_prealloc; @@ -1686,8 +1837,10 @@ EXPORT_SYMBOL_GPL(xprt_alloc); void xprt_free(struct rpc_xprt *xprt) { - put_net(xprt->xprt_net); + put_net_track(xprt->xprt_net, &xprt->ns_tracker); xprt_free_all_slots(xprt); + xprt_free_id(xprt); + rpc_sysfs_xprt_destroy(xprt); kfree_rcu(xprt, rcu); } EXPORT_SYMBOL_GPL(xprt_free); @@ -1712,7 +1865,7 @@ xprt_alloc_xid(struct rpc_xprt *xprt) static void xprt_init_xid(struct rpc_xprt *xprt) { - xprt->xid = prandom_u32(); + xprt->xid = get_random_u32(); } static void @@ -1734,8 +1887,8 @@ xprt_request_init(struct rpc_task *task) req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; xprt_init_majortimeo(task, req); - dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, - req, ntohl(req->rq_xid)); + + trace_xprt_reserve(req); } static void @@ -1816,16 +1969,12 @@ void xprt_release(struct rpc_task *task) spin_unlock(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(task); - xprt_inject_disconnect(xprt); - xdr_free_bvec(&req->rq_rcv_buf); - xdr_free_bvec(&req->rq_snd_buf); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); - task->tk_rqstp = NULL; if (req->rq_release_snd_buf) req->rq_release_snd_buf(req); - dprintk("RPC: %5u release request %p\n", task->tk_pid, req); + task->tk_rqstp = NULL; if (likely(!bc_prealloc(req))) xprt->ops->free_slot(xprt, req); else @@ -1878,7 +2027,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) xprt_init_xid(xprt); - xprt->xprt_net = get_net(net); + xprt->xprt_net = get_net_track(net, &xprt->ns_tracker, GFP_KERNEL); } /** @@ -1889,26 +2038,19 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { struct rpc_xprt *xprt; - struct xprt_class *t; + const struct xprt_class *t; - spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (t->ident == args->ident) { - spin_unlock(&xprt_list_lock); - goto found; - } + t = xprt_class_find_by_ident(args->ident); + if (!t) { + dprintk("RPC: transport (%d) not supported\n", args->ident); + return ERR_PTR(-EIO); } - spin_unlock(&xprt_list_lock); - dprintk("RPC: transport (%d) not supported\n", args->ident); - return ERR_PTR(-EIO); -found: xprt = t->setup(args); - if (IS_ERR(xprt)) { - dprintk("RPC: xprt_create_transport: failed, %ld\n", - -PTR_ERR(xprt)); + xprt_class_release(t); + + if (IS_ERR(xprt)) goto out; - } if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT) xprt->idle_timeout = 0; INIT_WORK(&xprt->task_cleanup, xprt_autoclose); @@ -1929,8 +2071,7 @@ found: rpc_xprt_debugfs_register(xprt); - dprintk("RPC: created transport %p with %u slots\n", xprt, - xprt->max_reqs); + trace_xprt_create(xprt); out: return xprt; } @@ -1940,6 +2081,8 @@ static void xprt_destroy_cb(struct work_struct *work) struct rpc_xprt *xprt = container_of(work, struct rpc_xprt, task_cleanup); + trace_xprt_destroy(xprt); + rpc_xprt_debugfs_unregister(xprt); rpc_destroy_wait_queue(&xprt->binding); rpc_destroy_wait_queue(&xprt->pending); @@ -1964,14 +2107,19 @@ static void xprt_destroy_cb(struct work_struct *work) */ static void xprt_destroy(struct rpc_xprt *xprt) { - dprintk("RPC: destroying transport %p\n", xprt); - /* * Exclude transport connect/disconnect handlers and autoclose */ wait_on_bit_lock(&xprt->state, XPRT_LOCKED, 
TASK_UNINTERRUPTIBLE); + /* + * xprt_schedule_autodisconnect() can run after XPRT_LOCKED + * is cleared. We use ->transport_lock to ensure the mod_timer() + * can only run *before* del_time_sync(), never after. + */ + spin_lock(&xprt->transport_lock); del_timer_sync(&xprt->timer); + spin_unlock(&xprt->transport_lock); /* * Destroy sockets etc from the system workqueue so they can @@ -2010,3 +2158,35 @@ void xprt_put(struct rpc_xprt *xprt) kref_put(&xprt->kref, xprt_destroy_kref); } EXPORT_SYMBOL_GPL(xprt_put); + +void xprt_set_offline_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) +{ + if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) { + spin_lock(&xps->xps_lock); + xps->xps_nactive--; + spin_unlock(&xps->xps_lock); + } +} + +void xprt_set_online_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) +{ + if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) { + spin_lock(&xps->xps_lock); + xps->xps_nactive++; + spin_unlock(&xps->xps_lock); + } +} + +void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) +{ + if (test_and_set_bit(XPRT_REMOVE, &xprt->state)) + return; + + xprt_force_disconnect(xprt); + if (!test_bit(XPRT_CONNECTED, &xprt->state)) + return; + + if (!xprt->sending.qlen && !xprt->pending.qlen && + !xprt->backlog.qlen && !atomic_long_read(&xprt->queuelen)) + rpc_xprt_switch_remove_xprt(xps, xprt, true); +} diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 78c075a68c04..701250b305db 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -7,24 +7,27 @@ * Trond Myklebust <trond.myklebust@primarydata.com> * */ +#include <linux/atomic.h> #include <linux/types.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/slab.h> -#include <asm/cmpxchg.h> #include <linux/spinlock.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtmultipath.h> +#include "sysfs.h" + typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct rpc_xprt_switch *xps, const struct rpc_xprt *cur); static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular; static const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin; static const struct rpc_xprt_iter_ops rpc_xprt_iter_listall; +static const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline; static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt) @@ -55,14 +58,16 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) xprt_switch_add_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); + rpc_sysfs_xprt_setup(xps, xprt, GFP_KERNEL); } static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps, - struct rpc_xprt *xprt) + struct rpc_xprt *xprt, bool offline) { if (unlikely(xprt == NULL)) return; - xps->xps_nactive--; + if (!test_bit(XPRT_OFFLINE, &xprt->state) && offline) + xps->xps_nactive--; xps->xps_nxprts--; if (xps->xps_nxprts == 0) xps->xps_net = NULL; @@ -74,18 +79,43 @@ static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps, * rpc_xprt_switch_remove_xprt - Removes an rpc_xprt from a rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * @xprt: pointer to struct rpc_xprt + * @offline: indicates if the xprt that's being removed is in an offline state * * Removes xprt from the list of struct rpc_xprt in xps. 
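
xprt_set_offline_locked() and friends above give multipath administration a way to park a transport without destroying it; the offline-aware iterators added to xprtmultipath.c below then skip it. A hedged sketch of a toggle built on them; the caller is assumed to already hold whatever locking the _locked suffix implies (e.g. the transport write lock):

	static void example_set_state(struct rpc_xprt *xprt,
				      struct rpc_xprt_switch *xps,
				      bool offline)
	{
		if (offline)
			xprt_set_offline_locked(xprt, xps);	/* xps_nactive-- */
		else
			xprt_set_online_locked(xprt, xps);	/* xps_nactive++ */
	}
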
*/ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, - struct rpc_xprt *xprt) + struct rpc_xprt *xprt, bool offline) { spin_lock(&xps->xps_lock); - xprt_switch_remove_xprt_locked(xps, xprt); + xprt_switch_remove_xprt_locked(xps, xprt, offline); spin_unlock(&xps->xps_lock); xprt_put(xprt); } +static DEFINE_IDA(rpc_xprtswitch_ids); + +void xprt_multipath_cleanup_ids(void) +{ + ida_destroy(&rpc_xprtswitch_ids); +} + +static int xprt_switch_alloc_id(struct rpc_xprt_switch *xps, gfp_t gfp_flags) +{ + int id; + + id = ida_alloc(&rpc_xprtswitch_ids, gfp_flags); + if (id < 0) + return id; + + xps->xps_id = id; + return 0; +} + +static void xprt_switch_free_id(struct rpc_xprt_switch *xps) +{ + ida_free(&rpc_xprtswitch_ids, xps->xps_id); +} + /** * xprt_switch_alloc - Allocate a new struct rpc_xprt_switch * @xprt: pointer to struct rpc_xprt @@ -103,12 +133,16 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, if (xps != NULL) { spin_lock_init(&xps->xps_lock); kref_init(&xps->xps_kref); + xprt_switch_alloc_id(xps, gfp_flags); xps->xps_nxprts = xps->xps_nactive = 0; atomic_long_set(&xps->xps_queuelen, 0); xps->xps_net = NULL; INIT_LIST_HEAD(&xps->xps_xprt_list); xps->xps_iter_ops = &rpc_xprt_iter_singular; + rpc_sysfs_xprt_switch_setup(xps, xprt, gfp_flags); xprt_switch_add_xprt_locked(xps, xprt); + xps->xps_nunique_destaddr_xprts = 1; + rpc_sysfs_xprt_setup(xps, xprt, gfp_flags); } return xps; @@ -122,7 +156,7 @@ static void xprt_switch_free_entries(struct rpc_xprt_switch *xps) xprt = list_first_entry(&xps->xps_xprt_list, struct rpc_xprt, xprt_switch); - xprt_switch_remove_xprt_locked(xps, xprt); + xprt_switch_remove_xprt_locked(xps, xprt, true); spin_unlock(&xps->xps_lock); xprt_put(xprt); spin_lock(&xps->xps_lock); @@ -136,6 +170,8 @@ static void xprt_switch_free(struct kref *kref) struct rpc_xprt_switch, xps_kref); xprt_switch_free_entries(xps); + rpc_sysfs_xprt_switch_destroy(xps); + xprt_switch_free_id(xps); kfree_rcu(xps, xps_rcu); } @@ -198,7 +234,8 @@ void xprt_iter_default_rewind(struct rpc_xprt_iter *xpi) static bool xprt_is_active(const struct rpc_xprt *xprt) { - return kref_read(&xprt->kref) != 0; + return (kref_read(&xprt->kref) != 0 && + !test_bit(XPRT_OFFLINE, &xprt->state)); } static @@ -214,6 +251,18 @@ struct rpc_xprt *xprt_switch_find_first_entry(struct list_head *head) } static +struct rpc_xprt *xprt_switch_find_first_entry_offline(struct list_head *head) +{ + struct rpc_xprt *pos; + + list_for_each_entry_rcu(pos, head, xprt_switch) { + if (!xprt_is_active(pos)) + return pos; + } + return NULL; +} + +static struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi) { struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); @@ -224,8 +273,9 @@ struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi) } static -struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head, - const struct rpc_xprt *cur) +struct rpc_xprt *_xprt_switch_find_current_entry(struct list_head *head, + const struct rpc_xprt *cur, + bool find_active) { struct rpc_xprt *pos; bool found = false; @@ -233,14 +283,25 @@ struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head, list_for_each_entry_rcu(pos, head, xprt_switch) { if (cur == pos) found = true; - if (found && xprt_is_active(pos)) + if (found && ((find_active && xprt_is_active(pos)) || + (!find_active && !xprt_is_active(pos)))) return pos; } return NULL; } static -struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) +struct rpc_xprt 
*xprt_switch_find_current_entry(struct list_head *head, + const struct rpc_xprt *cur) +{ + return _xprt_switch_find_current_entry(head, cur, true); +} + +static +struct rpc_xprt * _xprt_iter_current_entry(struct rpc_xprt_iter *xpi, + struct rpc_xprt *first_entry(struct list_head *head), + struct rpc_xprt *current_entry(struct list_head *head, + const struct rpc_xprt *cur)) { struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); struct list_head *head; @@ -249,8 +310,30 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) return NULL; head = &xps->xps_xprt_list; if (xpi->xpi_cursor == NULL || xps->xps_nxprts < 2) - return xprt_switch_find_first_entry(head); - return xprt_switch_find_current_entry(head, xpi->xpi_cursor); + return first_entry(head); + return current_entry(head, xpi->xpi_cursor); +} + +static +struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) +{ + return _xprt_iter_current_entry(xpi, xprt_switch_find_first_entry, + xprt_switch_find_current_entry); +} + +static +struct rpc_xprt *xprt_switch_find_current_entry_offline(struct list_head *head, + const struct rpc_xprt *cur) +{ + return _xprt_switch_find_current_entry(head, cur, false); +} + +static +struct rpc_xprt *xprt_iter_current_entry_offline(struct rpc_xprt_iter *xpi) +{ + return _xprt_iter_current_entry(xpi, + xprt_switch_find_first_entry_offline, + xprt_switch_find_current_entry_offline); } bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, @@ -275,7 +358,7 @@ bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, static struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, - const struct rpc_xprt *cur) + const struct rpc_xprt *cur, bool check_active) { struct rpc_xprt *pos, *prev = NULL; bool found = false; @@ -283,7 +366,12 @@ struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, list_for_each_entry_rcu(pos, head, xprt_switch) { if (cur == prev) found = true; - if (found && xprt_is_active(pos)) + /* for request to return active transports return only + * active, for request to return offline transports + * return only offline + */ + if (found && ((check_active && xprt_is_active(pos)) || + (!check_active && !xprt_is_active(pos)))) return pos; prev = pos; } @@ -320,7 +408,7 @@ struct rpc_xprt *__xprt_switch_find_next_entry_roundrobin(struct list_head *head { struct rpc_xprt *ret; - ret = xprt_switch_find_next_entry(head, cur); + ret = xprt_switch_find_next_entry(head, cur, true); if (ret != NULL) return ret; return xprt_switch_find_first_entry(head); @@ -362,7 +450,14 @@ static struct rpc_xprt *xprt_switch_find_next_entry_all(struct rpc_xprt_switch *xps, const struct rpc_xprt *cur) { - return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur); + return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, true); +} + +static +struct rpc_xprt *xprt_switch_find_next_entry_offline(struct rpc_xprt_switch *xps, + const struct rpc_xprt *cur) +{ + return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, false); } static @@ -372,6 +467,13 @@ struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi) xprt_switch_find_next_entry_all); } +static +struct rpc_xprt *xprt_iter_next_entry_offline(struct rpc_xprt_iter *xpi) +{ + return xprt_iter_next_entry_multiple(xpi, + xprt_switch_find_next_entry_offline); +} + /* * xprt_iter_rewind - Resets the xprt iterator * @xpi: pointer to rpc_xprt_iter @@ -379,7 +481,6 @@ struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi) * Resets xpi to ensure that it points to the first 
entry in the list * of transports. */ -static void xprt_iter_rewind(struct rpc_xprt_iter *xpi) { rcu_read_lock(); @@ -425,6 +526,12 @@ void xprt_iter_init_listall(struct rpc_xprt_iter *xpi, __xprt_iter_init(xpi, xps, &rpc_xprt_iter_listall); } +void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi, + struct rpc_xprt_switch *xps) +{ + __xprt_iter_init(xpi, xps, &rpc_xprt_iter_listoffline); +} + /** * xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch * @xpi: pointer to rpc_xprt_iter @@ -539,3 +646,10 @@ const struct rpc_xprt_iter_ops rpc_xprt_iter_listall = { .xpi_xprt = xprt_iter_current_entry, .xpi_next = xprt_iter_next_entry_all, }; + +static +const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline = { + .xpi_rewind = xprt_iter_default_rewind, + .xpi_xprt = xprt_iter_current_entry_offline, + .xpi_next = xprt_iter_next_entry_offline, +}; diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 8ed0377d7a18..55b21bae866d 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ - module.o + svc_rdma_pcl.o module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 1a0ae0c61353..e4d84a13c566 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015-2020, Oracle and/or its affiliates. * - * Support for backward direction RPCs on RPC/RDMA. + * Support for reverse-direction RPCs on RPC/RDMA. 
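+ *
+ * In the reverse direction the client acts as a server: RPC Calls
+ * arrive on an already-established forward connection and are
+ * queued to the upper layer's callback service (xprt->bc_serv).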
*/ #include <linux/sunrpc/xprt.h> @@ -13,10 +13,6 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - #undef RPCRDMA_BACKCHANNEL_DEBUG /** @@ -44,10 +40,10 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; size_t maxmsg; - maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv); + maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv); maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE); return maxmsg - RPCRDMA_HDRLEN_MIN; } @@ -82,7 +78,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) &rqst->rq_snd_buf, rpcrdma_noch_pullup)) return -EIO; - trace_xprtrdma_cb_reply(rqst); + trace_xprtrdma_cb_reply(r_xprt, rqst); return 0; } @@ -115,7 +111,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (rc < 0) goto failed_marshal; - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + if (frwr_send(r_xprt, req)) goto drop_connection; return 0; @@ -155,9 +151,11 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) { struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_rep *rep = req->rl_reply; struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - rpcrdma_recv_buffer_put(req->rl_reply); + rpcrdma_rep_put(&r_xprt->rx_buf, rep); req->rl_reply = NULL; spin_lock(&xprt->bc_pa_lock); @@ -190,8 +188,8 @@ create_req: if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS) return NULL; - size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE); - req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); + size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE); + req = rpcrdma_req_create(r_xprt, size); if (!req) return NULL; if (rpcrdma_req_setup(r_xprt, req)) { @@ -208,7 +206,7 @@ create_req: } /** - * rpcrdma_bc_receive_call - Handle a backward direction call + * rpcrdma_bc_receive_call - Handle a reverse-direction Call * @r_xprt: transport receiving the call * @rep: receive buffer containing the call * @@ -260,7 +258,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, */ req = rpcr_to_rdmar(rqst); req->rl_reply = rep; - trace_xprtrdma_cb_call(rqst); + trace_xprtrdma_cb_call(r_xprt, rqst); /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 125297c9aa3e..ffbf99894970 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -40,51 +40,56 @@ * New MRs are created on demand. 
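 *
 * A rough life-cycle sketch, using the names in this file:
 * frwr_mr_init() allocates the ib_mr and scatterlist, frwr_map()
 * builds a FastReg WR that frwr_send() chains ahead of the Send WR,
 * and a LocalInv WR (or remote invalidation) later returns the MR
 * to its request's free list via frwr_mr_put().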
*/ -#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif +static void frwr_cid_init(struct rpcrdma_ep *ep, + struct rpcrdma_mr *mr) +{ + struct rpc_rdma_cid *cid = &mr->mr_cid; + + cid->ci_queue_id = ep->re_attr.send_cq->res.id; + cid->ci_completion_id = mr->mr_ibmr->res.id; +} + +static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) +{ + if (mr->mr_device) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + mr->mr_device = NULL; + } +} /** - * frwr_release_mr - Destroy one MR - * @mr: MR allocated by frwr_init_mr + * frwr_mr_release - Destroy one MR + * @mr: MR allocated by frwr_mr_init * */ -void frwr_release_mr(struct rpcrdma_mr *mr) +void frwr_mr_release(struct rpcrdma_mr *mr) { int rc; - rc = ib_dereg_mr(mr->frwr.fr_mr); + frwr_mr_unmap(mr->mr_xprt, mr); + + rc = ib_dereg_mr(mr->mr_ibmr); if (rc) trace_xprtrdma_frwr_dereg(mr, rc); kfree(mr->mr_sg); kfree(mr); } -static void frwr_mr_recycle(struct rpcrdma_mr *mr) +static void frwr_mr_put(struct rpcrdma_mr *mr) { - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - trace_xprtrdma_mr_recycle(mr); - - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; - } - - spin_lock(&r_xprt->rx_buf.rb_lock); - list_del(&mr->mr_all); - r_xprt->rx_stats.mrs_recycled++; - spin_unlock(&r_xprt->rx_buf.rb_lock); + frwr_mr_unmap(mr->mr_xprt, mr); - frwr_release_mr(mr); + /* The MR is returned to the req's MR free list instead + * of to the xprt's MR free list. No spinlock is needed. + */ + rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); } /* frwr_reset - Place MRs back on the free list @@ -102,76 +107,70 @@ void frwr_reset(struct rpcrdma_req *req) struct rpcrdma_mr *mr; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** - * frwr_init_mr - Initialize one MR - * @ia: interface adapter + * frwr_mr_init - Initialize one MR + * @r_xprt: controlling transport instance * @mr: generic MR to prepare for FRWR * * Returns zero if successful. Otherwise a negative errno * is returned. 
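 *
 * On failure the scatterlist allocated here is freed before
 * returning; on success, both it and the ib_mr are released
 * later by frwr_mr_release().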
*/ -int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - unsigned int depth = ia->ri_max_frwr_depth; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + unsigned int depth = ep->re_max_fr_depth; struct scatterlist *sg; struct ib_mr *frmr; - int rc; - frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + sg = kcalloc_node(depth, sizeof(*sg), XPRTRDMA_GFP_FLAGS, + ibdev_to_node(ep->re_id->device)); + if (!sg) + return -ENOMEM; + + frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth); if (IS_ERR(frmr)) goto out_mr_err; - sg = kcalloc(depth, sizeof(*sg), GFP_NOFS); - if (!sg) - goto out_list_err; - - mr->frwr.fr_mr = frmr; - mr->mr_dir = DMA_NONE; + mr->mr_xprt = r_xprt; + mr->mr_ibmr = frmr; + mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); - init_completion(&mr->frwr.fr_linv_done); + init_completion(&mr->mr_linv_done); + frwr_cid_init(ep, mr); sg_init_table(sg, depth); mr->mr_sg = sg; return 0; out_mr_err: - rc = PTR_ERR(frmr); - trace_xprtrdma_frwr_alloc(mr, rc); - return rc; - -out_list_err: - ib_dereg_mr(frmr); - return -ENOMEM; + kfree(sg); + trace_xprtrdma_frwr_alloc(mr, PTR_ERR(frmr)); + return PTR_ERR(frmr); } /** * frwr_query_device - Prepare a transport for use with FRWR - * @r_xprt: controlling transport instance + * @ep: endpoint to fill in * @device: RDMA device to query * * On success, sets: - * ep->rep_attr - * ep->rep_max_requests - * ia->ri_max_rdma_segs - * - * And these FRWR-related fields: - * ia->ri_max_frwr_depth - * ia->ri_mrtype + * ep->re_attr + * ep->re_max_requests + * ep->re_max_rdma_segs + * ep->re_max_fr_depth + * ep->re_mrtype * * Return values: * On success, returns zero. * %-EINVAL - the device does not support FRWR memory registration * %-ENOMEM - the device is not sufficiently capable for NFS/RDMA */ -int frwr_query_device(struct rpcrdma_xprt *r_xprt, - const struct ib_device *device) +int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) { const struct ib_device_attr *attrs = &device->attrs; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; int max_qp_wr, depth, delta; unsigned int max_sge; @@ -188,23 +187,23 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge); return -ENOMEM; } - ep->rep_attr.cap.max_send_sge = max_sge; - ep->rep_attr.cap.max_recv_sge = 1; + ep->re_attr.cap.max_send_sge = max_sge; + ep->re_attr.cap.max_recv_sge = 1; - ia->ri_mrtype = IB_MR_TYPE_MEM_REG; - if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) - ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; + ep->re_mrtype = IB_MR_TYPE_MEM_REG; + if (attrs->kernel_cap_flags & IBK_SG_GAPS_REG) + ep->re_mrtype = IB_MR_TYPE_SG_GAPS; /* Quirk: Some devices advertise a large max_fast_reg_page_list_len * capability, but perform optimally when the MRs are not larger * than a page. */ if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS) - ia->ri_max_frwr_depth = attrs->max_sge_rd; + ep->re_max_fr_depth = attrs->max_sge_rd; else - ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len; - if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS) - ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS; + ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len; + if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS) + ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS; /* Add room for frwr register and invalidate WRs. * 1. 
FRWR reg WR for head @@ -220,11 +219,11 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, /* Calculate N if the device max FRWR depth is smaller than * RPCRDMA_MAX_DATA_SEGS. */ - if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) { - delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth; + if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth; do { depth += 2; /* FRWR reg + invalidate */ - delta -= ia->ri_max_frwr_depth; + delta -= ep->re_max_fr_depth; } while (delta > 0); } @@ -233,34 +232,35 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, max_qp_wr -= 1; if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) return -ENOMEM; - if (ep->rep_max_requests > max_qp_wr) - ep->rep_max_requests = max_qp_wr; - ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; - if (ep->rep_attr.cap.max_send_wr > max_qp_wr) { - ep->rep_max_requests = max_qp_wr / depth; - if (!ep->rep_max_requests) + if (ep->re_max_requests > max_qp_wr) + ep->re_max_requests = max_qp_wr; + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; + if (ep->re_attr.cap.max_send_wr > max_qp_wr) { + ep->re_max_requests = max_qp_wr / depth; + if (!ep->re_max_requests) return -ENOMEM; - ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; } - ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ - ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests; - ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ - - ia->ri_max_rdma_segs = - DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth); + ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ + ep->re_attr.cap.max_recv_wr = ep->re_max_requests; + ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH; + ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ + + ep->re_max_rdma_segs = + DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth); /* Reply chunks require segments for head and tail buffers */ - ia->ri_max_rdma_segs += 2; - if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) - ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; + ep->re_max_rdma_segs += 2; + if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) + ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; /* Ensure the underlying device is capable of conveying the * largest r/wsize NFS will ask for. This guarantees that * failing over from one RDMA device to another will not * break NFS I/O. 
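 *
 * A worked example, assuming 4KB pages (so RPCRDMA_MAX_DATA_SEGS
 * works out to 256) and a device depth of 128: re_max_rdma_segs
 * becomes DIV_ROUND_UP(256, 128) + 2 = 4 segments, before the
 * RPCRDMA_MAX_HDR_SEGS cap is applied.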
*/ - if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS) + if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS) return -ENOMEM; return 0; @@ -286,41 +286,36 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_reg_wr *reg_wr; int i, n, dma_nents; struct ib_mr *ibmr; u8 key; - if (nsegs > ia->ri_max_frwr_depth) - nsegs = ia->ri_max_frwr_depth; + if (nsegs > ep->re_max_fr_depth) + nsegs = ep->re_max_fr_depth; for (i = 0; i < nsegs;) { - if (seg->mr_page) - sg_set_page(&mr->mr_sg[i], - seg->mr_page, - seg->mr_len, - offset_in_page(seg->mr_offset)); - else - sg_set_buf(&mr->mr_sg[i], seg->mr_offset, - seg->mr_len); + sg_set_page(&mr->mr_sg[i], seg->mr_page, + seg->mr_len, seg->mr_offset); ++seg; ++i; - if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS) + if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) continue; - if ((i < nsegs && offset_in_page(seg->mr_offset)) || + if ((i < nsegs && seg->mr_offset) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } mr->mr_dir = rpcrdma_data_dir(writing); mr->mr_nents = i; - dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents, + dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); if (!dma_nents) goto out_dmamap_err; + mr->mr_device = ep->re_id->device; - ibmr = mr->frwr.fr_mr; + ibmr = mr->mr_ibmr; n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); if (n != dma_nents) goto out_mapmr_err; @@ -330,7 +325,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, key = (u8)(ibmr->rkey & 0x000000FF); ib_update_fast_reg_key(ibmr, ++key); - reg_wr = &mr->frwr.fr_regwr; + reg_wr = &mr->mr_regwr; reg_wr->mr = ibmr; reg_wr->key = ibmr->rkey; reg_wr->access = writing ? @@ -345,7 +340,6 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, return seg; out_dmamap_err: - mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); return ERR_PTR(-EIO); @@ -356,54 +350,73 @@ out_mapmr_err: /** * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed FastReg WR * + * Each flushed MR gets destroyed after the QP has drained. */ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_fastreg(wc, frwr); - /* The MR will get recycled when the associated req is retransmitted */ + trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid); + + rpcrdma_flush_disconnect(cq->cq_context, wc); } /** - * frwr_send - post Send WR containing the RPC Call message - * @ia: interface adapter - * @req: Prepared RPC Call + * frwr_send - post Send WRs containing the RPC Call message + * @r_xprt: controlling transport instance + * @req: prepared RPC Call * * For FRWR, chain any FastReg WRs to the Send WR. Only a * single ib_post_send call is needed to register memory * and then post the Send WR. * - * Returns the result of ib_post_send. + * Returns the return code from ib_post_send. + * + * Caller must hold the transport send lock to ensure that the + * pointers to the transport's rdma_cm_id and QP are stable. 
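+ *
+ * Note the completion batching below: the Send is marked SIGNALED
+ * only when the chain would overrun re_send_count or the request
+ * is still referenced elsewhere, trading fewer completion
+ * interrupts for coarser Send Queue accounting.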
*/ -int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { - struct ib_send_wr *post_wr; + struct ib_send_wr *post_wr, *send_wr = &req->rl_wr; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rpcrdma_mr *mr; + unsigned int num_wrs; + int ret; - post_wr = &req->rl_wr; + num_wrs = 1; + post_wr = send_wr; list_for_each_entry(mr, &req->rl_registered, mr_list) { - struct rpcrdma_frwr *frwr; - - frwr = &mr->frwr; - - frwr->fr_cqe.done = frwr_wc_fastreg; - frwr->fr_regwr.wr.next = post_wr; - frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; - frwr->fr_regwr.wr.num_sge = 0; - frwr->fr_regwr.wr.opcode = IB_WR_REG_MR; - frwr->fr_regwr.wr.send_flags = 0; + trace_xprtrdma_mr_fastreg(mr); + + mr->mr_cqe.done = frwr_wc_fastreg; + mr->mr_regwr.wr.next = post_wr; + mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe; + mr->mr_regwr.wr.num_sge = 0; + mr->mr_regwr.wr.opcode = IB_WR_REG_MR; + mr->mr_regwr.wr.send_flags = 0; + post_wr = &mr->mr_regwr.wr; + ++num_wrs; + } - post_wr = &frwr->fr_regwr.wr; + if ((kref_read(&req->rl_kref) > 1) || num_wrs > ep->re_send_count) { + send_wr->send_flags |= IB_SEND_SIGNALED; + ep->re_send_count = min_t(unsigned int, ep->re_send_batch, + num_wrs - ep->re_send_count); + } else { + send_wr->send_flags &= ~IB_SEND_SIGNALED; + ep->re_send_count -= num_wrs; } - return ib_post_send(ia->ri_id->qp, post_wr, NULL); + trace_xprtrdma_post_send(req); + ret = ib_post_send(ep->re_id->qp, post_wr, NULL); + if (ret) + trace_xprtrdma_post_send_err(r_xprt, req, ret); + return ret; } /** @@ -419,56 +432,54 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_mr_remoteinv(mr); - rpcrdma_mr_put(mr); + trace_xprtrdma_mr_reminv(mr); + frwr_mr_put(mr); break; /* only one invalidated MR per RPC */ } } -static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) +static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr) { - if (wc->status != IB_WC_SUCCESS) - frwr_mr_recycle(mr); - else - rpcrdma_mr_put(mr); + if (likely(wc->status == IB_WC_SUCCESS)) + frwr_mr_put(mr); } /** * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * */ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li(wc, &mr->mr_cid); + frwr_mr_done(wc, mr); + + rpcrdma_flush_disconnect(cq->cq_context, wc); } /** * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * * Awaken anyone waiting for an MR to finish being fenced. 
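 *
 * The complete() here pairs with the wait_for_completion() on
 * mr_linv_done in frwr_unmap_sync().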
*/ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_wake(wc, frwr); - __frwr_release_mr(wc, mr); - complete(&frwr->fr_linv_done); + trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid); + frwr_mr_done(wc, mr); + complete(&mr->mr_linv_done); + + rpcrdma_flush_disconnect(cq->cq_context, wc); } /** @@ -485,8 +496,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, **prev, *last; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; - struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; int rc; @@ -495,85 +506,86 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ - frwr = NULL; prev = &first; - while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { - + mr = rpcrdma_mr_pop(&req->rl_registered); + do { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; - frwr = &mr->frwr; - frwr->fr_cqe.done = frwr_wc_localinv; - last = &frwr->fr_invwr; + last = &mr->mr_invwr; last->next = NULL; - last->wr_cqe = &frwr->fr_cqe; + last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; last->num_sge = 0; last->opcode = IB_WR_LOCAL_INV; last->send_flags = IB_SEND_SIGNALED; last->ex.invalidate_rkey = mr->mr_handle; + last->wr_cqe->done = frwr_wc_localinv; + *prev = last; prev = &last->next; - } + } while ((mr = rpcrdma_mr_pop(&req->rl_registered))); + + mr = container_of(last, struct rpcrdma_mr, mr_invwr); /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain * are complete. */ - frwr->fr_cqe.done = frwr_wc_localinv_wake; - reinit_completion(&frwr->fr_linv_done); + last->wr_cqe->done = frwr_wc_localinv_wake; + reinit_completion(&mr->mr_linv_done); /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us - * unless ri_id->qp is a valid pointer. + * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will * not happen, so don't wait in that case. */ if (bad_wr != first) - wait_for_completion(&frwr->fr_linv_done); + wait_for_completion(&mr->mr_linv_done); if (!rc) return; - /* Recycle MRs in the LOCAL_INV chain that did not get posted. + /* On error, the MRs get destroyed once the QP has drained. */ + trace_xprtrdma_post_linv_err(req, rc); + + /* Force a connection loss to ensure complete recovery. 
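+ * The disconnect drains the QP, flushing any LocalInv WRs that
+ * are still posted before the MRs are torn down and rebuilt.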
*/ - trace_xprtrdma_post_linv(req, rc); - while (bad_wr) { - frwr = container_of(bad_wr, struct rpcrdma_frwr, - fr_invwr); - mr = container_of(frwr, struct rpcrdma_mr, frwr); - bad_wr = bad_wr->next; - - list_del_init(&mr->mr_list); - frwr_mr_recycle(mr); - } + rpcrdma_force_disconnect(ep); } /** * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * */ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); - struct rpcrdma_rep *rep = mr->mr_req->rl_reply; + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); + struct rpcrdma_rep *rep; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_done(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li_done(wc, &mr->mr_cid); - /* Ensure @rep is generated before __frwr_release_mr */ + /* Ensure that @rep is generated before the MR is released */ + rep = mr->mr_req->rl_reply; smp_rmb(); + + if (wc->status != IB_WC_SUCCESS) { + if (rep) + rpcrdma_unpin_rqst(rep); + rpcrdma_flush_disconnect(cq->cq_context, wc); + return; + } + frwr_mr_put(mr); rpcrdma_complete_rqst(rep); } @@ -590,66 +602,95 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, *last, **prev; - const struct ib_send_wr *bad_wr; - struct rpcrdma_frwr *frwr; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rpcrdma_mr *mr; int rc; /* Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ - frwr = NULL; prev = &first; - while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { - + mr = rpcrdma_mr_pop(&req->rl_registered); + do { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; - frwr = &mr->frwr; - frwr->fr_cqe.done = frwr_wc_localinv; - last = &frwr->fr_invwr; + last = &mr->mr_invwr; last->next = NULL; - last->wr_cqe = &frwr->fr_cqe; + last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; last->num_sge = 0; last->opcode = IB_WR_LOCAL_INV; last->send_flags = IB_SEND_SIGNALED; last->ex.invalidate_rkey = mr->mr_handle; + last->wr_cqe->done = frwr_wc_localinv; + *prev = last; prev = &last->next; - } + } while ((mr = rpcrdma_mr_pop(&req->rl_registered))); /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain * are complete. The last completion will wake up the * RPC waiter. */ - frwr->fr_cqe.done = frwr_wc_localinv_done; + last->wr_cqe->done = frwr_wc_localinv_done; /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us - * unless ri_id->qp is a valid pointer. + * unless re_id->qp is a valid pointer. */ - bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, NULL); if (!rc) return; - /* Recycle MRs in the LOCAL_INV chain that did not get posted. - */ - trace_xprtrdma_post_linv(req, rc); - while (bad_wr) { - frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); - mr = container_of(frwr, struct rpcrdma_mr, frwr); - bad_wr = bad_wr->next; - - frwr_mr_recycle(mr); - } + /* On error, the MRs get destroyed once the QP has drained. 
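+ * Unlike frwr_unmap_sync(), no waiter is blocked on mr_linv_done
+ * in this path.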
*/ + trace_xprtrdma_post_linv_err(req, rc); /* The final LOCAL_INV WR in the chain is supposed to - * do the wake. If it was never posted, the wake will - * not happen, so wake here in that case. + * do the wake. If it was never posted, the wake does + * not happen. Unpin the rqst in preparation for its + * retransmission. + */ + rpcrdma_unpin_rqst(req->rl_reply); + + /* Force a connection loss to ensure complete recovery. */ - rpcrdma_complete_rqst(req->rl_reply); + rpcrdma_force_disconnect(ep); +} + +/** + * frwr_wp_create - Create an MR for padding Write chunks + * @r_xprt: transport resources to use + * + * Return 0 on success, negative errno on failure. + */ +int frwr_wp_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rpcrdma_mr_seg seg; + struct rpcrdma_mr *mr; + + mr = rpcrdma_mr_get(r_xprt); + if (!mr) + return -EAGAIN; + mr->mr_req = NULL; + ep->re_write_pad_mr = mr; + + seg.mr_len = XDR_UNIT; + seg.mr_page = virt_to_page(ep->re_write_pad); + seg.mr_offset = offset_in_page(ep->re_write_pad); + if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr))) + return -EIO; + trace_xprtrdma_mr_fastreg(mr); + + mr->mr_cqe.done = frwr_wc_fastreg; + mr->mr_regwr.wr.next = NULL; + mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe; + mr->mr_regwr.wr.num_sge = 0; + mr->mr_regwr.wr.opcode = IB_WR_REG_MR; + mr->mr_regwr.wr.send_flags = 0; + + return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL); } diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 620327c01302..45c5b41ac8dc 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("svcrdma"); MODULE_ALIAS("xprtrdma"); +MODULE_ALIAS("rpcrdma6"); static void __exit rpc_rdma_cleanup(void) { diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 28020ec104d4..190a4de239c8 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2014-2020, Oracle and/or its affiliates. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -54,10 +54,6 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - /* Returns size of largest RPC-over-RDMA header in a Call message * * The largest Call header contains a full-size Read list and a @@ -71,7 +67,7 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Read list size */ - size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); + size += maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ size += sizeof(__be32); /* segment count */ @@ -94,7 +90,7 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Write list size */ - size = sizeof(__be32); /* segment count */ + size += sizeof(__be32); /* segment count */ size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ @@ -103,21 +99,20 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) /** * rpcrdma_set_max_header_sizes - Initialize inline payload sizes - * @r_xprt: transport instance to initialize + * @ep: endpoint to initialize * * The max_inline fields contain the maximum size of an RPC message * so the marshaling code doesn't have to repeat this calculation * for every RPC. */ -void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) +void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep) { - unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + unsigned int maxsegs = ep->re_max_rdma_segs; - ep->rep_max_inline_send = - ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs); - ep->rep_max_inline_recv = - ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs); + ep->re_max_inline_send = + ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs); + ep->re_max_inline_recv = + ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs); } /* The client can send a request inline as long as the RPCRDMA header @@ -132,9 +127,10 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { struct xdr_buf *xdr = &rqst->rq_snd_buf; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned int count, remaining, offset; - if (xdr->len > r_xprt->rx_ep.rep_max_inline_send) + if (xdr->len > ep->re_max_inline_send) return false; if (xdr->page_len) { @@ -145,7 +141,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, remaining -= min_t(unsigned int, PAGE_SIZE - offset, remaining); offset = 0; - if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge) + if (++count > ep->re_attr.cap.max_send_sge) return false; } } @@ -162,7 +158,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv; + return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv; } /* The client is required to provide a Reply chunk if the maximum @@ -176,12 +172,35 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, const struct xdr_buf *buf = &rqst->rq_rcv_buf; return (buf->head[0].iov_len + buf->tail[0].iov_len) < - r_xprt->rx_ep.rep_max_inline_recv; + r_xprt->rx_ep->re_max_inline_recv; +} + +/* ACL likes to be lazy in allocating pages. For TCP, these + * pages can be allocated during receive processing. 
Not true + * for RDMA, which must always provision receive buffers + * up front. + */ +static noinline int +rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) +{ + struct page **ppages; + int len; + + len = buf->page_len; + ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); + while (len > 0) { + if (!*ppages) + *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + if (!*ppages) + return -ENOBUFS; + ppages++; + len -= PAGE_SIZE; + } + + return 0; } -/* Split @vec on page boundaries into SGEs. FMR registers pages, not - * a byte range. Other modes coalesce these SGEs into a single MR - * when they can. +/* Convert @vec to a single SGL element. * * Returns pointer to next available SGE, and bumps the total number * of SGEs consumed. @@ -190,22 +209,11 @@ static struct rpcrdma_mr_seg * rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, unsigned int *n) { - u32 remaining, page_offset; - char *base; - - base = vec->iov_base; - page_offset = offset_in_page(base); - remaining = vec->iov_len; - while (remaining) { - seg->mr_page = NULL; - seg->mr_offset = base; - seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); - remaining -= seg->mr_len; - base += seg->mr_len; - ++seg; - ++(*n); - page_offset = 0; - } + seg->mr_page = virt_to_page(vec->iov_base); + seg->mr_offset = offset_in_page(vec->iov_base); + seg->mr_len = vec->iov_len; + ++seg; + ++(*n); return seg; } @@ -233,17 +241,8 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); while (len) { - /* ACL likes to be lazy in allocating pages - ACLs - * are small by default but can get huge. - */ - if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { - if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); - if (!*ppages) - return -ENOBUFS; - } seg->mr_page = *ppages; - seg->mr_offset = (char *)page_base; + seg->mr_offset = page_base; seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); len -= seg->mr_len; ++ppages; @@ -252,22 +251,11 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, page_base = 0; } - /* When encoding a Read chunk, the tail iovec contains an - * XDR pad and may be omitted. - */ - if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) - goto out; - - /* When encoding a Write chunk, some servers need to see an - * extra segment for non-XDR-aligned Write chunks. The upper - * layer provides space in the tail iovec that may be used - * for this purpose. 
- */ - if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) + if (type == rpcrdma_readch || type == rpcrdma_writech) goto out; if (xdrbuf->tail[0].iov_len) - seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); + rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); out: if (unlikely(n > RPCRDMA_MAX_SEGS)) @@ -275,40 +263,6 @@ out: return n; } -static inline int -encode_item_present(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, sizeof(*p)); - if (unlikely(!p)) - return -EMSGSIZE; - - *p = xdr_one; - return 0; -} - -static inline int -encode_item_not_present(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, sizeof(*p)); - if (unlikely(!p)) - return -EMSGSIZE; - - *p = xdr_zero; - return 0; -} - -static void -xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) -{ - *iptr++ = cpu_to_be32(mr->mr_handle); - *iptr++ = cpu_to_be32(mr->mr_length); - xdr_encode_hyper(iptr, mr->mr_offset); -} - static int encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) { @@ -318,7 +272,7 @@ encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) if (unlikely(!p)) return -EMSGSIZE; - xdr_encode_rdma_segment(p, mr); + xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset); return 0; } @@ -333,8 +287,8 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return -EMSGSIZE; *p++ = xdr_one; /* Item present */ - *p++ = cpu_to_be32(position); - xdr_encode_rdma_segment(p, mr); + xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length, + mr->mr_offset); return 0; } @@ -349,7 +303,6 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, *mr = rpcrdma_mr_get(r_xprt); if (!*mr) goto out_getmr_err; - trace_xprtrdma_mr_get(req); (*mr)->mr_req = req; } @@ -357,7 +310,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); out_getmr_err: - trace_xprtrdma_nomrs(req); + trace_xprtrdma_nomrs_err(r_xprt, req); xprt_wait_for_buffer_space(&r_xprt->rx_xprt); rpcrdma_mrs_refresh(r_xprt); return ERR_PTR(-EAGAIN); @@ -414,7 +367,9 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, } while (nsegs); done: - return encode_item_not_present(xdr); + if (xdr_stream_encode_item_absent(xdr) < 0) + return -EMSGSIZE; + return 0; } /* Register and XDR encode the Write list. 
Supports encoding a list @@ -438,6 +393,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rpcrdma_mr_seg *seg; struct rpcrdma_mr *mr; int nsegs, nchunks; @@ -453,7 +409,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, if (nsegs < 0) return nsegs; - if (encode_item_present(xdr) < 0) + if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; segcount = xdr_reserve_space(xdr, sizeof(*segcount)); if (unlikely(!segcount)) @@ -476,11 +432,25 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, nsegs -= mr->mr_nents; } while (nsegs); + if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) { + if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0) + return -EMSGSIZE; + + trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr, + nsegs); + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; + nchunks++; + nsegs -= mr->mr_nents; + } + /* Update count of segments in this Write chunk */ *segcount = cpu_to_be32(nchunks); done: - return encode_item_not_present(xdr); + if (xdr_stream_encode_item_absent(xdr) < 0) + return -EMSGSIZE; + return 0; } /* Register and XDR encode the Reply chunk. Supports encoding an array @@ -506,15 +476,18 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, int nsegs, nchunks; __be32 *segcount; - if (wtype != rpcrdma_replych) - return encode_item_not_present(xdr); + if (wtype != rpcrdma_replych) { + if (xdr_stream_encode_item_absent(xdr) < 0) + return -EMSGSIZE; + return 0; + } seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) return nsegs; - if (encode_item_present(xdr) < 0) + if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; segcount = xdr_reserve_space(xdr, sizeof(*segcount)); if (unlikely(!segcount)) @@ -894,6 +867,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) __be32 *p; int ret; + if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) { + ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf); + if (ret) + return ret; + } + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), rqst); @@ -911,8 +890,8 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) * or privacy, direct data placement of individual data items * is not allowed. */ - ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & - RPCAUTH_AUTH_DATATOUCH); + ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH, + &rqst->rq_cred->cr_auth->au_flags); /* * Chunks needed for results? @@ -1142,6 +1121,7 @@ static bool rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) #if defined(CONFIG_SUNRPC_BACKCHANNEL) { + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct xdr_stream *xdr = &rep->rr_stream; __be32 *p; @@ -1152,11 +1132,11 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) p = xdr_inline_decode(xdr, 0); /* Chunk lists */ - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; /* RPC header */ @@ -1165,19 +1145,19 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) if (*p != cpu_to_be32(RPC_CALL)) return false; + /* No bc service. 
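+ * XID matching will fail for this Call, so the reply handler
+ * simply drops the Receive.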
*/ + if (xprt->bc_serv == NULL) + return false; + /* Now that we are sure this is a backchannel call, * advance to the RPC header. */ p = xdr_inline_decode(xdr, 3 * sizeof(*p)); if (unlikely(!p)) - goto out_short; + return true; rpcrdma_bc_receive_call(r_xprt, rep); return true; - -out_short: - pr_warn("RPC/RDMA short backward direction call\n"); - return true; } #else /* CONFIG_SUNRPC_BACKCHANNEL */ { @@ -1195,10 +1175,7 @@ static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) if (unlikely(!p)) return -EIO; - handle = be32_to_cpup(p++); - *length = be32_to_cpup(p++); - xdr_decode_hyper(p, &offset); - + xdr_decode_rdma_segment(p, &handle, length, &offset); trace_xprtrdma_decode_seg(handle, *length, offset); return 0; } @@ -1234,7 +1211,7 @@ static int decode_read_list(struct xdr_stream *xdr) p = xdr_inline_decode(xdr, sizeof(*p)); if (unlikely(!p)) return -EIO; - if (unlikely(*p != xdr_zero)) + if (unlikely(xdr_item_is_present(p))) return -EIO; return 0; } @@ -1253,7 +1230,7 @@ static int decode_write_list(struct xdr_stream *xdr, u32 *length) p = xdr_inline_decode(xdr, sizeof(*p)); if (unlikely(!p)) return -EIO; - if (*p == xdr_zero) + if (xdr_item_is_absent(p)) break; if (!first) return -EIO; @@ -1275,7 +1252,7 @@ static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) return -EIO; *length = 0; - if (*p != xdr_zero) + if (xdr_item_is_present(p)) if (decode_write_chunk(xdr, length)) return -EIO; return 0; @@ -1352,29 +1329,47 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) break; - dprintk("RPC: %s: server reports " - "version error (%u-%u), xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(*(p + 1)), - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_vers(rqst, p, p + 1); break; case err_chunk: - dprintk("RPC: %s: server reports " - "header decoding error, xid %08x\n", __func__, - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_chunk(rqst); break; default: - dprintk("RPC: %s: server reports " - "unrecognized error %d, xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_unrecognized(rqst, p); } - r_xprt->rx_stats.bad_reply_count++; - return -EREMOTEIO; + return -EIO; +} + +/** + * rpcrdma_unpin_rqst - Release rqst without completing it + * @rep: RPC/RDMA Receive context + * + * This is done when a connection is lost so that a Reply + * can be dropped and its matching Call can be subsequently + * retransmitted on a new connection. + */ +void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep) +{ + struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt; + struct rpc_rqst *rqst = rep->rr_rqst; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + + req->rl_reply = NULL; + rep->rr_rqst = NULL; + + spin_lock(&xprt->queue_lock); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->queue_lock); } -/* Perform XID lookup, reconstruction of the RPC reply, and - * RPC completion while holding the transport lock to ensure - * the rep, rqst, and rq_task pointers remain stable. +/** + * rpcrdma_complete_rqst - Pass completed rqst back to RPC + * @rep: RPC/RDMA Receive context + * + * Reconstruct the RPC reply and complete the transaction + * while @rqst is still pinned to ensure the rep, rqst, and + * rq_task pointers remain stable. 
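+ *
+ * The matching pin is taken in rpcrdma_reply_handler() and
+ * released via xprt_unpin_rqst() under the xprt's queue_lock.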
*/ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) { @@ -1406,13 +1401,11 @@ out: spin_unlock(&xprt->queue_lock); return; -/* If the incoming reply terminated a pending RPC, the next - * RPC call will post a replacement receive buffer as it is - * being marshaled. - */ out_badheader: - trace_xprtrdma_reply_hdr(rep); + trace_xprtrdma_reply_hdr_err(rep); r_xprt->rx_stats.bad_reply_count++; + rqst->rq_task->tk_status = status; + status = 0; goto out; } @@ -1476,21 +1469,20 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (credits == 0) credits = 1; /* don't deadlock */ - else if (credits > r_xprt->rx_ep.rep_max_requests) - credits = r_xprt->rx_ep.rep_max_requests; + else if (credits > r_xprt->rx_ep->re_max_requests) + credits = r_xprt->rx_ep->re_max_requests; + rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1), + false); if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); - rpcrdma_post_recvs(r_xprt, false); req = rpcr_to_rdmar(rqst); - if (req->rl_reply) { - trace_xprtrdma_leaked_rep(rqst, req->rl_reply); - rpcrdma_recv_buffer_put(req->rl_reply); - } + if (unlikely(req->rl_reply)) + rpcrdma_rep_put(buf, req->rl_reply); req->rl_reply = rep; rep->rr_rqst = rqst; - trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); + trace_xprtrdma_reply(rqst->rq_task, rep, credits); if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) frwr_reminv(rep, &req->rl_registered); @@ -1502,17 +1494,17 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) return; out_badversion: - trace_xprtrdma_reply_vers(rep); + trace_xprtrdma_reply_vers_err(rep); goto out; out_norqst: spin_unlock(&xprt->queue_lock); - trace_xprtrdma_reply_rqst(rep); + trace_xprtrdma_reply_rqst_err(rep); goto out; out_shortreply: - trace_xprtrdma_reply_short(rep); + trace_xprtrdma_reply_short_err(rep); out: - rpcrdma_recv_buffer_put(rep); + rpcrdma_rep_put(buf, rep); } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 97bca509a391..5bc20e9d09cd 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -62,52 +62,47 @@ static unsigned int max_max_requests = 16384; unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; +static unsigned int svcrdma_stat_unused; +static unsigned int zero; -atomic_t rdma_stat_recv; -atomic_t rdma_stat_read; -atomic_t rdma_stat_write; -atomic_t rdma_stat_sq_starve; -atomic_t rdma_stat_rq_starve; -atomic_t rdma_stat_rq_poll; -atomic_t rdma_stat_rq_prod; -atomic_t rdma_stat_sq_poll; -atomic_t rdma_stat_sq_prod; +struct percpu_counter svcrdma_stat_read; +struct percpu_counter svcrdma_stat_recv; +struct percpu_counter svcrdma_stat_sq_starve; +struct percpu_counter svcrdma_stat_write; -/* - * This function implements reading and resetting an atomic_t stat - * variable through read/write to a proc file. Any write to the file - * resets the associated statistic to zero. Any read returns it's - * current value. 
- */ -static int read_reset_stat(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +enum { + SVCRDMA_COUNTER_BUFSIZ = sizeof(unsigned long long), +}; + +static int svcrdma_counter_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { - atomic_t *stat = (atomic_t *)table->data; - - if (!stat) - return -EINVAL; - - if (write) - atomic_set(stat, 0); - else { - char str_buf[32]; - int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); - if (len >= 32) - return -EFAULT; - len = strlen(str_buf); - if (*ppos > len) { - *lenp = 0; - return 0; - } - len -= *ppos; - if (len > *lenp) - len = *lenp; - if (len && copy_to_user(buffer, str_buf, len)) - return -EFAULT; - *lenp = len; - *ppos += len; + struct percpu_counter *stat = (struct percpu_counter *)table->data; + char tmp[SVCRDMA_COUNTER_BUFSIZ + 1]; + int len; + + if (write) { + percpu_counter_set(stat, 0); + return 0; } + + len = snprintf(tmp, SVCRDMA_COUNTER_BUFSIZ, "%lld\n", + percpu_counter_sum_positive(stat)); + if (len >= SVCRDMA_COUNTER_BUFSIZ) + return -EFAULT; + len = strlen(tmp); + if (*ppos > len) { + *lenp = 0; + return 0; + } + len -= *ppos; + if (len > *lenp) + len = *lenp; + if (len) + memcpy(buffer, tmp, len); + *lenp = len; + *ppos += len; + return 0; } @@ -143,66 +138,76 @@ static struct ctl_table svcrdma_parm_table[] = { { .procname = "rdma_stat_read", - .data = &rdma_stat_read, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_read, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_recv", - .data = &rdma_stat_recv, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_recv, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_write", - .data = &rdma_stat_write, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_write, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_sq_starve", - .data = &rdma_stat_sq_starve, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_sq_starve, + .maxlen = SVCRDMA_COUNTER_BUFSIZ, .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = svcrdma_counter_handler, }, { .procname = "rdma_stat_rq_starve", - .data = &rdma_stat_rq_starve, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_rq_poll", - .data = &rdma_stat_rq_poll, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_rq_prod", - .data = &rdma_stat_rq_prod, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { .procname = "rdma_stat_sq_poll", - .data = &rdma_stat_sq_poll, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + 
.extra2 = &zero, }, { .procname = "rdma_stat_sq_prod", - .data = &rdma_stat_sq_prod, - .maxlen = sizeof(atomic_t), + .data = &svcrdma_stat_unused, + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = read_reset_stat, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &zero, }, { }, }; @@ -225,27 +230,69 @@ static struct ctl_table svcrdma_root_table[] = { { }, }; +static void svc_rdma_proc_cleanup(void) +{ + if (!svcrdma_table_header) + return; + unregister_sysctl_table(svcrdma_table_header); + svcrdma_table_header = NULL; + + percpu_counter_destroy(&svcrdma_stat_write); + percpu_counter_destroy(&svcrdma_stat_sq_starve); + percpu_counter_destroy(&svcrdma_stat_recv); + percpu_counter_destroy(&svcrdma_stat_read); +} + +static int svc_rdma_proc_init(void) +{ + int rc; + + if (svcrdma_table_header) + return 0; + + rc = percpu_counter_init(&svcrdma_stat_read, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL); + if (rc) + goto out_err; + rc = percpu_counter_init(&svcrdma_stat_write, 0, GFP_KERNEL); + if (rc) + goto out_err; + + svcrdma_table_header = register_sysctl_table(svcrdma_root_table); + return 0; + +out_err: + percpu_counter_destroy(&svcrdma_stat_sq_starve); + percpu_counter_destroy(&svcrdma_stat_recv); + percpu_counter_destroy(&svcrdma_stat_read); + return rc; +} + void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); - if (svcrdma_table_header) { - unregister_sysctl_table(svcrdma_table_header); - svcrdma_table_header = NULL; - } svc_unreg_xprt_class(&svc_rdma_class); + svc_rdma_proc_cleanup(); } int svc_rdma_init(void) { + int rc; + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); dprintk("\tmax_requests : %u\n", svcrdma_max_requests); dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); - if (!svcrdma_table_header) - svcrdma_table_header = - register_sysctl_table(svcrdma_root_table); + rc = svc_rdma_proc_init(); + if (rc) + return rc; /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 908e78bb87c6..aa2227a7e552 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2015-2018 Oracle. All rights reserved. * - * Support for backward direction RPCs on RPC/RDMA (server-side). + * Support for reverse-direction RPCs on RPC/RDMA (server-side). */ #include <linux/sunrpc/svc_rdma.h> @@ -10,59 +10,34 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - -#undef SVCRDMA_BACKCHANNEL_DEBUG - /** - * svc_rdma_handle_bc_reply - Process incoming backchannel reply - * @xprt: controlling backchannel transport - * @rdma_resp: pointer to incoming transport header - * @rcvbuf: XDR buffer into which to decode the reply + * svc_rdma_handle_bc_reply - Process incoming backchannel Reply + * @rqstp: resources for handling the Reply + * @rctxt: Received message * - * Returns: - * %0 if @rcvbuf is filled in, xprt_complete_rqst called, - * %-EAGAIN if server should call ->recvfrom again. 
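The retired rdma_stat_rq_* and rdma_stat_sq_* entries above are not deleted outright: they all point at one dummy integer clamped by proc_dointvec_minmax to the range [0, 0], so existing scripts still find the sysctl files but only ever read zero, and nonzero writes fail. The idiom, sketched with hypothetical names:

	static unsigned int demo_retired;	/* shared backing store, stays 0 */
	static unsigned int zero;

	static struct ctl_table demo_table[] = {
		{
			.procname	= "retired_stat",
			.data		= &demo_retired,
			.maxlen		= sizeof(unsigned int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_minmax,
			.extra1		= &zero,	/* minimum */
			.extra2		= &zero,	/* maximum: only 0 is accepted */
		},
		{ },
	};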
*/ -int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, - struct xdr_buf *rcvbuf) +void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *rctxt) { + struct svc_xprt *sxprt = rqstp->rq_xprt; + struct rpc_xprt *xprt = sxprt->xpt_bc_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct xdr_buf *rcvbuf = &rqstp->rq_arg; struct kvec *dst, *src = &rcvbuf->head[0]; + __be32 *rdma_resp = rctxt->rc_recv_buf; struct rpc_rqst *req; u32 credits; - size_t len; - __be32 xid; - __be32 *p; - int ret; - - p = (__be32 *)src->iov_base; - len = src->iov_len; - xid = *rdma_resp; - -#ifdef SVCRDMA_BACKCHANNEL_DEBUG - pr_info("%s: xid=%08x, length=%zu\n", - __func__, be32_to_cpu(xid), len); - pr_info("%s: RPC/RDMA: %*ph\n", - __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp); - pr_info("%s: RPC: %*ph\n", - __func__, (int)len, p); -#endif - - ret = -EAGAIN; - if (src->iov_len < 24) - goto out_shortreply; spin_lock(&xprt->queue_lock); - req = xprt_lookup_rqst(xprt, xid); + req = xprt_lookup_rqst(xprt, *rdma_resp); if (!req) - goto out_notfound; + goto out_unlock; dst = &req->rq_private_buf.head[0]; memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); - if (dst->iov_len < len) + if (dst->iov_len < src->iov_len) goto out_unlock; - memcpy(dst->iov_base, p, len); + memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_pin_rqst(req); spin_unlock(&xprt->queue_lock); @@ -71,34 +46,20 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_bc_max_requests) credits = r_xprt->rx_buf.rb_bc_max_requests; - spin_lock(&xprt->transport_lock); xprt->cwnd = credits << RPC_CWNDSHIFT; spin_unlock(&xprt->transport_lock); spin_lock(&xprt->queue_lock); - ret = 0; xprt_complete_rqst(req->rq_task, rcvbuf->len); xprt_unpin_rqst(req); rcvbuf->len = 0; out_unlock: spin_unlock(&xprt->queue_lock); -out: - return ret; - -out_shortreply: - dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", - xprt, src->iov_len); - goto out; - -out_notfound: - dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", - xprt, be32_to_cpu(xid)); - goto out_unlock; } -/* Send a backwards direction RPC call. +/* Send a reverse-direction RPC Call. * * Caller holds the connection's mutex and has already marshaled * the RPC/RDMA request. @@ -113,11 +74,17 @@ out_notfound: */ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst, - struct svc_rdma_send_ctxt *ctxt) + struct svc_rdma_send_ctxt *sctxt) { + struct svc_rdma_recv_ctxt *rctxt; int ret; - ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqst->rq_snd_buf, NULL); + rctxt = svc_rdma_recv_ctxt_get(rdma); + if (!rctxt) + return -EIO; + + ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf); + svc_rdma_recv_ctxt_put(rdma, rctxt); if (ret < 0) return -EIO; @@ -125,8 +92,14 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, * the rq_buffer before all retransmits are complete. 
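The reworked svc_rdma_handle_bc_reply() above follows the usual reverse-direction completion recipe: match the XID under queue_lock, copy the Reply into the waiting rqst's private buffer, then complete the task. Condensed to its skeleton as a sketch only (credit handling and rqst pinning are omitted):

	static void demo_complete_bc_reply(struct rpc_xprt *xprt,
					   struct xdr_buf *rcvbuf)
	{
		struct kvec *src = &rcvbuf->head[0];
		struct rpc_rqst *req;

		spin_lock(&xprt->queue_lock);
		/* The XID is the first word of the transport header. */
		req = xprt_lookup_rqst(xprt, *(__be32 *)src->iov_base);
		if (!req)
			goto out_unlock;

		memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
		       sizeof(struct xdr_buf));
		if (req->rq_private_buf.head[0].iov_len < src->iov_len)
			goto out_unlock;	/* Reply too large to land */
		memcpy(req->rq_private_buf.head[0].iov_base, src->iov_base,
		       src->iov_len);

		xprt_complete_rqst(req->rq_task, rcvbuf->len);
	out_unlock:
		spin_unlock(&xprt->queue_lock);
	}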
*/ get_page(virt_to_page(rqst->rq_buffer)); - ctxt->sc_send_wr.opcode = IB_WR_SEND; - return svc_rdma_send(rdma, &ctxt->sc_send_wr); + sctxt->sc_send_wr.opcode = IB_WR_SEND; + ret = svc_rdma_send(rdma, sctxt); + if (ret < 0) + return ret; + + ret = wait_for_completion_killable(&sctxt->sc_done); + svc_rdma_send_ctxt_put(rdma, sctxt); + return ret; } /* Server-side transport endpoint wants a whole page for its send @@ -146,12 +119,12 @@ xprt_rdma_bc_allocate(struct rpc_task *task) return -EINVAL; } - page = alloc_page(RPCRDMA_DEF_GFP); + page = alloc_page(GFP_NOIO | __GFP_NOWARN); if (!page) return -ENOMEM; rqst->rq_buffer = page_address(page); - rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP); + rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, GFP_NOIO | __GFP_NOWARN); if (!rqst->rq_rbuffer) { put_page(page); return -ENOMEM; @@ -181,7 +154,9 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) if (!ctxt) goto drop_connection; - p = ctxt->sc_xprt_buf; + p = xdr_reserve_space(&ctxt->sc_stream, RPCRDMA_HDRLEN_MIN); + if (!p) + goto put_ctxt; *p++ = rqst->rq_xid; *p++ = rpcrdma_version; *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); @@ -189,67 +164,55 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) *p++ = xdr_zero; *p++ = xdr_zero; *p = xdr_zero; - svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_MIN); - -#ifdef SVCRDMA_BACKCHANNEL_DEBUG - pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); -#endif rqst->rq_xtime = ktime_get(); rc = svc_rdma_bc_sendto(rdma, rqst, ctxt); - if (rc) { - svc_rdma_send_ctxt_put(rdma, ctxt); - goto drop_connection; - } + if (rc) + goto put_ctxt; return 0; +put_ctxt: + svc_rdma_send_ctxt_put(rdma, ctxt); + drop_connection: - dprintk("svcrdma: failed to send bc call\n"); return -ENOTCONN; } -/* Send an RPC call on the passive end of a transport - * connection. 
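One change above is easy to miss: backchannel sends are now synchronous. The sender posts the Send WR and then parks on a completion embedded in the send context (sc_done, assumed to be signaled by the Send completion handler), so the ctxt can be recycled deterministically. The pattern in isolation, as a sketch:

	static int demo_send_and_wait(struct svcxprt_rdma *rdma,
				      struct svc_rdma_send_ctxt *sctxt)
	{
		int ret;

		sctxt->sc_send_wr.opcode = IB_WR_SEND;
		ret = svc_rdma_send(rdma, sctxt);
		if (ret < 0)
			return ret;

		/* Sleep until the HCA reports the Send has completed. */
		ret = wait_for_completion_killable(&sctxt->sc_done);
		svc_rdma_send_ctxt_put(rdma, sctxt);
		return ret;
	}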
+/** + * xprt_rdma_bc_send_request - Send a reverse-direction Call + * @rqst: rpc_rqst containing Call message to be sent + * + * Return values: + * %0 if the message was sent successfully + * %ENOTCONN if the message was not sent */ -static int -xprt_rdma_bc_send_request(struct rpc_rqst *rqst) +static int xprt_rdma_bc_send_request(struct rpc_rqst *rqst) { struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; - struct svcxprt_rdma *rdma; + struct svcxprt_rdma *rdma = + container_of(sxprt, struct svcxprt_rdma, sc_xprt); int ret; - dprintk("svcrdma: sending bc call with xid: %08x\n", - be32_to_cpu(rqst->rq_xid)); - - mutex_lock(&sxprt->xpt_mutex); - - ret = -ENOTCONN; - rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) { - ret = rpcrdma_bc_send_request(rdma, rqst); - if (ret == -ENOTCONN) - svc_close_xprt(sxprt); - } - - mutex_unlock(&sxprt->xpt_mutex); + if (test_bit(XPT_DEAD, &sxprt->xpt_flags)) + return -ENOTCONN; - if (ret < 0) - return ret; - return 0; + ret = rpcrdma_bc_send_request(rdma, rqst); + if (ret == -ENOTCONN) + svc_xprt_close(sxprt); + return ret; } static void xprt_rdma_bc_close(struct rpc_xprt *xprt) { - dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + xprt_disconnect_done(xprt); xprt->cwnd = RPC_CWNDSHIFT; } static void xprt_rdma_bc_put(struct rpc_xprt *xprt) { - dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); - + xprt_rdma_free_addresses(xprt); xprt_free(xprt); } @@ -283,26 +246,21 @@ xprt_setup_rdma_bc(struct xprt_create *args) struct rpc_xprt *xprt; struct rpcrdma_xprt *new_xprt; - if (args->addrlen > sizeof(xprt->addr)) { - dprintk("RPC: %s: address too large\n", __func__); + if (args->addrlen > sizeof(xprt->addr)) return ERR_PTR(-EBADF); - } xprt = xprt_alloc(args->net, sizeof(*new_xprt), RPCRDMA_MAX_BC_REQUESTS, RPCRDMA_MAX_BC_REQUESTS); - if (!xprt) { - dprintk("RPC: %s: couldn't allocate rpc_xprt\n", - __func__); + if (!xprt) return ERR_PTR(-ENOMEM); - } xprt->timeout = &xprt_rdma_bc_timeout; xprt_set_bound(xprt); xprt_set_connected(xprt); - xprt->bind_timeout = RPCRDMA_BIND_TO; - xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; - xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + xprt->bind_timeout = 0; + xprt->reestablish_timeout = 0; + xprt->idle_timeout = 0; xprt->prot = XPRT_TRANSPORT_BC_RDMA; xprt->ops = &xprt_rdma_bc_procs; diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c new file mode 100644 index 000000000000..b63cfeaa2923 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Oracle. All rights reserved. 
+ */ + +#include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/rpc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +/** + * pcl_free - Release all memory associated with a parsed chunk list + * @pcl: parsed chunk list + * + */ +void pcl_free(struct svc_rdma_pcl *pcl) +{ + while (!list_empty(&pcl->cl_chunks)) { + struct svc_rdma_chunk *chunk; + + chunk = pcl_first_chunk(pcl); + list_del(&chunk->ch_list); + kfree(chunk); + } +} + +static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position) +{ + struct svc_rdma_chunk *chunk; + + chunk = kmalloc(struct_size(chunk, ch_segments, segcount), GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->ch_position = position; + chunk->ch_length = 0; + chunk->ch_payload_length = 0; + chunk->ch_segcount = 0; + return chunk; +} + +static struct svc_rdma_chunk * +pcl_lookup_position(struct svc_rdma_pcl *pcl, u32 position) +{ + struct svc_rdma_chunk *pos; + + pcl_for_each_chunk(pos, pcl) { + if (pos->ch_position == position) + return pos; + } + return NULL; +} + +static void pcl_insert_position(struct svc_rdma_pcl *pcl, + struct svc_rdma_chunk *chunk) +{ + struct svc_rdma_chunk *pos; + + pcl_for_each_chunk(pos, pcl) { + if (pos->ch_position > chunk->ch_position) + break; + } + __list_add(&chunk->ch_list, pos->ch_list.prev, &pos->ch_list); + pcl->cl_count++; +} + +static void pcl_set_read_segment(const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_chunk *chunk, + u32 handle, u32 length, u64 offset) +{ + struct svc_rdma_segment *segment; + + segment = &chunk->ch_segments[chunk->ch_segcount]; + segment->rs_handle = handle; + segment->rs_length = length; + segment->rs_offset = offset; + + trace_svcrdma_decode_rseg(&rctxt->rc_cid, chunk, segment); + + chunk->ch_length += length; + chunk->ch_segcount++; +} + +/** + * pcl_alloc_call - Construct a parsed chunk list for the Call body + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Assumptions: + * - The incoming Read list has already been sanity checked. + * - cl_count is already set to the number of segments in + * the un-decoded list. + * - The list might not be in order by position. + * + * Return values: + * %true: Parsed chunk list was successfully constructed, and + * cl_count is updated to be the number of chunks (ie. + * unique positions) in the Read list. + * %false: Memory allocation failed. + */ +bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + struct svc_rdma_pcl *pcl = &rctxt->rc_call_pcl; + unsigned int i, segcount = pcl->cl_count; + + pcl->cl_count = 0; + for (i = 0; i < segcount; i++) { + struct svc_rdma_chunk *chunk; + u32 position, handle, length; + u64 offset; + + p++; /* skip the list discriminator */ + p = xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position != 0) + continue; + + if (pcl_is_empty(pcl)) { + chunk = pcl_alloc_chunk(segcount, position); + if (!chunk) + return false; + pcl_insert_position(pcl, chunk); + } else { + chunk = list_first_entry(&pcl->cl_chunks, + struct svc_rdma_chunk, + ch_list); + } + + pcl_set_read_segment(rctxt, chunk, handle, length, offset); + } + + return true; +} + +/** + * pcl_alloc_read - Construct a parsed chunk list for normal Read chunks + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Assumptions: + * - The incoming Read list has already been sanity checked. + * - cl_count is already set to the number of segments in + * the un-decoded list. + * - The list might not be in order by position. 
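Because pcl_insert_position() above keeps cl_chunks sorted by ch_position, consumers can rely on visiting chunks in ascending XDR-offset order. A hypothetical consumer using the iteration helper this file introduces:

	/* Total up the byte count registered across a parsed chunk list. */
	static unsigned int demo_pcl_total_length(struct svc_rdma_pcl *pcl)
	{
		struct svc_rdma_chunk *chunk;
		unsigned int total = 0;

		pcl_for_each_chunk(chunk, pcl)
			total += chunk->ch_length;	/* position order */
		return total;
	}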
+ * + * Return values: + * %true: Parsed chunk list was successfully constructed, and + * cl_count is updated to be the number of chunks (ie. + * unique position values) in the Read list. + * %false: Memory allocation failed. + * + * TODO: + * - Check for chunk range overlaps + */ +bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + struct svc_rdma_pcl *pcl = &rctxt->rc_read_pcl; + unsigned int i, segcount = pcl->cl_count; + + pcl->cl_count = 0; + for (i = 0; i < segcount; i++) { + struct svc_rdma_chunk *chunk; + u32 position, handle, length; + u64 offset; + + p++; /* skip the list discriminator */ + p = xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position == 0) + continue; + + chunk = pcl_lookup_position(pcl, position); + if (!chunk) { + chunk = pcl_alloc_chunk(segcount, position); + if (!chunk) + return false; + pcl_insert_position(pcl, chunk); + } + + pcl_set_read_segment(rctxt, chunk, handle, length, offset); + } + + return true; +} + +/** + * pcl_alloc_write - Construct a parsed chunk list from a Write list + * @rctxt: Ingress receive context + * @pcl: Parsed chunk list to populate + * @p: Start of an un-decoded Write list + * + * Assumptions: + * - The incoming Write list has already been sanity checked, and + * - cl_count is set to the number of chunks in the un-decoded list. + * + * Return values: + * %true: Parsed chunk list was successfully constructed. + * %false: Memory allocation failed. + */ +bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_pcl *pcl, __be32 *p) +{ + struct svc_rdma_segment *segment; + struct svc_rdma_chunk *chunk; + unsigned int i, j; + u32 segcount; + + for (i = 0; i < pcl->cl_count; i++) { + p++; /* skip the list discriminator */ + segcount = be32_to_cpup(p++); + + chunk = pcl_alloc_chunk(segcount, 0); + if (!chunk) + return false; + list_add_tail(&chunk->ch_list, &pcl->cl_chunks); + + for (j = 0; j < segcount; j++) { + segment = &chunk->ch_segments[j]; + p = xdr_decode_rdma_segment(p, &segment->rs_handle, + &segment->rs_length, + &segment->rs_offset); + trace_svcrdma_decode_wseg(&rctxt->rc_cid, chunk, j); + + chunk->ch_length += segment->rs_length; + chunk->ch_segcount++; + } + } + return true; +} + +static int pcl_process_region(const struct xdr_buf *xdr, + unsigned int offset, unsigned int length, + int (*actor)(const struct xdr_buf *, void *), + void *data) +{ + struct xdr_buf subbuf; + + if (!length) + return 0; + if (xdr_buf_subsegment(xdr, &subbuf, offset, length)) + return -EMSGSIZE; + return actor(&subbuf, data); +} + +/** + * pcl_process_nonpayloads - Process non-payload regions inside @xdr + * @pcl: Chunk list to process + * @xdr: xdr_buf to process + * @actor: Function to invoke on each non-payload region + * @data: Arguments for @actor + * + * This mechanism must ignore not only result payloads that were already + * sent via RDMA Write, but also XDR padding for those payloads that + * the upper layer has added. + * + * Assumptions: + * The xdr->len and ch_position fields are aligned to 4-byte multiples. 
+ * + * Returns: + * On success, zero, + * %-EMSGSIZE on XDR buffer overflow, or + * The return value of @actor + */ +int pcl_process_nonpayloads(const struct svc_rdma_pcl *pcl, + const struct xdr_buf *xdr, + int (*actor)(const struct xdr_buf *, void *), + void *data) +{ + struct svc_rdma_chunk *chunk, *next; + unsigned int start; + int ret; + + chunk = pcl_first_chunk(pcl); + + /* No result payloads were generated */ + if (!chunk || !chunk->ch_payload_length) + return actor(xdr, data); + + /* Process the region before the first result payload */ + ret = pcl_process_region(xdr, 0, chunk->ch_position, actor, data); + if (ret < 0) + return ret; + + /* Process the regions between each middle result payload */ + while ((next = pcl_next_chunk(pcl, chunk))) { + if (!next->ch_payload_length) + break; + + start = pcl_chunk_end_offset(chunk); + ret = pcl_process_region(xdr, start, next->ch_position - start, + actor, data); + if (ret < 0) + return ret; + + chunk = next; + } + + /* Process the region after the last result payload */ + start = pcl_chunk_end_offset(chunk); + ret = pcl_process_region(xdr, start, xdr->len - start, actor, data); + if (ret < 0) + return ret; + + return 0; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 96bccd398469..5242ad121450 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -89,10 +89,10 @@ * svc_rdma_recvfrom call returns. * * During the second svc_rdma_recvfrom call, RDMA Read sink pages - * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst - * (see rdma_read_complete() below). + * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst. */ +#include <linux/slab.h> #include <linux/spinlock.h> #include <asm/unaligned.h> #include <rdma/ib_verbs.h> @@ -106,8 +106,6 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc); static inline struct svc_rdma_recv_ctxt * @@ -117,6 +115,13 @@ svc_rdma_next_recv_ctxt(struct list_head *list) rc_list); } +static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_rq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { @@ -135,6 +140,12 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) goto fail2; + svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid); + pcl_init(&ctxt->rc_call_pcl); + pcl_init(&ctxt->rc_read_pcl); + pcl_init(&ctxt->rc_write_pcl); + pcl_init(&ctxt->rc_reply_pcl); + ctxt->rc_recv_wr.next = NULL; ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge; @@ -180,8 +191,13 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) } } -static struct svc_rdma_recv_ctxt * -svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) +/** + * svc_rdma_recv_ctxt_get - Allocate a recv_ctxt + * @rdma: controlling svcxprt_rdma + * + * Returns a recv_ctxt or (rarely) NULL if none are available. 
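To see how pcl_process_nonpayloads() above is meant to be driven: callers hand it an actor that receives each non-payload sub-buffer in turn. A toy actor that merely counts bytes, with hypothetical demo_ names:

	static int demo_count_bytes(const struct xdr_buf *subbuf, void *data)
	{
		*(unsigned int *)data += subbuf->len;
		return 0;
	}

	/* How many Reply bytes would go inline rather than via RDMA Write? */
	static int demo_nonpayload_bytes(const struct svc_rdma_pcl *write_pcl,
					 const struct xdr_buf *xdr,
					 unsigned int *total)
	{
		*total = 0;
		return pcl_process_nonpayloads(write_pcl, xdr,
					       demo_count_bytes, total);
	}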
+ */ +struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; struct llist_node *node; @@ -211,10 +227,10 @@ out_empty: void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { - unsigned int i; - - for (i = 0; i < ctxt->rc_page_count; i++) - put_page(ctxt->rc_pages[i]); + pcl_free(&ctxt->rc_call_pcl); + pcl_free(&ctxt->rc_read_pcl); + pcl_free(&ctxt->rc_write_pcl); + pcl_free(&ctxt->rc_reply_pcl); if (!ctxt->rc_temp) llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); @@ -222,32 +238,68 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, svc_rdma_recv_ctxt_destroy(rdma, ctxt); } -static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, - struct svc_rdma_recv_ctxt *ctxt) +/** + * svc_rdma_release_rqst - Release transport-specific per-rqst resources + * @rqstp: svc_rqst being released + * + * Ensure that the recv_ctxt is released whether or not a Reply + * was sent. For example, the client could close the connection, + * or svc_process could drop an RPC, before the Reply is sent. + */ +void svc_rdma_release_rqst(struct svc_rqst *rqstp) { - int ret; - - svc_xprt_get(&rdma->sc_xprt); - ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); - trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret); - if (ret) - goto err_post; - return 0; + struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt; + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); -err_post: - svc_rdma_recv_ctxt_put(rdma, ctxt); - svc_xprt_put(&rdma->sc_xprt); - return ret; + rqstp->rq_xprt_ctxt = NULL; + if (ctxt) + svc_rdma_recv_ctxt_put(rdma, ctxt); } -static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) +static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, + unsigned int wanted, bool temp) { + const struct ib_recv_wr *bad_wr = NULL; struct svc_rdma_recv_ctxt *ctxt; + struct ib_recv_wr *recv_chain; + int ret; - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - return -ENOMEM; - return __svc_rdma_post_recv(rdma, ctxt); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + return false; + + recv_chain = NULL; + while (wanted--) { + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + break; + + trace_svcrdma_post_recv(ctxt); + ctxt->rc_temp = temp; + ctxt->rc_recv_wr.next = recv_chain; + recv_chain = &ctxt->rc_recv_wr; + rdma->sc_pending_recvs++; + } + if (!recv_chain) + return false; + + ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr); + if (ret) + goto err_free; + return true; + +err_free: + trace_svcrdma_rq_post_err(rdma, ret); + while (bad_wr) { + ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt, + rc_recv_wr); + bad_wr = bad_wr->next; + svc_rdma_recv_ctxt_put(rdma, ctxt); + } + /* Since we're destroying the xprt, no need to reset + * sc_pending_recvs. 
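The receive-side change above deserves a second look: svc_rdma_refresh_recvs() links each new Receive WR onto a chain through wr.next and hands the whole chain to the HCA with one ib_post_recv() call, instead of one verb call per buffer. The chaining idiom reduced to essentials (the bad_wr error unwinding shown above is omitted in this sketch):

	static bool demo_post_recv_chain(struct svcxprt_rdma *rdma,
					 unsigned int wanted)
	{
		const struct ib_recv_wr *bad_wr = NULL;
		struct ib_recv_wr *chain = NULL;
		struct svc_rdma_recv_ctxt *ctxt;

		while (wanted--) {
			ctxt = svc_rdma_recv_ctxt_get(rdma);
			if (!ctxt)
				break;
			ctxt->rc_recv_wr.next = chain;	/* push onto chain */
			chain = &ctxt->rc_recv_wr;
		}
		if (!chain)
			return false;
		return ib_post_recv(rdma->sc_qp, chain, &bad_wr) == 0;
	}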
*/ + return false; } /** @@ -258,20 +310,7 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { - struct svc_rdma_recv_ctxt *ctxt; - unsigned int i; - int ret; - - for (i = 0; i < rdma->sc_max_requests; i++) { - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - return false; - ctxt->rc_temp = true; - ret = __svc_rdma_post_recv(rdma, ctxt); - if (ret) - return false; - } - return true; + return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true); } /** @@ -279,8 +318,6 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) * @cq: Completion Queue context * @wc: Work Completion object * - * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that - * the Receive completion handler could be running. */ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { @@ -288,22 +325,30 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_recv_ctxt *ctxt; - trace_svcrdma_wc_receive(wc); + rdma->sc_pending_recvs--; /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); if (wc->status != IB_WC_SUCCESS) goto flushed; - - if (svc_rdma_post_recv(rdma)) - goto post_err; + trace_svcrdma_wc_recv(wc, &ctxt->rc_cid); + + /* If receive posting fails, the connection is about to be + * lost anyway. The server will not be able to send a reply + * for this RPC, and the client will retransmit this RPC + * anyway when it reconnects. + * + * Therefore we drop the Receive, even if status was SUCCESS + * to reduce the likelihood of replayed requests once the + * client reconnects. + */ + if (rdma->sc_pending_recvs < rdma->sc_max_requests) + if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false)) + goto dropped; /* All wc fields are now known to be valid */ ctxt->rc_byte_len = wc->byte_len; - ib_dma_sync_single_for_cpu(rdma->sc_pd->device, - ctxt->rc_recv_sge.addr, - wc->byte_len, DMA_FROM_DEVICE); spin_lock(&rdma->sc_rq_dto_lock); list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q); @@ -312,15 +357,16 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) spin_unlock(&rdma->sc_rq_dto_lock); if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags)) svc_xprt_enqueue(&rdma->sc_xprt); - goto out; + return; flushed: -post_err: + if (wc->status == IB_WC_WR_FLUSH_ERR) + trace_svcrdma_wc_recv_flush(wc, &ctxt->rc_cid); + else + trace_svcrdma_wc_recv_err(wc, &ctxt->rc_cid); +dropped: svc_rdma_recv_ctxt_put(rdma, ctxt); - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_xprt_enqueue(&rdma->sc_xprt); -out: - svc_xprt_put(&rdma->sc_xprt); + svc_xprt_deferred_close(&rdma->sc_xprt); } /** @@ -332,10 +378,6 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; - while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) { - list_del(&ctxt->rc_list); - svc_rdma_recv_ctxt_put(rdma, ctxt); - } while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) { list_del(&ctxt->rc_list); svc_rdma_recv_ctxt_put(rdma, ctxt); @@ -357,122 +399,183 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, arg->len = ctxt->rc_byte_len; } -/* This accommodates the largest possible Write chunk, - * in one segment. +/** + * xdr_count_read_segments - Count number of Read segments in Read list + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Before allocating anything, ensure the ingress Read list is safe + * to use. 
+ * + * The segment count is limited to how many segments can fit in the + * transport header without overflowing the buffer. That's about 40 + * Read segments for a 1KB inline threshold. + * + * Return values: + * %true: Read list is valid. @rctxt's xdr_stream is updated to point + * to the first byte past the Read list. rc_read_pcl and + * rc_call_pcl cl_count fields are set to the number of + * Read segments in the list. + * %false: Read list is corrupt. @rctxt's xdr_stream is left in an + * unknown state. */ -#define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) +static bool xdr_count_read_segments(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + rctxt->rc_call_pcl.cl_count = 0; + rctxt->rc_read_pcl.cl_count = 0; + while (xdr_item_is_present(p)) { + u32 position, handle, length; + u64 offset; + + p = xdr_inline_decode(&rctxt->rc_stream, + rpcrdma_readseg_maxsz * sizeof(*p)); + if (!p) + return false; -/* This accommodates the largest possible Position-Zero - * Read chunk or Reply chunk, in one segment. - */ -#define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) + xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position) { + if (position & 3) + return false; + ++rctxt->rc_read_pcl.cl_count; + } else { + ++rctxt->rc_call_pcl.cl_count; + } + + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + } + return true; +} /* Sanity check the Read list. * - * Implementation limits: - * - This implementation supports only one Read chunk. - * * Sanity checks: - * - Read list does not overflow buffer. - * - Segment size limited by largest NFS data payload. - * - * The segment count is limited to how many segments can - * fit in the transport header without overflowing the - * buffer. That's about 40 Read segments for a 1KB inline - * threshold. - * - * Returns pointer to the following Write list. + * - Read list does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Read list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Read list. + * %false: Read list is corrupt. @rctxt's xdr_stream is left + * in an unknown state. */ -static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) +static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 position; - bool first; - - first = true; - while (*p++ != xdr_zero) { - if (first) { - position = be32_to_cpup(p++); - first = false; - } else if (be32_to_cpup(p++) != position) { - return NULL; - } - p++; /* handle */ - if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG) - return NULL; - p += 2; /* offset */ + __be32 *p; - if (p > end) - return NULL; - } - return p; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + if (!xdr_count_read_segments(rctxt, p)) + return false; + if (!pcl_alloc_call(rctxt, p)) + return false; + return pcl_alloc_read(rctxt, p); } -/* The segment count is limited to how many segments can - * fit in the transport header without overflowing the - * buffer. That's about 60 Write segments for a 1KB inline - * threshold. 
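The parsing rewrite above trades raw pointer arithmetic for an xdr_stream: every xdr_inline_decode() is bounds-checked against the Receive buffer, so a corrupt segment count yields NULL rather than a read past the end. The generic shape of walking an XDR optional list this way, as a simplified sketch:

	static bool demo_walk_read_list(struct xdr_stream *xdr,
					unsigned int *count)
	{
		__be32 *p;

		*count = 0;
		p = xdr_inline_decode(xdr, sizeof(*p));	/* first discriminator */
		if (!p)
			return false;
		while (xdr_item_is_present(p)) {
			/* position + handle + length + offset, bounds-checked */
			p = xdr_inline_decode(xdr,
					      rpcrdma_readseg_maxsz * sizeof(*p));
			if (!p)
				return false;
			++*count;
			p = xdr_inline_decode(xdr, sizeof(*p));	/* next item */
			if (!p)
				return false;
		}
		return true;
	}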
- */ -static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end, - u32 maxlen) +static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt) { - u32 i, segcount; + u32 segcount; + __be32 *p; - segcount = be32_to_cpup(p++); - for (i = 0; i < segcount; i++) { - p++; /* handle */ - if (be32_to_cpup(p++) > maxlen) - return NULL; - p += 2; /* offset */ + if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount)) + return false; - if (p > end) - return NULL; - } + /* A bogus segcount causes this buffer overflow check to fail. */ + p = xdr_inline_decode(&rctxt->rc_stream, + segcount * rpcrdma_segment_maxsz * sizeof(*p)); + return p != NULL; +} - return p; +/** + * xdr_count_write_chunks - Count number of Write chunks in Write list + * @rctxt: Received header and decoding state + * @p: start of an un-decoded Write list + * + * Before allocating anything, ensure the ingress Write list is + * safe to use. + * + * Return values: + * %true: Write list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Write list, and + * the number of Write chunks is in rc_write_pcl.cl_count. + * %false: Write list is corrupt. @rctxt's xdr_stream is left + * in an indeterminate state. + */ +static bool xdr_count_write_chunks(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + rctxt->rc_write_pcl.cl_count = 0; + while (xdr_item_is_present(p)) { + if (!xdr_check_write_chunk(rctxt)) + return false; + ++rctxt->rc_write_pcl.cl_count; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + } + return true; } /* Sanity check the Write list. * * Implementation limits: - * - This implementation supports only one Write chunk. + * - This implementation currently supports only one Write chunk. * * Sanity checks: - * - Write list does not overflow buffer. - * - Segment size limited by largest NFS data payload. - * - * Returns pointer to the following Reply chunk. + * - Write list does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Write list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Write list. + * %false: Write list is corrupt. @rctxt's xdr_stream is left + * in an unknown state. */ -static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end) +static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 chcount; + __be32 *p; - chcount = 0; - while (*p++ != xdr_zero) { - p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG); - if (!p) - return NULL; - if (chcount++ > 1) - return NULL; - } - return p; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + if (!xdr_count_write_chunks(rctxt, p)) + return false; + if (!pcl_alloc_write(rctxt, &rctxt->rc_write_pcl, p)) + return false; + + rctxt->rc_cur_result_payload = pcl_first_chunk(&rctxt->rc_write_pcl); + return true; } /* Sanity check the Reply chunk. * * Sanity checks: - * - Reply chunk does not overflow buffer. - * - Segment size limited by largest NFS data payload. - * - * Returns pointer to the following RPC header. + * - Reply chunk does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Reply chunk is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Reply chunk. + * %false: Reply chunk is corrupt. @rctxt's xdr_stream is left + * in an unknown state. 
*/ -static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) +static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt) { - if (*p++ != xdr_zero) { - p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG); - if (!p) - return NULL; - } - return p; + __be32 *p; + + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + + if (!xdr_item_is_present(p)) + return true; + if (!xdr_check_write_chunk(rctxt)) + return false; + + rctxt->rc_reply_pcl.cl_count = 1; + return pcl_alloc_write(rctxt, &rctxt->rc_reply_pcl, p); } /* RPC-over-RDMA Version One private extension: Remote Invalidation. @@ -481,206 +584,147 @@ static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) * * If there is exactly one distinct R_key in the received transport * header, set rc_inv_rkey to that R_key. Otherwise, set it to zero. - * - * Perform this operation while the received transport header is - * still in the CPU cache. */ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { - __be32 inv_rkey, *p; - u32 i, segcount; + struct svc_rdma_segment *segment; + struct svc_rdma_chunk *chunk; + u32 inv_rkey; ctxt->rc_inv_rkey = 0; if (!rdma->sc_snd_w_inv) return; - inv_rkey = xdr_zero; - p = ctxt->rc_recv_buf; - p += rpcrdma_fixed_maxsz; - - /* Read list */ - while (*p++ != xdr_zero) { - p++; /* position */ - if (inv_rkey == xdr_zero) - inv_rkey = *p; - else if (inv_rkey != *p) - return; - p += 4; + inv_rkey = 0; + pcl_for_each_chunk(chunk, &ctxt->rc_call_pcl) { + pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) + return; + } } - - /* Write list */ - while (*p++ != xdr_zero) { - segcount = be32_to_cpup(p++); - for (i = 0; i < segcount; i++) { - if (inv_rkey == xdr_zero) - inv_rkey = *p; - else if (inv_rkey != *p) + pcl_for_each_chunk(chunk, &ctxt->rc_read_pcl) { + pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) return; - p += 4; } } - - /* Reply chunk */ - if (*p++ != xdr_zero) { - segcount = be32_to_cpup(p++); - for (i = 0; i < segcount; i++) { - if (inv_rkey == xdr_zero) - inv_rkey = *p; - else if (inv_rkey != *p) + pcl_for_each_chunk(chunk, &ctxt->rc_write_pcl) { + pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) return; - p += 4; } } - - ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey); + pcl_for_each_chunk(chunk, &ctxt->rc_reply_pcl) { + pcl_for_each_segment(segment, chunk) { + if (inv_rkey == 0) + inv_rkey = segment->rs_handle; + else if (inv_rkey != segment->rs_handle) + return; + } + } + ctxt->rc_inv_rkey = inv_rkey; } -/* On entry, xdr->head[0].iov_base points to first byte in the - * RPC-over-RDMA header. +/** + * svc_rdma_xdr_decode_req - Decode the transport header + * @rq_arg: xdr_buf containing ingress RPC/RDMA message + * @rctxt: state of decoding + * + * On entry, xdr->head[0].iov_base points to first byte of the + * RPC-over-RDMA transport header. * * On successful exit, head[0] points to first byte past the * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. + * * The length of the RPC-over-RDMA header is returned. * * Assumptions: * - The transport header is entirely contained in the head iovec. 
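One observation on svc_rdma_get_inv_rkey() above: the same two-level loop is repeated verbatim for all four chunk lists. It could be factored through a helper along these lines (purely a sketch; the demo_ name is hypothetical), with the caller setting rc_inv_rkey only when all four scans return true:

	/* Returns false when a second, different R_key is encountered. */
	static bool demo_scan_pcl_rkey(struct svc_rdma_pcl *pcl, u32 *rkey)
	{
		struct svc_rdma_segment *segment;
		struct svc_rdma_chunk *chunk;

		pcl_for_each_chunk(chunk, pcl) {
			pcl_for_each_segment(segment, chunk) {
				if (*rkey == 0)
					*rkey = segment->rs_handle;
				else if (*rkey != segment->rs_handle)
					return false;
			}
		}
		return true;
	}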
*/ -static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) +static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg, + struct svc_rdma_recv_ctxt *rctxt) { - __be32 *p, *end, *rdma_argp; + __be32 *p, *rdma_argp; unsigned int hdr_len; - /* Verify that there's enough bytes for header + something */ - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) - goto out_short; - rdma_argp = rq_arg->head[0].iov_base; - if (*(rdma_argp + 1) != rpcrdma_version) - goto out_version; + xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp, NULL); - switch (*(rdma_argp + 3)) { + p = xdr_inline_decode(&rctxt->rc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); + if (unlikely(!p)) + goto out_short; + p++; + if (*p != rpcrdma_version) + goto out_version; + p += 2; + rctxt->rc_msgtype = *p; + switch (rctxt->rc_msgtype) { case rdma_msg: break; case rdma_nomsg: break; - case rdma_done: goto out_drop; - case rdma_error: goto out_drop; - default: goto out_proc; } - end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); - p = xdr_check_read_list(rdma_argp + 4, end); - if (!p) + if (!xdr_check_read_list(rctxt)) goto out_inval; - p = xdr_check_write_list(p, end); - if (!p) - goto out_inval; - p = xdr_check_reply_chunk(p, end); - if (!p) + if (!xdr_check_write_list(rctxt)) goto out_inval; - if (p > end) + if (!xdr_check_reply_chunk(rctxt)) goto out_inval; - rq_arg->head[0].iov_base = p; - hdr_len = (unsigned long)p - (unsigned long)rdma_argp; + rq_arg->head[0].iov_base = rctxt->rc_stream.p; + hdr_len = xdr_stream_pos(&rctxt->rc_stream); rq_arg->head[0].iov_len -= hdr_len; rq_arg->len -= hdr_len; - trace_svcrdma_decode_rqst(rdma_argp, hdr_len); + trace_svcrdma_decode_rqst(rctxt, rdma_argp, hdr_len); return hdr_len; out_short: - trace_svcrdma_decode_short(rq_arg->len); + trace_svcrdma_decode_short_err(rctxt, rq_arg->len); return -EINVAL; out_version: - trace_svcrdma_decode_badvers(rdma_argp); + trace_svcrdma_decode_badvers_err(rctxt, rdma_argp); return -EPROTONOSUPPORT; out_drop: - trace_svcrdma_decode_drop(rdma_argp); + trace_svcrdma_decode_drop_err(rctxt, rdma_argp); return 0; out_proc: - trace_svcrdma_decode_badproc(rdma_argp); + trace_svcrdma_decode_badproc_err(rctxt, rdma_argp); return -EINVAL; out_inval: - trace_svcrdma_decode_parse(rdma_argp); + trace_svcrdma_decode_parse_err(rctxt, rdma_argp); return -EINVAL; } -static void rdma_read_complete(struct svc_rqst *rqstp, - struct svc_rdma_recv_ctxt *head) -{ - int page_no; - - /* Move Read chunk pages to rqstp so that they will be released - * when svc_process is done with them. - */ - for (page_no = 0; page_no < head->rc_page_count; page_no++) { - put_page(rqstp->rq_pages[page_no]); - rqstp->rq_pages[page_no] = head->rc_pages[page_no]; - } - head->rc_page_count = 0; - - /* Point rq_arg.pages past header */ - rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count]; - rqstp->rq_arg.page_len = head->rc_arg.page_len; - - /* rq_respages starts after the last arg page */ - rqstp->rq_respages = &rqstp->rq_pages[page_no]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* Rebuild rq_arg head and tail. 
*/ - rqstp->rq_arg.head[0] = head->rc_arg.head[0]; - rqstp->rq_arg.tail[0] = head->rc_arg.tail[0]; - rqstp->rq_arg.len = head->rc_arg.len; - rqstp->rq_arg.buflen = head->rc_arg.buflen; -} - -static void svc_rdma_send_error(struct svcxprt_rdma *xprt, - __be32 *rdma_argp, int status) +static void svc_rdma_send_error(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *rctxt, + int status) { - struct svc_rdma_send_ctxt *ctxt; - unsigned int length; - __be32 *p; - int ret; + struct svc_rdma_send_ctxt *sctxt; - ctxt = svc_rdma_send_ctxt_get(xprt); - if (!ctxt) + sctxt = svc_rdma_send_ctxt_get(rdma); + if (!sctxt) return; - - p = ctxt->sc_xprt_buf; - *p++ = *rdma_argp; - *p++ = *(rdma_argp + 1); - *p++ = xprt->sc_fc_credits; - *p++ = rdma_error; - switch (status) { - case -EPROTONOSUPPORT: - *p++ = err_vers; - *p++ = rpcrdma_version; - *p++ = rpcrdma_version; - trace_svcrdma_err_vers(*rdma_argp); - break; - default: - *p++ = err_chunk; - trace_svcrdma_err_chunk(*rdma_argp); - } - length = (unsigned long)p - (unsigned long)ctxt->sc_xprt_buf; - svc_rdma_sync_reply_hdr(xprt, ctxt, length); - - ctxt->sc_send_wr.opcode = IB_WR_SEND; - ret = svc_rdma_send(xprt, &ctxt->sc_send_wr); - if (ret) - svc_rdma_send_ctxt_put(xprt, ctxt); + svc_rdma_send_error_msg(rdma, sctxt, rctxt, status); } /* By convention, backchannel calls arrive via rdma_msg type @@ -688,30 +732,28 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field. */ -static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, - __be32 *rdma_resp) +static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, + struct svc_rdma_recv_ctxt *rctxt) { - __be32 *p; + __be32 *p = rctxt->rc_recv_buf; if (!xprt->xpt_bc_xprt) return false; - p = rdma_resp + 3; - if (*p++ != rdma_msg) + if (rctxt->rc_msgtype != rdma_msg) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_call_pcl)) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_read_pcl)) return false; - if (*p++ != xdr_zero) + if (!pcl_is_empty(&rctxt->rc_write_pcl)) return false; - - /* XID sanity */ - if (*p++ != *rdma_resp) + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) return false; - /* call direction */ - if (*p == cpu_to_be32(RPC_CALL)) + + /* RPC call direction */ + if (*(p + 8) == cpu_to_be32(RPC_CALL)) return false; return true; @@ -753,29 +795,29 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svcxprt_rdma *rdma_xprt = container_of(xprt, struct svcxprt_rdma, sc_xprt); struct svc_rdma_recv_ctxt *ctxt; - __be32 *p; int ret; + rqstp->rq_xprt_ctxt = NULL; + + ctxt = NULL; spin_lock(&rdma_xprt->sc_rq_dto_lock); - ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q); - if (ctxt) { - list_del(&ctxt->rc_list); - spin_unlock(&rdma_xprt->sc_rq_dto_lock); - rdma_read_complete(rqstp, ctxt); - goto complete; - } ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q); - if (!ctxt) { + if (ctxt) + list_del(&ctxt->rc_list); + else /* No new incoming requests, terminate the loop */ clear_bit(XPT_DATA, &xprt->xpt_flags); - spin_unlock(&rdma_xprt->sc_rq_dto_lock); - return 0; - } - list_del(&ctxt->rc_list); spin_unlock(&rdma_xprt->sc_rq_dto_lock); - atomic_inc(&rdma_stat_recv); + /* Unblock the transport for the next receive */ + svc_xprt_received(xprt); + if (!ctxt) + return 0; + percpu_counter_inc(&svcrdma_stat_recv); + ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device, + ctxt->rc_recv_sge.addr, ctxt->rc_byte_len, + 
DMA_FROM_DEVICE); svc_rdma_build_arg_xdr(rqstp, ctxt); /* Prevent svc_xprt_release from releasing pages in rq_pages @@ -784,49 +826,42 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_next_page = rqstp->rq_respages; - p = (__be32 *)rqstp->rq_arg.head[0].iov_base; - ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); + ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt); if (ret < 0) goto out_err; if (ret == 0) goto out_drop; - rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, p)) { - ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, - &rqstp->rq_arg); - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); - return ret; - } + if (svc_rdma_is_reverse_direction_reply(xprt, ctxt)) + goto out_backchannel; + svc_rdma_get_inv_rkey(rdma_xprt, ctxt); - p += rpcrdma_fixed_maxsz; - if (*p != xdr_zero) - goto out_readchunk; + if (!pcl_is_empty(&ctxt->rc_read_pcl) || + !pcl_is_empty(&ctxt->rc_call_pcl)) { + ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt); + if (ret < 0) + goto out_readfail; + } -complete: rqstp->rq_xprt_ctxt = ctxt; rqstp->rq_prot = IPPROTO_MAX; svc_xprt_copy_addrs(rqstp, xprt); return rqstp->rq_arg.len; -out_readchunk: - ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p); - if (ret < 0) - goto out_postfail; - return 0; - out_err: - svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_send_error(rdma_xprt, ctxt, ret); svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; -out_postfail: +out_readfail: if (ret == -EINVAL) - svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_send_error(rdma_xprt, ctxt, ret); svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; +out_backchannel: + svc_rdma_handle_bc_reply(rqstp, ctxt); out_drop: svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 48fe3b16b0d9..11cf7c646644 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -7,15 +7,13 @@ #include <rdma/rw.h> +#include <linux/sunrpc/xdr.h> #include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> -#include <linux/sunrpc/debug.h> #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc); static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); @@ -37,11 +35,12 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); * controlling svcxprt_rdma is destroyed. 
*/ struct svc_rdma_rw_ctxt { + struct llist_node rw_node; struct list_head rw_list; struct rdma_rw_ctx rw_ctx; - int rw_nents; + unsigned int rw_nents; struct sg_table rw_sg_table; - struct scatterlist rw_first_sgl[0]; + struct scatterlist rw_first_sgl[]; }; static inline struct svc_rdma_rw_ctxt * @@ -55,41 +54,48 @@ static struct svc_rdma_rw_ctxt * svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_rw_ctxt_lock); - - ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); - if (ctxt) { - list_del(&ctxt->rw_list); - spin_unlock(&rdma->sc_rw_ctxt_lock); + node = llist_del_first(&rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); + if (node) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - spin_unlock(&rdma->sc_rw_ctxt_lock); ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), GFP_KERNEL); if (!ctxt) - goto out; + goto out_noctx; + INIT_LIST_HEAD(&ctxt->rw_list); } ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, ctxt->rw_sg_table.sgl, - SG_CHUNK_SIZE)) { - kfree(ctxt); - ctxt = NULL; - } -out: + SG_CHUNK_SIZE)) + goto out_free; return ctxt; + +out_free: + kfree(ctxt); +out_noctx: + trace_svcrdma_no_rwctx_err(rdma, sges); + return NULL; } -static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, - struct svc_rdma_rw_ctxt *ctxt) +static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt, + struct llist_head *list) { sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); + llist_add(&ctxt->rw_node, list); +} - spin_lock(&rdma->sc_rw_ctxt_lock); - list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); - spin_unlock(&rdma->sc_rw_ctxt_lock); +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt) +{ + __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts); } /** @@ -100,15 +106,44 @@ static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { - list_del(&ctxt->rw_list); + while ((node = llist_del_first(&rdma->sc_rw_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); kfree(ctxt); } } +/** + * svc_rdma_rw_ctx_init - Prepare a R/W context for I/O + * @rdma: controlling transport instance + * @ctxt: R/W context to prepare + * @offset: RDMA offset + * @handle: RDMA tag/handle + * @direction: I/O direction + * + * Returns on success, the number of WQEs that will be needed + * on the workqueue, or a negative errno. + */ +static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt, + u64 offset, u32 handle, + enum dma_data_direction direction) +{ + int ret; + + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, + ctxt->rw_sg_table.sgl, ctxt->rw_nents, + 0, offset, handle, direction); + if (unlikely(ret < 0)) { + svc_rdma_put_rw_ctxt(rdma, ctxt); + trace_svcrdma_dma_map_rw_err(rdma, ctxt->rw_nents, ret); + } + return ret; +} + /* A chunk context tracks all I/O for moving one Read or Write - * chunk. This is a a set of rdma_rw's that handle data movement + * chunk. This is a set of rdma_rw's that handle data movement * for all segments of one chunk. 
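A word on the allocator change above: rw_ctxts now sit on an llist. Returning one is a single lock-free llist_add(); taking one uses llist_del_first(), which requires its callers to be serialized, hence the short sc_rw_ctxt_lock critical section. The cache discipline in miniature, with hypothetical demo_ types:

	struct demo_ctxt {
		struct llist_node node;
		/* ... payload ... */
	};

	static struct demo_ctxt *demo_get(struct llist_head *cache,
					  spinlock_t *lock)
	{
		struct llist_node *n;

		spin_lock(lock);	/* llist_del_first() needs serial callers */
		n = llist_del_first(cache);
		spin_unlock(lock);
		if (n)
			return llist_entry(n, struct demo_ctxt, node);
		return kmalloc(sizeof(struct demo_ctxt), GFP_KERNEL);
	}

	static void demo_put(struct llist_head *cache, struct demo_ctxt *ctxt)
	{
		llist_add(&ctxt->node, cache);	/* lock-free push */
	}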
* * These are small, acquired with a single allocator call, and @@ -116,37 +151,62 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) * demand, and not cached. */ struct svc_rdma_chunk_ctxt { + struct rpc_rdma_cid cc_cid; struct ib_cqe cc_cqe; struct svcxprt_rdma *cc_rdma; struct list_head cc_rwctxts; + ktime_t cc_posttime; int cc_sqecount; + enum ib_wc_status cc_status; + struct completion cc_done; }; +static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_sq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc) { + svc_rdma_cc_cid_init(rdma, &cc->cc_cid); cc->cc_rdma = rdma; - svc_xprt_get(&rdma->sc_xprt); INIT_LIST_HEAD(&cc->cc_rwctxts); cc->cc_sqecount = 0; } +/* + * The consumed rw_ctx's are cleaned and placed on a local llist so + * that only one atomic llist operation is needed to put them all + * back on the free list. + */ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir) { struct svcxprt_rdma *rdma = cc->cc_rdma; + struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; + LLIST_HEAD(free); + first = last = NULL; while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, ctxt->rw_sg_table.sgl, ctxt->rw_nents, dir); - svc_rdma_put_rw_ctxt(rdma, ctxt); + __svc_rdma_put_rw_ctxt(rdma, ctxt, &free); + + ctxt->rw_node.next = first; + first = &ctxt->rw_node; + if (!last) + last = first; } - svc_xprt_put(&rdma->sc_xprt); + if (first) + llist_add_batch(first, last, &rdma->sc_rw_ctxts); } /* State for sending a Write or Reply chunk. 
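svc_rdma_cc_release() above takes the batching one step further: the cleaned contexts are strung into a private singly-linked chain, and llist_add_batch() publishes the entire chain to the shared free list with a single atomic exchange. The accumulation pattern, sketched:

	static void demo_release_batch(struct svcxprt_rdma *rdma,
				       struct list_head *rwctxts)
	{
		struct llist_node *first = NULL, *last = NULL;
		struct svc_rdma_rw_ctxt *ctxt, *next;

		list_for_each_entry_safe(ctxt, next, rwctxts, rw_list) {
			list_del(&ctxt->rw_list);
			ctxt->rw_node.next = first;	/* prepend locally */
			first = &ctxt->rw_node;
			if (!last)
				last = first;
		}
		if (first)	/* one atomic operation for the whole batch */
			llist_add_batch(first, last, &rdma->sc_rw_ctxts);
	}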
@@ -154,14 +214,14 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, * - Stores arguments for the SGL constructor functions */ struct svc_rdma_write_info { + const struct svc_rdma_chunk *wi_chunk; + /* write state of this chunk */ unsigned int wi_seg_off; unsigned int wi_seg_no; - unsigned int wi_nsegs; - __be32 *wi_segs; /* SGL constructor arguments */ - struct xdr_buf *wi_xdr; + const struct xdr_buf *wi_xdr; unsigned char *wi_base; unsigned int wi_next_off; @@ -169,7 +229,8 @@ struct svc_rdma_write_info { }; static struct svc_rdma_write_info * -svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) +svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk) { struct svc_rdma_write_info *info; @@ -177,10 +238,9 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) if (!info) return info; + info->wi_chunk = chunk; info->wi_seg_off = 0; info->wi_seg_no = 0; - info->wi_nsegs = be32_to_cpup(++chunk); - info->wi_segs = ++chunk; svc_rdma_cc_init(rdma, &info->wi_cc); info->wi_cc.cc_cqe.done = svc_rdma_write_done; return info; @@ -208,13 +268,21 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_write_info *info = container_of(cc, struct svc_rdma_write_info, wi_cc); - trace_svcrdma_wc_write(wc); + switch (wc->status) { + case IB_WC_SUCCESS: + trace_svcrdma_wc_write(wc, &cc->cc_cid); + break; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); + break; + default: + trace_svcrdma_wc_write_err(wc, &cc->cc_cid); + } - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); if (unlikely(wc->status != IB_WC_SUCCESS)) - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_xprt_deferred_close(&rdma->sc_xprt); svc_rdma_write_info_free(info); } @@ -222,11 +290,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) /* State for pulling a Read chunk. 
*/ struct svc_rdma_read_info { + struct svc_rqst *ri_rqst; struct svc_rdma_recv_ctxt *ri_readctxt; - unsigned int ri_position; unsigned int ri_pageno; unsigned int ri_pageoff; - unsigned int ri_chunklen; + unsigned int ri_totalbytes; struct svc_rdma_chunk_ctxt ri_cc; }; @@ -262,30 +330,25 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svcxprt_rdma *rdma = cc->cc_rdma; - struct svc_rdma_read_info *info = - container_of(cc, struct svc_rdma_read_info, ri_cc); - - trace_svcrdma_wc_read(wc); - - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); + struct svc_rdma_read_info *info; - if (unlikely(wc->status != IB_WC_SUCCESS)) { - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt); - } else { - spin_lock(&rdma->sc_rq_dto_lock); - list_add_tail(&info->ri_readctxt->rc_list, - &rdma->sc_read_complete_q); - /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */ - set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); - spin_unlock(&rdma->sc_rq_dto_lock); - - svc_xprt_enqueue(&rdma->sc_xprt); + switch (wc->status) { + case IB_WC_SUCCESS: + info = container_of(cc, struct svc_rdma_read_info, ri_cc); + trace_svcrdma_wc_read(wc, &cc->cc_cid, info->ri_totalbytes, + cc->cc_posttime); + break; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_read_flush(wc, &cc->cc_cid); + break; + default: + trace_svcrdma_wc_read_err(wc, &cc->cc_cid); } - svc_rdma_read_info_free(info); + svc_rdma_wake_send_waiters(cc->cc_rdma, cc->cc_sqecount); + cc->cc_status = wc->status; + complete(&cc->cc_done); + return; } /* This function sleeps when the transport's Send Queue is congested. @@ -298,7 +361,6 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) { struct svcxprt_rdma *rdma = cc->cc_rdma; - struct svc_xprt *xprt = &rdma->sc_xprt; struct ib_send_wr *first_wr; const struct ib_send_wr *bad_wr; struct list_head *tmp; @@ -322,14 +384,14 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) do { if (atomic_sub_return(cc->cc_sqecount, &rdma->sc_sq_avail) > 0) { + cc->cc_posttime = ktime_get(); ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); - trace_svcrdma_post_rw(&cc->cc_cqe, - cc->cc_sqecount, ret); if (ret) break; return 0; } + percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, @@ -337,7 +399,8 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) trace_svcrdma_sq_retry(rdma); } while (1); - set_bit(XPT_CLOSE, &xprt->xpt_flags); + trace_svcrdma_sq_post_err(rdma, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); /* If even one was posted, there will be a completion. 
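The Send Queue flow control in svc_rdma_post_chunk_ctxt() above works as follows: available WQEs are tracked in sc_sq_avail, the poster optimistically subtracts what it needs, and if the result goes non-positive it refunds the credits and sleeps until completions replenish them. Stripped to the control flow, as a sketch:

	static int demo_reserve_and_post(struct svcxprt_rdma *rdma,
					 struct ib_send_wr *first_wr,
					 int sqecount)
	{
		const struct ib_send_wr *bad_wr;
		int ret;

		do {
			if (atomic_sub_return(sqecount,
					      &rdma->sc_sq_avail) > 0) {
				ret = ib_post_send(rdma->sc_qp, first_wr,
						   &bad_wr);
				if (ret)
					break;		/* QP is unusable */
				return 0;
			}
			/* Not enough room: refund, wait for completions. */
			atomic_add(sqecount, &rdma->sc_sq_avail);
			wait_event(rdma->sc_send_wait,
				   atomic_read(&rdma->sc_sq_avail) > sqecount);
		} while (1);
		return ret;
	}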
*/ if (bad_wr != first_wr) @@ -369,7 +432,7 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, struct svc_rdma_rw_ctxt *ctxt) { unsigned int sge_no, sge_bytes, page_off, page_no; - struct xdr_buf *xdr = info->wi_xdr; + const struct xdr_buf *xdr = info->wi_xdr; struct scatterlist *sg; struct page **page; @@ -407,43 +470,37 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, { struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; struct svcxprt_rdma *rdma = cc->cc_rdma; + const struct svc_rdma_segment *seg; struct svc_rdma_rw_ctxt *ctxt; - __be32 *seg; int ret; - seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; do { unsigned int write_len; - u32 seg_length, seg_handle; - u64 seg_offset; + u64 offset; - if (info->wi_seg_no >= info->wi_nsegs) + if (info->wi_seg_no >= info->wi_chunk->ch_segcount) goto out_overflow; - seg_handle = be32_to_cpup(seg); - seg_length = be32_to_cpup(seg + 1); - xdr_decode_hyper(seg + 2, &seg_offset); - seg_offset += info->wi_seg_off; - - write_len = min(remaining, seg_length - info->wi_seg_off); + seg = &info->wi_chunk->ch_segments[info->wi_seg_no]; + write_len = min(remaining, seg->rs_length - info->wi_seg_off); + if (!write_len) + goto out_overflow; ctxt = svc_rdma_get_rw_ctxt(rdma, (write_len >> PAGE_SHIFT) + 2); if (!ctxt) - goto out_noctx; + return -ENOMEM; constructor(info, write_len, ctxt); - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, ctxt->rw_sg_table.sgl, - ctxt->rw_nents, 0, seg_offset, - seg_handle, DMA_TO_DEVICE); + offset = seg->rs_offset + info->wi_seg_off; + ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle, + DMA_TO_DEVICE); if (ret < 0) - goto out_initerr; + return -EIO; + percpu_counter_inc(&svcrdma_stat_write); - trace_svcrdma_encode_wseg(seg_handle, write_len, seg_offset); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; - if (write_len == seg_length - info->wi_seg_off) { - seg += 4; + if (write_len == seg->rs_length - info->wi_seg_off) { info->wi_seg_no++; info->wi_seg_off = 0; } else { @@ -455,51 +512,96 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, return 0; out_overflow: - dprintk("svcrdma: inadequate space in Write chunk (%u)\n", - info->wi_nsegs); + trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no, + info->wi_chunk->ch_segcount); return -E2BIG; - -out_noctx: - dprintk("svcrdma: no R/W ctxs available\n"); - return -ENOMEM; - -out_initerr: - svc_rdma_put_rw_ctxt(rdma, ctxt); - trace_svcrdma_dma_map_rwctx(rdma, ret); - return -EIO; } -/* Send one of an xdr_buf's kvecs by itself. To send a Reply - * chunk, the whole RPC Reply is written back to the client. - * This function writes either the head or tail of the xdr_buf - * containing the Reply. +/** + * svc_rdma_iov_write - Construct RDMA Writes from an iov + * @info: pointer to write arguments + * @iov: kvec to write + * + * Returns: + * On success, returns zero + * %-E2BIG if the client-provided Write chunk is too small + * %-ENOMEM if a resource has been exhausted + * %-EIO if an rdma-rw error occurred */ -static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info, - struct kvec *vec) +static int svc_rdma_iov_write(struct svc_rdma_write_info *info, + const struct kvec *iov) { - info->wi_base = vec->iov_base; + info->wi_base = iov->iov_base; return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, - vec->iov_len); + iov->iov_len); } -/* Send an xdr_buf's page list by itself. A Write chunk is - * just the page list. a Reply chunk is the head, page list, - * and tail. 
This function is shared between the two types - * of chunk. +/** + * svc_rdma_pages_write - Construct RDMA Writes from pages + * @info: pointer to write arguments + * @xdr: xdr_buf with pages to write + * @offset: offset into the content of @xdr + * @length: number of bytes to write + * + * Returns: + * On success, returns zero + * %-E2BIG if the client-provided Write chunk is too small + * %-ENOMEM if a resource has been exhausted + * %-EIO if an rdma-rw error occurred */ -static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, - struct xdr_buf *xdr) +static int svc_rdma_pages_write(struct svc_rdma_write_info *info, + const struct xdr_buf *xdr, + unsigned int offset, + unsigned long length) { info->wi_xdr = xdr; - info->wi_next_off = 0; + info->wi_next_off = offset - xdr->head[0].iov_len; return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, - xdr->page_len); + length); +} + +/** + * svc_rdma_xb_write - Construct RDMA Writes to write an xdr_buf + * @xdr: xdr_buf to write + * @data: pointer to write arguments + * + * Returns: + * On success, returns zero + * %-E2BIG if the client-provided Write chunk is too small + * %-ENOMEM if a resource has been exhausted + * %-EIO if an rdma-rw error occurred + */ +static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) +{ + struct svc_rdma_write_info *info = data; + int ret; + + if (xdr->head[0].iov_len) { + ret = svc_rdma_iov_write(info, &xdr->head[0]); + if (ret < 0) + return ret; + } + + if (xdr->page_len) { + ret = svc_rdma_pages_write(info, xdr, xdr->head[0].iov_len, + xdr->page_len); + if (ret < 0) + return ret; + } + + if (xdr->tail[0].iov_len) { + ret = svc_rdma_iov_write(info, &xdr->tail[0]); + if (ret < 0) + return ret; + } + + return xdr->len; } /** * svc_rdma_send_write_chunk - Write all segments in a Write chunk * @rdma: controlling RDMA transport - * @wr_ch: Write chunk provided by client + * @chunk: Write chunk provided by the client * @xdr: xdr_buf containing the data payload * * Returns a non-negative number of bytes the chunk consumed, or @@ -509,29 +611,28 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). 
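 *
 * A minimal caller sketch, modeled on svc_rdma_result_payload()
 * elsewhere in this file (error handling abbreviated):
 *
 *	struct xdr_buf subbuf;
 *
 *	if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length))
 *		return -EMSGSIZE;
 *	ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf);
 *	if (ret < 0)
 *		return ret;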
*/ -int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, - struct xdr_buf *xdr) +int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; + struct svc_rdma_chunk_ctxt *cc; int ret; - if (!xdr->page_len) - return 0; - - info = svc_rdma_write_info_alloc(rdma, wr_ch); + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; + cc = &info->wi_cc; - ret = svc_rdma_send_xdr_pagelist(info, xdr); - if (ret < 0) + ret = svc_rdma_xb_write(xdr, info); + if (ret != xdr->len) goto out_err; - ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); + ret = svc_rdma_post_chunk_ctxt(cc); if (ret < 0) goto out_err; - - trace_svcrdma_encode_write(xdr->page_len); - return xdr->page_len; + return xdr->len; out_err: svc_rdma_write_info_free(info); @@ -541,8 +642,7 @@ out_err: /** * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk * @rdma: controlling RDMA transport - * @rp_ch: Reply chunk provided by client - * @writelist: true if client provided a Write list + * @rctxt: Write and Reply chunks from client * @xdr: xdr_buf containing an RPC Reply * * Returns a non-negative number of bytes the chunk consumed, or @@ -552,65 +652,68 @@ out_err: * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, - bool writelist, struct xdr_buf *xdr) +int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; - int consumed, ret; + struct svc_rdma_chunk_ctxt *cc; + struct svc_rdma_chunk *chunk; + int ret; - info = svc_rdma_write_info_alloc(rdma, rp_ch); + if (pcl_is_empty(&rctxt->rc_reply_pcl)) + return 0; + + chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; + cc = &info->wi_cc; - ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]); + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_write, info); if (ret < 0) goto out_err; - consumed = xdr->head[0].iov_len; - - /* Send the page list in the Reply chunk only if the - * client did not provide Write chunks. 
- */ - if (!writelist && xdr->page_len) { - ret = svc_rdma_send_xdr_pagelist(info, xdr); - if (ret < 0) - goto out_err; - consumed += xdr->page_len; - } - - if (xdr->tail[0].iov_len) { - ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]); - if (ret < 0) - goto out_err; - consumed += xdr->tail[0].iov_len; - } - ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); + ret = svc_rdma_post_chunk_ctxt(cc); if (ret < 0) goto out_err; - trace_svcrdma_encode_reply(consumed); - return consumed; + return xdr->len; out_err: svc_rdma_write_info_free(info); return ret; } +/** + * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment + * @info: context for ongoing I/O + * @segment: co-ordinates of remote memory to be read + * + * Returns: + * %0: the Read WR chain was constructed successfully + * %-EINVAL: there were not enough rq_pages to finish + * %-ENOMEM: allocating a local resources failed + * %-EIO: a DMA mapping error occurred + */ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, - struct svc_rqst *rqstp, - u32 rkey, u32 len, u64 offset) + const struct svc_rdma_segment *segment) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; + struct svc_rqst *rqstp = info->ri_rqst; + unsigned int sge_no, seg_len, len; struct svc_rdma_rw_ctxt *ctxt; - unsigned int sge_no, seg_len; struct scatterlist *sg; int ret; + len = segment->rs_length; sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); if (!ctxt) - goto out_noctx; + return -ENOMEM; ctxt->rw_nents = sge_no; sg = ctxt->rw_sg_table.sgl; @@ -618,8 +721,6 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, seg_len = min_t(unsigned int, len, PAGE_SIZE - info->ri_pageoff); - head->rc_arg.pages[info->ri_pageno] = - rqstp->rq_pages[info->ri_pageno]; if (!info->ri_pageoff) head->rc_page_count++; @@ -640,102 +741,198 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, goto out_overrun; } - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp, - cc->cc_rdma->sc_port_num, - ctxt->rw_sg_table.sgl, ctxt->rw_nents, - 0, offset, rkey, DMA_FROM_DEVICE); + ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset, + segment->rs_handle, DMA_FROM_DEVICE); if (ret < 0) - goto out_initerr; + return -EIO; + percpu_counter_inc(&svcrdma_stat_read); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; return 0; -out_noctx: - dprintk("svcrdma: no R/W ctxs available\n"); - return -ENOMEM; - out_overrun: - dprintk("svcrdma: request overruns rq_pages\n"); + trace_svcrdma_page_overrun_err(cc->cc_rdma, rqstp, info->ri_pageno); return -EINVAL; - -out_initerr: - trace_svcrdma_dma_map_rwctx(cc->cc_rdma, ret); - svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt); - return -EIO; } -/* Walk the segments in the Read chunk starting at @p and construct - * RDMA Read operations to pull the chunk to the server. 
+/** + * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk + * @info: context for ongoing I/O + * @chunk: Read chunk to pull + * + * Return values: + * %0: the Read WR chain was constructed successfully + * %-EINVAL: there were not enough resources to finish + * %-ENOMEM: allocating a local resources failed + * %-EIO: a DMA mapping error occurred */ -static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info, - __be32 *p) +static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info, + const struct svc_rdma_chunk *chunk) { - unsigned int i; + const struct svc_rdma_segment *segment; int ret; ret = -EINVAL; - info->ri_chunklen = 0; - while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) { - u32 rs_handle, rs_length; - u64 rs_offset; - - rs_handle = be32_to_cpup(p++); - rs_length = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &rs_offset); - - ret = svc_rdma_build_read_segment(info, rqstp, - rs_handle, rs_length, - rs_offset); + pcl_for_each_segment(segment, chunk) { + ret = svc_rdma_build_read_segment(info, segment); if (ret < 0) break; + info->ri_totalbytes += segment->rs_length; + } + return ret; +} + +/** + * svc_rdma_copy_inline_range - Copy part of the inline content into pages + * @info: context for RDMA Reads + * @offset: offset into the Receive buffer of region to copy + * @remaining: length of region to copy + * + * Take a page at a time from rqstp->rq_pages and copy the inline + * content from the Receive buffer into that page. Update + * info->ri_pageno and info->ri_pageoff so that the next RDMA Read + * result will land contiguously with the copied content. + * + * Return values: + * %0: Inline content was successfully copied + * %-EINVAL: offset or length was incorrect + */ +static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, + unsigned int offset, + unsigned int remaining) +{ + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + unsigned char *dst, *src = head->rc_recv_buf; + struct svc_rqst *rqstp = info->ri_rqst; + unsigned int page_no, numpages; - trace_svcrdma_encode_rseg(rs_handle, rs_length, rs_offset); - info->ri_chunklen += rs_length; + numpages = PAGE_ALIGN(info->ri_pageoff + remaining) >> PAGE_SHIFT; + for (page_no = 0; page_no < numpages; page_no++) { + unsigned int page_len; + + page_len = min_t(unsigned int, remaining, + PAGE_SIZE - info->ri_pageoff); + + if (!info->ri_pageoff) + head->rc_page_count++; + + dst = page_address(rqstp->rq_pages[info->ri_pageno]); + memcpy(dst + info->ri_pageno, src + offset, page_len); + + info->ri_totalbytes += page_len; + info->ri_pageoff += page_len; + if (info->ri_pageoff == PAGE_SIZE) { + info->ri_pageno++; + info->ri_pageoff = 0; + } + remaining -= page_len; + offset += page_len; } - /* Pages under I/O have been copied to head->rc_pages. - * Prevent their premature release by svc_xprt_release() . - */ - for (i = 0; i < info->ri_readctxt->rc_page_count; i++) - rqstp->rq_pages[i] = NULL; + return -EINVAL; +} - return ret; +/** + * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks + * @info: context for RDMA Reads + * + * The chunk data lands in rqstp->rq_arg as a series of contiguous pages, + * like an incoming TCP call. 
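+ *
+ * A worked example with hypothetical sizes: the inline Receive
+ * buffer holds 500 bytes and the client sent two data-item Read
+ * chunks, 1000 bytes at XDR position 200 and 2000 bytes at XDR
+ * position 1400. This function copies inline bytes [0, 200), pulls
+ * the first chunk (ri_totalbytes is now 1200), copies inline bytes
+ * [200, 400), pulls the second chunk, and finally copies the
+ * trailing inline bytes [400, 500), reassembling a 3500-byte rq_arg.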
+ * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). + */ +static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info) +{ + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; + struct xdr_buf *buf = &info->ri_rqst->rq_arg; + struct svc_rdma_chunk *chunk, *next; + unsigned int start, length; + int ret; + + start = 0; + chunk = pcl_first_chunk(pcl); + length = chunk->ch_position; + ret = svc_rdma_copy_inline_range(info, start, length); + if (ret < 0) + return ret; + + pcl_for_each_chunk(chunk, pcl) { + ret = svc_rdma_build_read_chunk(info, chunk); + if (ret < 0) + return ret; + + next = pcl_next_chunk(pcl, chunk); + if (!next) + break; + + start += length; + length = next->ch_position - info->ri_totalbytes; + ret = svc_rdma_copy_inline_range(info, start, length); + if (ret < 0) + return ret; + } + + start += length; + length = head->rc_byte_len - start; + ret = svc_rdma_copy_inline_range(info, start, length); + if (ret < 0) + return ret; + + buf->len += info->ri_totalbytes; + buf->buflen += info->ri_totalbytes; + + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); + buf->pages = &info->ri_rqst->rq_pages[1]; + buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; + return 0; } -/* Construct RDMA Reads to pull over a normal Read chunk. The chunk - * data lands in the page list of head->rc_arg.pages. +/** + * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks + * @info: context for RDMA Reads + * + * The chunk data lands in the page list of rqstp->rq_arg.pages. * - * Currently NFSD does not look at the head->rc_arg.tail[0] iovec. + * Currently NFSD does not look at the rqstp->rq_arg.tail[0] kvec. * Therefore, XDR round-up of the Read chunk and trailing * inline content must both be added at the end of the pagelist. + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info, - __be32 *p) +static int svc_rdma_read_data_item(struct svc_rdma_read_info *info) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + struct xdr_buf *buf = &info->ri_rqst->rq_arg; + struct svc_rdma_chunk *chunk; + unsigned int length; int ret; - ret = svc_rdma_build_read_chunk(rqstp, info, p); + chunk = pcl_first_chunk(&head->rc_read_pcl); + ret = svc_rdma_build_read_chunk(info, chunk); if (ret < 0) goto out; - trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position); - - head->rc_hdr_count = 0; - /* Split the Receive buffer between the head and tail * buffers at Read chunk's position. XDR roundup of the * chunk is not included in either the pagelist or in * the tail. 
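 *
 * For example (hypothetical sizes): if head[0].iov_len is 240 and
 * the chunk's XDR position is 120, head[0] is trimmed to bytes
 * [0, 120), tail[0] is pointed at bytes [120, 240) of the original
 * head buffer, and the chunk's payload logically sits between them.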
*/ - head->rc_arg.tail[0].iov_base = - head->rc_arg.head[0].iov_base + info->ri_position; - head->rc_arg.tail[0].iov_len = - head->rc_arg.head[0].iov_len - info->ri_position; - head->rc_arg.head[0].iov_len = info->ri_position; + buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position; + buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position; + buf->head[0].iov_len = chunk->ch_position; /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). * @@ -746,109 +943,221 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, * Currently these chunks always start at page offset 0, * thus the rounded-up length never crosses a page boundary. */ - info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2; - - head->rc_arg.page_len = info->ri_chunklen; - head->rc_arg.len += info->ri_chunklen; - head->rc_arg.buflen += info->ri_chunklen; + buf->pages = &info->ri_rqst->rq_pages[0]; + length = xdr_align_size(chunk->ch_length); + buf->page_len = length; + buf->len += length; + buf->buflen += length; out: return ret; } -/* Construct RDMA Reads to pull over a Position Zero Read chunk. - * The start of the data lands in the first page just after - * the Transport header, and the rest lands in the page list of - * head->rc_arg.pages. +/** + * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk + * @info: context for RDMA Reads + * @chunk: parsed Call chunk to pull + * @offset: offset of region to pull + * @length: length of region to pull * - * Assumptions: - * - A PZRC has an XDR-aligned length (no implicit round-up). - * - There can be no trailing inline content (IOW, we assume - * a PZRC is never sent in an RDMA_MSG message, though it's - * allowed by spec). + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: there were not enough resources to finish + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info, - __be32 *p) +static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, + const struct svc_rdma_chunk *chunk, + unsigned int offset, unsigned int length) +{ + const struct svc_rdma_segment *segment; + int ret; + + ret = -EINVAL; + pcl_for_each_segment(segment, chunk) { + struct svc_rdma_segment dummy; + + if (offset > segment->rs_length) { + offset -= segment->rs_length; + continue; + } + + dummy.rs_handle = segment->rs_handle; + dummy.rs_length = min_t(u32, length, segment->rs_length) - offset; + dummy.rs_offset = segment->rs_offset + offset; + + ret = svc_rdma_build_read_segment(info, &dummy); + if (ret < 0) + break; + + info->ri_totalbytes += dummy.rs_length; + length -= dummy.rs_length; + offset = 0; + } + return ret; +} + +/** + * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message + * @info: context for RDMA Reads + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: there were not enough resources to finish + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). 
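+ *
+ * A sketch with hypothetical sizes: the call chunk carries 700
+ * bytes and one data-item Read chunk of 200 bytes sits at XDR
+ * position 400. Bytes [0, 400) of the call chunk are pulled first,
+ * then the 200-byte data item, then the remaining call chunk bytes
+ * [400, 700), yielding a 900-byte RPC message.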
+ */ +static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + const struct svc_rdma_chunk *call_chunk = + pcl_first_chunk(&head->rc_call_pcl); + const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; + struct svc_rdma_chunk *chunk, *next; + unsigned int start, length; int ret; - ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (pcl_is_empty(pcl)) + return svc_rdma_build_read_chunk(info, call_chunk); + + start = 0; + chunk = pcl_first_chunk(pcl); + length = chunk->ch_position; + ret = svc_rdma_read_chunk_range(info, call_chunk, start, length); if (ret < 0) - goto out; + return ret; - trace_svcrdma_encode_pzr(info->ri_chunklen); + pcl_for_each_chunk(chunk, pcl) { + ret = svc_rdma_build_read_chunk(info, chunk); + if (ret < 0) + return ret; - head->rc_arg.len += info->ri_chunklen; - head->rc_arg.buflen += info->ri_chunklen; + next = pcl_next_chunk(pcl, chunk); + if (!next) + break; - head->rc_hdr_count = 1; - head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]); - head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE, - info->ri_chunklen); + start += length; + length = next->ch_position - info->ri_totalbytes; + ret = svc_rdma_read_chunk_range(info, call_chunk, + start, length); + if (ret < 0) + return ret; + } - head->rc_arg.page_len = info->ri_chunklen - - head->rc_arg.head[0].iov_len; + start += length; + length = call_chunk->ch_length - start; + return svc_rdma_read_chunk_range(info, call_chunk, start, length); +} + +/** + * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message + * @info: context for RDMA Reads + * + * The start of the data lands in the first page just after the + * Transport header, and the rest lands in rqstp->rq_arg.pages. + * + * Assumptions: + * - A PZRC is never sent in an RDMA_MSG message, though it's + * allowed by spec. + * + * Return values: + * %0: RDMA Read WQEs were successfully built + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). + */ +static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info) +{ + struct xdr_buf *buf = &info->ri_rqst->rq_arg; + int ret; + + ret = svc_rdma_read_call_chunk(info); + if (ret < 0) + goto out; + + buf->len += info->ri_totalbytes; + buf->buflen += info->ri_totalbytes; + + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); + buf->pages = &info->ri_rqst->rq_pages[1]; + buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; out: return ret; } /** - * svc_rdma_recv_read_chunk - Pull a Read chunk from the client + * svc_rdma_process_read_list - Pull list of Read chunks from the client * @rdma: controlling RDMA transport * @rqstp: set of pages to use as Read sink buffers * @head: pages under I/O collect here - * @p: pointer to start of Read chunk * - * Returns: - * %0 if all needed RDMA Reads were posted successfully, - * %-EINVAL if client provided too many segments, - * %-ENOMEM if rdma_rw context pool was exhausted, - * %-ENOTCONN if posting failed (connection is lost), - * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * The RPC/RDMA protocol assumes that the upper layer's XDR decoders + * pull each Read chunk as they decode an incoming RPC message. * - * Assumptions: - * - All Read segments in @p have the same Position value. 
+ * On Linux, however, the server needs to have a fully-constructed RPC + * message in rqstp->rq_arg when there is a positive return code from + * ->xpo_recvfrom. So the Read list is safety-checked immediately when + * it is received, then here the whole Read list is pulled all at once. + * The ingress RPC message is fully reconstructed once all associated + * RDMA Reads have completed. + * + * Return values: + * %1: all needed RDMA Reads were posted successfully, + * %-EINVAL: client provided too many chunks or segments, + * %-ENOMEM: rdma_rw context pool was exhausted, + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, - struct svc_rdma_recv_ctxt *head, __be32 *p) +int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { struct svc_rdma_read_info *info; + struct svc_rdma_chunk_ctxt *cc; int ret; - /* The request (with page list) is constructed in - * head->rc_arg. Pages involved with RDMA Read I/O are - * transferred there. - */ - head->rc_arg.head[0] = rqstp->rq_arg.head[0]; - head->rc_arg.tail[0] = rqstp->rq_arg.tail[0]; - head->rc_arg.pages = head->rc_pages; - head->rc_arg.page_base = 0; - head->rc_arg.page_len = 0; - head->rc_arg.len = rqstp->rq_arg.len; - head->rc_arg.buflen = rqstp->rq_arg.buflen; - info = svc_rdma_read_info_alloc(rdma); if (!info) return -ENOMEM; + cc = &info->ri_cc; + info->ri_rqst = rqstp; info->ri_readctxt = head; info->ri_pageno = 0; info->ri_pageoff = 0; - - info->ri_position = be32_to_cpup(p + 1); - if (info->ri_position) - ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); - else - ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); + info->ri_totalbytes = 0; + + if (pcl_is_empty(&head->rc_call_pcl)) { + if (head->rc_read_pcl.cl_count == 1) + ret = svc_rdma_read_data_item(info); + else + ret = svc_rdma_read_multiple_chunks(info); + } else + ret = svc_rdma_read_special(info); if (ret < 0) goto out_err; - ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); + trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount); + init_completion(&cc->cc_done); + ret = svc_rdma_post_chunk_ctxt(cc); if (ret < 0) goto out_err; - return 0; + + ret = 1; + wait_for_completion(&cc->cc_done); + if (cc->cc_status != IB_WC_SUCCESS) + ret = -EIO; + + /* rq_respages starts after the last arg page */ + rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */ + head->rc_page_count = 0; out_err: svc_rdma_read_info_free(info); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index f3f108090aa4..22a871e6fe4d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -106,21 +106,18 @@ #include <rdma/rdma_cm.h> #include <linux/sunrpc/debug.h> -#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); -static inline struct svc_rdma_send_ctxt * -svc_rdma_next_send_ctxt(struct list_head *list) +static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) { - return list_first_entry_or_null(list, struct svc_rdma_send_ctxt, - sc_list); + cid->ci_queue_id = rdma->sc_sq_cq->res.id; + 
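+	/* The pair (ci_queue_id, ci_completion_id) uniquely tags this
+	 * Send WR so the svcrdma tracepoints can match it to its
+	 * completion. */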
cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); } static struct svc_rdma_send_ctxt * @@ -145,12 +142,17 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) goto fail2; + svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); + ctxt->sc_send_wr.next = NULL; ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; + init_completion(&ctxt->sc_done); ctxt->sc_cqe.done = svc_rdma_wc_send; ctxt->sc_xprt_buf = buffer; + xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, + rdma->sc_max_req_size); ctxt->sc_sges[0].addr = addr; for (i = 0; i < rdma->sc_max_send_sges; i++) @@ -173,9 +175,10 @@ fail0: void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) { - list_del(&ctxt->sc_list); + while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); ib_dma_unmap_single(rdma->sc_pd->device, ctxt->sc_sges[0].addr, rdma->sc_max_req_size, @@ -195,18 +198,22 @@ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_send_lock); - ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts); - if (!ctxt) + node = llist_del_first(&rdma->sc_send_ctxts); + if (!node) goto out_empty; - list_del(&ctxt->sc_list); + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); spin_unlock(&rdma->sc_send_lock); out: + rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0); + xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, + ctxt->sc_xprt_buf, NULL); + ctxt->sc_send_wr.num_sge = 0; ctxt->sc_cur_sge_no = 0; - ctxt->sc_page_count = 0; return ctxt; out_empty: @@ -221,8 +228,6 @@ out_empty: * svc_rdma_send_ctxt_put - Return send_ctxt to free list * @rdma: controlling svcxprt_rdma * @ctxt: object to return to the free list - * - * Pages left in sc_pages are DMA unmapped and released. 
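 *
 * The ctxt goes back onto the transport's sc_send_ctxts llist and
 * may be handed out again by svc_rdma_send_ctxt_get() right away,
 * so the caller must not touch @ctxt after this returns.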
*/ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) @@ -243,12 +248,21 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, ctxt->sc_sges[i].length); } - for (i = 0; i < ctxt->sc_page_count; ++i) - put_page(ctxt->sc_pages[i]); + llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts); +} - spin_lock(&rdma->sc_send_lock); - list_add(&ctxt->sc_list, &rdma->sc_send_ctxts); - spin_unlock(&rdma->sc_send_lock); +/** + * svc_rdma_wake_send_waiters - manage Send Queue accounting + * @rdma: controlling transport + * @avail: Number of additional SQEs that are now available + * + */ +void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail) +{ + atomic_add(avail, &rdma->sc_sq_avail); + smp_mb__after_atomic(); + if (unlikely(waitqueue_active(&rdma->sc_send_wait))) + wake_up(&rdma->sc_send_wait); } /** @@ -263,42 +277,51 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { struct svcxprt_rdma *rdma = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_send_ctxt *ctxt; + struct svc_rdma_send_ctxt *ctxt = + container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - trace_svcrdma_wc_send(wc); + svc_rdma_wake_send_waiters(rdma, 1); + complete(&ctxt->sc_done); - atomic_inc(&rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); + if (unlikely(wc->status != IB_WC_SUCCESS)) + goto flushed; - ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - svc_rdma_send_ctxt_put(rdma, ctxt); + trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + return; - if (unlikely(wc->status != IB_WC_SUCCESS)) { - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_xprt_enqueue(&rdma->sc_xprt); - } - - svc_xprt_put(&rdma->sc_xprt); +flushed: + if (wc->status != IB_WC_WR_FLUSH_ERR) + trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid); + else + trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid); + svc_xprt_deferred_close(&rdma->sc_xprt); } /** * svc_rdma_send - Post a single Send WR * @rdma: transport on which to post the WR - * @wr: prepared Send WR to post + * @ctxt: send ctxt with a Send WR ready to post * - * Returns zero the Send WR was posted successfully. Otherwise, a + * Returns zero if the Send WR was posted successfully. Otherwise, a * negative errno is returned. 
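 *
 * A typical caller posts the Send and then waits for the completion
 * (sketch based on svc_rdma_send_reply_msg() below):
 *
 *	ret = svc_rdma_send(rdma, sctxt);
 *	if (ret < 0)
 *		return ret;
 *	ret = wait_for_completion_killable(&sctxt->sc_done);
 *	svc_rdma_send_ctxt_put(rdma, sctxt);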
*/ -int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) +int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) { + struct ib_send_wr *wr = &ctxt->sc_send_wr; int ret; - might_sleep(); + reinit_completion(&ctxt->sc_done); + + /* Sync the transport header buffer */ + ib_dma_sync_single_for_device(rdma->sc_pd->device, + wr->sg_list[0].addr, + wr->sg_list[0].length, + DMA_TO_DEVICE); /* If the SQ is full, wait until an SQ entry is available */ while (1) { if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { - atomic_inc(&rdma_stat_sq_starve); + percpu_counter_inc(&svcrdma_stat_sq_starve); trace_svcrdma_sq_full(rdma); atomic_inc(&rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, @@ -309,423 +332,471 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) continue; } - svc_xprt_get(&rdma->sc_xprt); + trace_svcrdma_post_send(ctxt); ret = ib_post_send(rdma->sc_qp, wr, NULL); - trace_svcrdma_post_send(wr, ret); - if (ret) { - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_xprt_put(&rdma->sc_xprt); - wake_up(&rdma->sc_send_wait); - } - break; + if (ret) + break; + return 0; } + + trace_svcrdma_sq_post_err(rdma, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + wake_up(&rdma->sc_send_wait); return ret; } -static u32 xdr_padsize(u32 len) +/** + * svc_rdma_encode_read_list - Encode RPC Reply's Read chunk list + * @sctxt: Send context for the RPC Reply + * + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Reply Read list + * %-EMSGSIZE on XDR buffer overflow + */ +static ssize_t svc_rdma_encode_read_list(struct svc_rdma_send_ctxt *sctxt) { - return (len & 3) ? (4 - (len & 3)) : 0; + /* RPC-over-RDMA version 1 replies never have a Read list. */ + return xdr_stream_encode_item_absent(&sctxt->sc_stream); } -/* Returns length of transport header, in bytes. +/** + * svc_rdma_encode_write_segment - Encode one Write segment + * @sctxt: Send context for the RPC Reply + * @chunk: Write chunk to push + * @remaining: remaining bytes of the payload left in the Write chunk + * @segno: which segment in the chunk + * + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Write segment, and updates @remaining + * %-EMSGSIZE on XDR buffer overflow */ -static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp) +static ssize_t svc_rdma_encode_write_segment(struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk, + u32 *remaining, unsigned int segno) { - unsigned int nsegs; + const struct svc_rdma_segment *segment = &chunk->ch_segments[segno]; + const size_t len = rpcrdma_segment_maxsz * sizeof(__be32); + u32 length; __be32 *p; - p = rdma_resp; - - /* RPC-over-RDMA V1 replies never have a Read list. */ - p += rpcrdma_fixed_maxsz + 1; - - /* Skip Write list. */ - while (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - /* Skip Reply chunk. 
*/ - if (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - return (unsigned long)p - (unsigned long)rdma_resp; + p = xdr_reserve_space(&sctxt->sc_stream, len); + if (!p) + return -EMSGSIZE; + + length = min_t(u32, *remaining, segment->rs_length); + *remaining -= length; + xdr_encode_rdma_segment(p, segment->rs_handle, length, + segment->rs_offset); + trace_svcrdma_encode_wseg(sctxt, segno, segment->rs_handle, length, + segment->rs_offset); + return len; } -/* One Write chunk is copied from Call transport header to Reply - * transport header. Each segment's length field is updated to - * reflect number of bytes consumed in the segment. +/** + * svc_rdma_encode_write_chunk - Encode one Write chunk + * @sctxt: Send context for the RPC Reply + * @chunk: Write chunk to push * - * Returns number of segments in this chunk. + * Copy a Write chunk from the Call transport header to the + * Reply transport header. Update each segment's length field + * to reflect the number of bytes written in that segment. + * + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Write chunk + * %-EMSGSIZE on XDR buffer overflow */ -static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src, - unsigned int remaining) +static ssize_t svc_rdma_encode_write_chunk(struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk) { - unsigned int i, nsegs; - u32 seg_len; - - /* Write list discriminator */ - *dst++ = *src++; - - /* number of segments in this chunk */ - nsegs = be32_to_cpup(src); - *dst++ = *src++; - - for (i = nsegs; i; i--) { - /* segment's RDMA handle */ - *dst++ = *src++; - - /* bytes returned in this segment */ - seg_len = be32_to_cpu(*src); - if (remaining >= seg_len) { - /* entire segment was consumed */ - *dst = *src; - remaining -= seg_len; - } else { - /* segment only partly filled */ - *dst = cpu_to_be32(remaining); - remaining = 0; - } - dst++; src++; + u32 remaining = chunk->ch_payload_length; + unsigned int segno; + ssize_t len, ret; - /* segment's RDMA offset */ - *dst++ = *src++; - *dst++ = *src++; - } + len = 0; + ret = xdr_stream_encode_item_present(&sctxt->sc_stream); + if (ret < 0) + return ret; + len += ret; - return nsegs; -} + ret = xdr_stream_encode_u32(&sctxt->sc_stream, chunk->ch_segcount); + if (ret < 0) + return ret; + len += ret; -/* The client provided a Write list in the Call message. Fill in - * the segments in the first Write chunk in the Reply's transport - * header with the number of bytes consumed in each segment. - * Remaining chunks are returned unused. - * - * Assumptions: - * - Client has provided only one Write chunk - */ -static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch, - unsigned int consumed) -{ - unsigned int nsegs; - __be32 *p, *q; - - /* RPC-over-RDMA V1 replies never have a Read list. */ - p = rdma_resp + rpcrdma_fixed_maxsz + 1; - - q = wr_ch; - while (*q != xdr_zero) { - nsegs = xdr_encode_write_chunk(p, q, consumed); - q += 2 + nsegs * rpcrdma_segment_maxsz; - p += 2 + nsegs * rpcrdma_segment_maxsz; - consumed = 0; + for (segno = 0; segno < chunk->ch_segcount; segno++) { + ret = svc_rdma_encode_write_segment(sctxt, chunk, &remaining, segno); + if (ret < 0) + return ret; + len += ret; } - /* Terminate Write list */ - *p++ = xdr_zero; - - /* Reply chunk discriminator; may be replaced later */ - *p = xdr_zero; + return len; } -/* The client provided a Reply chunk in the Call message. 
Fill in - * the segments in the Reply chunk in the Reply message with the - * number of bytes consumed in each segment. +/** + * svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list + * @rctxt: Reply context with information about the RPC Call + * @sctxt: Send context for the RPC Reply * - * Assumptions: - * - Reply can always fit in the provided Reply chunk + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Reply's Write list + * %-EMSGSIZE on XDR buffer overflow */ -static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch, - unsigned int consumed) +static ssize_t svc_rdma_encode_write_list(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt) { - __be32 *p; + struct svc_rdma_chunk *chunk; + ssize_t len, ret; - /* Find the Reply chunk in the Reply's xprt header. - * RPC-over-RDMA V1 replies never have a Read list. - */ - p = rdma_resp + rpcrdma_fixed_maxsz + 1; + len = 0; + pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { + ret = svc_rdma_encode_write_chunk(sctxt, chunk); + if (ret < 0) + return ret; + len += ret; + } - /* Skip past Write list */ - while (*p++ != xdr_zero) - p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + /* Terminate the Write list */ + ret = xdr_stream_encode_item_absent(&sctxt->sc_stream); + if (ret < 0) + return ret; - xdr_encode_write_chunk(p, rp_ch, consumed); + return len + ret; } -/* Parse the RPC Call's transport header. +/** + * svc_rdma_encode_reply_chunk - Encode RPC Reply's Reply chunk + * @rctxt: Reply context with information about the RPC Call + * @sctxt: Send context for the RPC Reply + * @length: size in bytes of the payload in the Reply chunk + * + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Reply's Reply chunk + * %-EMSGSIZE on XDR buffer overflow + * %-E2BIG if the RPC message is larger than the Reply chunk */ -static void svc_rdma_get_write_arrays(__be32 *rdma_argp, - __be32 **write, __be32 **reply) +static ssize_t +svc_rdma_encode_reply_chunk(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + unsigned int length) { - __be32 *p; - - p = rdma_argp + rpcrdma_fixed_maxsz; + struct svc_rdma_chunk *chunk; - /* Read list */ - while (*p++ != xdr_zero) - p += 5; + if (pcl_is_empty(&rctxt->rc_reply_pcl)) + return xdr_stream_encode_item_absent(&sctxt->sc_stream); - /* Write list */ - if (*p != xdr_zero) { - *write = p; - while (*p++ != xdr_zero) - p += 1 + be32_to_cpu(*p) * 4; - } else { - *write = NULL; - p++; - } + chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); + if (length > chunk->ch_length) + return -E2BIG; - /* Reply chunk */ - if (*p != xdr_zero) - *reply = p; - else - *reply = NULL; + chunk->ch_payload_length = length; + return svc_rdma_encode_write_chunk(sctxt, chunk); } -static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct page *page, - unsigned long offset, - unsigned int len) +struct svc_rdma_map_data { + struct svcxprt_rdma *md_rdma; + struct svc_rdma_send_ctxt *md_ctxt; +}; + +/** + * svc_rdma_page_dma_map - DMA map one page + * @data: pointer to arguments + * @page: struct page to DMA map + * @offset: offset into the page + * @len: number of bytes to map + * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if the page cannot be DMA mapped + */ +static int svc_rdma_page_dma_map(void *data, struct page *page, + unsigned long offset, unsigned int len) { + struct svc_rdma_map_data *args = 
data; + struct svcxprt_rdma *rdma = args->md_rdma; + struct svc_rdma_send_ctxt *ctxt = args->md_ctxt; struct ib_device *dev = rdma->sc_cm_id->device; dma_addr_t dma_addr; + ++ctxt->sc_cur_sge_no; + dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); - trace_svcrdma_dma_map_page(rdma, dma_addr, len); if (ib_dma_mapping_error(dev, dma_addr)) goto out_maperr; + trace_svcrdma_dma_map_page(rdma, dma_addr, len); ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr; ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len; ctxt->sc_send_wr.num_sge++; return 0; out_maperr: + trace_svcrdma_dma_map_err(rdma, dma_addr, len); return -EIO; } -/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() +/** + * svc_rdma_iov_dma_map - DMA map an iovec + * @data: pointer to arguments + * @iov: kvec to DMA map + * + * ib_dma_map_page() is used here because svc_rdma_dma_unmap() * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively. + * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if the iovec cannot be DMA mapped */ -static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - unsigned char *base, - unsigned int len) +static int svc_rdma_iov_dma_map(void *data, const struct kvec *iov) { - return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base), - offset_in_page(base), len); + if (!iov->iov_len) + return 0; + return svc_rdma_page_dma_map(data, virt_to_page(iov->iov_base), + offset_in_page(iov->iov_base), + iov->iov_len); } /** - * svc_rdma_sync_reply_hdr - DMA sync the transport header buffer - * @rdma: controlling transport - * @ctxt: send_ctxt for the Send WR - * @len: length of transport header + * svc_rdma_xb_dma_map - DMA map all segments of an xdr_buf + * @xdr: xdr_buf containing portion of an RPC message to transmit + * @data: pointer to arguments + * + * Returns: + * %0 if DMA mapping was successful + * %-EIO if DMA mapping failed * + * On failure, any DMA mappings that have been already done must be + * unmapped by the caller. */ -void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - unsigned int len) +static int svc_rdma_xb_dma_map(const struct xdr_buf *xdr, void *data) { - ctxt->sc_sges[0].length = len; - ctxt->sc_send_wr.num_sge++; - ib_dma_sync_single_for_device(rdma->sc_pd->device, - ctxt->sc_sges[0].addr, len, - DMA_TO_DEVICE); -} + unsigned int len, remaining; + unsigned long pageoff; + struct page **ppages; + int ret; -/* If the xdr_buf has more elements than the device can - * transmit in a single RDMA Send, then the reply will - * have to be copied into a bounce buffer. 
- */ -static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, - struct xdr_buf *xdr, - __be32 *wr_lst) -{ - int elements; - - /* xdr->head */ - elements = 1; - - /* xdr->pages */ - if (!wr_lst) { - unsigned int remaining; - unsigned long pageoff; - - pageoff = xdr->page_base & ~PAGE_MASK; - remaining = xdr->page_len; - while (remaining) { - ++elements; - remaining -= min_t(u32, PAGE_SIZE - pageoff, - remaining); - pageoff = 0; - } + ret = svc_rdma_iov_dma_map(data, &xdr->head[0]); + if (ret < 0) + return ret; + + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + pageoff = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - pageoff, remaining); + + ret = svc_rdma_page_dma_map(data, *ppages++, pageoff, len); + if (ret < 0) + return ret; + + remaining -= len; + pageoff = 0; } - /* xdr->tail */ - if (xdr->tail[0].iov_len) - ++elements; + ret = svc_rdma_iov_dma_map(data, &xdr->tail[0]); + if (ret < 0) + return ret; - /* assume 1 SGE is needed for the transport header */ - return elements >= rdma->sc_max_send_sges; + return xdr->len; } -/* The device is not capable of sending the reply directly. - * Assemble the elements of @xdr into the transport header - * buffer. +struct svc_rdma_pullup_data { + u8 *pd_dest; + unsigned int pd_length; + unsigned int pd_num_sges; +}; + +/** + * svc_rdma_xb_count_sges - Count how many SGEs will be needed + * @xdr: xdr_buf containing portion of an RPC message to transmit + * @data: pointer to arguments + * + * Returns: + * Number of SGEs needed to Send the contents of @xdr inline */ -static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct xdr_buf *xdr, __be32 *wr_lst) +static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr, + void *data) { - unsigned char *dst, *tailbase; - unsigned int taillen; - - dst = ctxt->sc_xprt_buf; - dst += ctxt->sc_sges[0].length; + struct svc_rdma_pullup_data *args = data; + unsigned int remaining; + unsigned long offset; - memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len); - dst += xdr->head[0].iov_len; + if (xdr->head[0].iov_len) + ++args->pd_num_sges; - tailbase = xdr->tail[0].iov_base; - taillen = xdr->tail[0].iov_len; - if (wr_lst) { - u32 xdrpad; - - xdrpad = xdr_padsize(xdr->page_len); - if (taillen && xdrpad) { - tailbase += xdrpad; - taillen -= xdrpad; - } - } else { - unsigned int len, remaining; - unsigned long pageoff; - struct page **ppages; - - ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - pageoff = xdr->page_base & ~PAGE_MASK; - remaining = xdr->page_len; - while (remaining) { - len = min_t(u32, PAGE_SIZE - pageoff, remaining); - - memcpy(dst, page_address(*ppages), len); - remaining -= len; - dst += len; - pageoff = 0; - } + offset = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + ++args->pd_num_sges; + remaining -= min_t(u32, PAGE_SIZE - offset, remaining); + offset = 0; } - if (taillen) - memcpy(dst, tailbase, taillen); - - ctxt->sc_sges[0].length += xdr->len; - ib_dma_sync_single_for_device(rdma->sc_pd->device, - ctxt->sc_sges[0].addr, - ctxt->sc_sges[0].length, - DMA_TO_DEVICE); + if (xdr->tail[0].iov_len) + ++args->pd_num_sges; + args->pd_length += xdr->len; return 0; } -/* svc_rdma_map_reply_msg - Map the buffer holding RPC message +/** + * svc_rdma_pull_up_needed - Determine whether to use pull-up * @rdma: controlling transport - * @ctxt: send_ctxt for the Send WR - * @xdr: prepared xdr_buf containing RPC message - * @wr_lst: 
pointer to Call header's Write list, or NULL - * - * Load the xdr_buf into the ctxt's sge array, and DMA map each - * element as it is added. + * @sctxt: send_ctxt for the Send WR + * @rctxt: Write and Reply chunks provided by client + * @xdr: xdr_buf containing RPC message to transmit * - * Returns zero on success, or a negative errno on failure. + * Returns: + * %true if pull-up must be used + * %false otherwise */ -int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct xdr_buf *xdr, __be32 *wr_lst) +static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma, + const struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) { - unsigned int len, remaining; - unsigned long page_off; - struct page **ppages; - unsigned char *base; - u32 xdr_pad; + /* Resources needed for the transport header */ + struct svc_rdma_pullup_data args = { + .pd_length = sctxt->sc_hdrbuf.len, + .pd_num_sges = 1, + }; int ret; - if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst)) - return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst); - - ++ctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_buf(rdma, ctxt, - xdr->head[0].iov_base, - xdr->head[0].iov_len); + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_count_sges, &args); if (ret < 0) - return ret; + return false; - /* If a Write chunk is present, the xdr_buf's page list - * is not included inline. However the Upper Layer may - * have added XDR padding in the tail buffer, and that - * should not be included inline. - */ - if (wr_lst) { - base = xdr->tail[0].iov_base; - len = xdr->tail[0].iov_len; - xdr_pad = xdr_padsize(xdr->page_len); - - if (len && xdr_pad) { - base += xdr_pad; - len -= xdr_pad; - } + if (args.pd_length < RPCRDMA_PULLUP_THRESH) + return true; + return args.pd_num_sges >= rdma->sc_max_send_sges; +} - goto tail; +/** + * svc_rdma_xb_linearize - Copy region of xdr_buf to flat buffer + * @xdr: xdr_buf containing portion of an RPC message to copy + * @data: pointer to arguments + * + * Returns: + * Always zero. + */ +static int svc_rdma_xb_linearize(const struct xdr_buf *xdr, + void *data) +{ + struct svc_rdma_pullup_data *args = data; + unsigned int len, remaining; + unsigned long pageoff; + struct page **ppages; + + if (xdr->head[0].iov_len) { + memcpy(args->pd_dest, xdr->head[0].iov_base, xdr->head[0].iov_len); + args->pd_dest += xdr->head[0].iov_len; } ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - page_off = xdr->page_base & ~PAGE_MASK; + pageoff = offset_in_page(xdr->page_base); remaining = xdr->page_len; while (remaining) { - len = min_t(u32, PAGE_SIZE - page_off, remaining); - - ++ctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++, - page_off, len); - if (ret < 0) - return ret; - + len = min_t(u32, PAGE_SIZE - pageoff, remaining); + memcpy(args->pd_dest, page_address(*ppages) + pageoff, len); remaining -= len; - page_off = 0; + args->pd_dest += len; + pageoff = 0; + ppages++; } - base = xdr->tail[0].iov_base; - len = xdr->tail[0].iov_len; -tail: - if (len) { - ++ctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len); - if (ret < 0) - return ret; + if (xdr->tail[0].iov_len) { + memcpy(args->pd_dest, xdr->tail[0].iov_base, xdr->tail[0].iov_len); + args->pd_dest += xdr->tail[0].iov_len; } + args->pd_length += xdr->len; return 0; } -/* The svc_rqst and all resources it owns are released as soon as - * svc_rdma_sendto returns. 
Transfer pages under I/O to the ctxt - * so they are released by the Send completion handler. +/** + * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer + * @rdma: controlling transport + * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared + * @rctxt: Write and Reply chunks provided by client + * @xdr: prepared xdr_buf containing RPC message + * + * The device is not capable of sending the reply directly. + * Assemble the elements of @xdr into the transport header buffer. + * + * Assumptions: + * pull_up_needed has determined that @xdr will fit in the buffer. + * + * Returns: + * %0 if pull-up was successful + * %-EMSGSIZE if a buffer manipulation problem occurred */ -static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, - struct svc_rdma_send_ctxt *ctxt) +static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) { - int i, pages = rqstp->rq_next_page - rqstp->rq_respages; + struct svc_rdma_pullup_data args = { + .pd_dest = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len, + }; + int ret; - ctxt->sc_page_count += pages; - for (i = 0; i < pages; i++) { - ctxt->sc_pages[i] = rqstp->rq_respages[i]; - rqstp->rq_respages[i] = NULL; - } + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_linearize, &args); + if (ret < 0) + return ret; + + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len + args.pd_length; + trace_svcrdma_send_pullup(sctxt, args.pd_length); + return 0; +} + +/* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message + * @rdma: controlling transport + * @sctxt: send_ctxt for the Send WR + * @rctxt: Write and Reply chunks provided by client + * @xdr: prepared xdr_buf containing RPC message + * + * Returns: + * %0 if DMA mapping was successful. + * %-EMSGSIZE if a buffer manipulation problem occurred + * %-EIO if DMA mapping failed + * + * The Send WR's num_sge field is set in all cases. + */ +int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) +{ + struct svc_rdma_map_data args = { + .md_rdma = rdma, + .md_ctxt = sctxt, + }; - /* Prevent svc_xprt_release from releasing pages in rq_pages */ - rqstp->rq_next_page = rqstp->rq_respages; + /* Set up the (persistently-mapped) transport header SGE. */ + sctxt->sc_send_wr.num_sge = 1; + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; + + /* If there is a Reply chunk, nothing follows the transport + * header, and we're done here. + */ + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) + return 0; + + /* For pull-up, svc_rdma_send() will sync the transport header. + * No additional DMA mapping is necessary. 
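+ *
+ * In all, this function handles three cases: a Reply chunk means
+ * only the transport header is Sent inline; a small or SGE-hungry
+ * @xdr is linearized into the header buffer by pull-up; otherwise
+ * each kvec and page of @xdr gets its own DMA-mapped SGE.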
+ */ + if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr)) + return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr); + + return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_dma_map, &args); } /* Prepare the portion of the RPC Reply that will be transmitted @@ -748,20 +819,14 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, */ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - struct svc_rdma_recv_ctxt *rctxt, - struct svc_rqst *rqstp, - __be32 *wr_lst, __be32 *rp_ch) + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rqst *rqstp) { int ret; - if (!rp_ch) { - ret = svc_rdma_map_reply_msg(rdma, sctxt, - &rqstp->rq_res, wr_lst); - if (ret < 0) - return ret; - } - - svc_rdma_save_io_pages(rqstp, sctxt); + ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqstp->rq_res); + if (ret < 0) + return ret; if (rctxt->rc_inv_rkey) { sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; @@ -769,42 +834,84 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, } else { sctxt->sc_send_wr.opcode = IB_WR_SEND; } - dprintk("svcrdma: posting Send WR with %u sge(s)\n", - sctxt->sc_send_wr.num_sge); - return svc_rdma_send(rdma, &sctxt->sc_send_wr); + + ret = svc_rdma_send(rdma, sctxt); + if (ret < 0) + return ret; + + ret = wait_for_completion_killable(&sctxt->sc_done); + svc_rdma_send_ctxt_put(rdma, sctxt); + return ret; } -/* Given the client-provided Write and Reply chunks, the server was not - * able to form a complete reply. Return an RDMA_ERROR message so the - * client can retire this RPC transaction. As above, the Send completion - * routine releases payload pages that were part of a previous RDMA Write. +/** + * svc_rdma_send_error_msg - Send an RPC/RDMA v1 error response + * @rdma: controlling transport context + * @sctxt: Send context for the response + * @rctxt: Receive context for incoming bad message + * @status: negative errno indicating error that occurred * - * Remote Invalidation is skipped for simplicity. + * Given the client-provided Read, Write, and Reply chunks, the + * server was not able to parse the Call or form a complete Reply. + * Return an RDMA_ERROR message so the client can retire the RPC + * transaction. + * + * The caller does not have to release @sctxt. It is released by + * Send completion, or by this function on error. 
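+ *
+ * Typical use (see svc_rdma_sendto() below): when Reply encoding
+ * fails with %-E2BIG or %-EINVAL, the server abandons the Reply and
+ * reports the problem to the client instead:
+ *
+ *	svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret);
+ *	return 0;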
*/ -static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct svc_rqst *rqstp) +void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_recv_ctxt *rctxt, + int status) { + __be32 *rdma_argp = rctxt->rc_recv_buf; __be32 *p; - int ret; - p = ctxt->sc_xprt_buf; - trace_svcrdma_err_chunk(*p); - p += 3; - *p++ = rdma_error; - *p = err_chunk; - svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_ERR); + rpcrdma_set_xdrlen(&sctxt->sc_hdrbuf, 0); + xdr_init_encode(&sctxt->sc_stream, &sctxt->sc_hdrbuf, + sctxt->sc_xprt_buf, NULL); - svc_rdma_save_io_pages(rqstp, ctxt); + p = xdr_reserve_space(&sctxt->sc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); + if (!p) + goto put_ctxt; - ctxt->sc_send_wr.opcode = IB_WR_SEND; - ret = svc_rdma_send(rdma, &ctxt->sc_send_wr); - if (ret) { - svc_rdma_send_ctxt_put(rdma, ctxt); - return ret; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = rdma->sc_fc_credits; + *p = rdma_error; + + switch (status) { + case -EPROTONOSUPPORT: + p = xdr_reserve_space(&sctxt->sc_stream, 3 * sizeof(*p)); + if (!p) + goto put_ctxt; + + *p++ = err_vers; + *p++ = rpcrdma_version; + *p = rpcrdma_version; + trace_svcrdma_err_vers(*rdma_argp); + break; + default: + p = xdr_reserve_space(&sctxt->sc_stream, sizeof(*p)); + if (!p) + goto put_ctxt; + + *p = err_chunk; + trace_svcrdma_err_chunk(*rdma_argp); } - return 0; + /* Remote Invalidation is skipped for simplicity. */ + sctxt->sc_send_wr.num_sge = 1; + sctxt->sc_send_wr.opcode = IB_WR_SEND; + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; + if (svc_rdma_send(rdma, sctxt)) + goto put_ctxt; + + wait_for_completion_killable(&sctxt->sc_done); + +put_ctxt: + svc_rdma_send_ctxt_put(rdma, sctxt); } /** @@ -825,78 +932,113 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; - __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch; - struct xdr_buf *xdr = &rqstp->rq_res; + __be32 *rdma_argp = rctxt->rc_recv_buf; struct svc_rdma_send_ctxt *sctxt; + unsigned int rc_size; + __be32 *p; int ret; - rdma_argp = rctxt->rc_recv_buf; - svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch); + ret = -ENOTCONN; + if (svc_xprt_is_dead(xprt)) + goto drop_connection; - /* Create the RDMA response header. xprt->xpt_mutex, - * acquired in svc_send(), serializes RPC replies. The - * code path below that inserts the credit grant value - * into each transport header runs only inside this - * critical section. - */ ret = -ENOMEM; sctxt = svc_rdma_send_ctxt_get(rdma); if (!sctxt) - goto err0; - rdma_resp = sctxt->sc_xprt_buf; + goto drop_connection; + + ret = -EMSGSIZE; + p = xdr_reserve_space(&sctxt->sc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); + if (!p) + goto put_ctxt; + + ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); + if (ret < 0) + goto reply_chunk; + rc_size = ret; - p = rdma_resp; *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); *p++ = rdma->sc_fc_credits; - *p++ = rp_ch ? rdma_nomsg : rdma_msg; - - /* Start with empty chunks */ - *p++ = xdr_zero; - *p++ = xdr_zero; - *p = xdr_zero; + *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? 
rdma_msg : rdma_nomsg; - if (wr_lst) { - /* XXX: Presume the client sent only one Write chunk */ - ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr); - if (ret < 0) - goto err2; - svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret); - } - if (rp_ch) { - ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr); - if (ret < 0) - goto err2; - svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret); - } + ret = svc_rdma_encode_read_list(sctxt); + if (ret < 0) + goto put_ctxt; + ret = svc_rdma_encode_write_list(rctxt, sctxt); + if (ret < 0) + goto put_ctxt; + ret = svc_rdma_encode_reply_chunk(rctxt, sctxt, rc_size); + if (ret < 0) + goto put_ctxt; - svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp)); - ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp, - wr_lst, rp_ch); + ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp); if (ret < 0) - goto err1; - ret = 0; + goto put_ctxt; -out: - rqstp->rq_xprt_ctxt = NULL; - svc_rdma_recv_ctxt_put(rdma, rctxt); - return ret; + /* Prevent svc_xprt_release() from releasing the page backing + * rq_res.head[0].iov_base. It's no longer being accessed by + * the I/O device. */ + rqstp->rq_respages++; + return 0; - err2: +reply_chunk: if (ret != -E2BIG && ret != -EINVAL) - goto err1; + goto put_ctxt; - ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp); - if (ret < 0) - goto err1; - ret = 0; - goto out; + svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret); + return 0; - err1: +put_ctxt: svc_rdma_send_ctxt_put(rdma, sctxt); - err0: - trace_svcrdma_send_failed(rqstp, ret); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - ret = -ENOTCONN; - goto out; +drop_connection: + trace_svcrdma_send_err(rqstp, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + return -ENOTCONN; +} + +/** + * svc_rdma_result_payload - special processing for a result payload + * @rqstp: svc_rqst to operate on + * @offset: payload's byte offset in @xdr + * @length: size of payload, in bytes + * + * Return values: + * %0 if successful or nothing needed to be done + * %-EMSGSIZE on XDR buffer overflow + * %-E2BIG if the payload was larger than the Write chunk + * %-EINVAL if client provided too many segments + * %-ENOMEM if rdma_rw context pool was exhausted + * %-ENOTCONN if posting failed (connection is lost) + * %-EIO if rdma_rw initialization failed (DMA mapping, etc) + */ +int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) +{ + struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; + struct svc_rdma_chunk *chunk; + struct svcxprt_rdma *rdma; + struct xdr_buf subbuf; + int ret; + + chunk = rctxt->rc_cur_result_payload; + if (!length || !chunk) + return 0; + rctxt->rc_cur_result_payload = + pcl_next_chunk(&rctxt->rc_write_pcl, chunk); + if (length > chunk->ch_length) + return -E2BIG; + + chunk->ch_position = offset; + chunk->ch_payload_length = length; + + if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length)) + return -EMSGSIZE; + + rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); + ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf); + if (ret < 0) + return ret; + return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 145a3615c319..199fa012f18a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -55,7 +55,6 @@ #include <linux/sunrpc/addr.h> #include <linux/sunrpc/debug.h> -#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/svc_rdma.h> @@ 
-71,7 +70,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct sockaddr *sa, int salen, int flags); static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); -static void svc_rdma_release_rqst(struct svc_rqst *); static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt *xprt); @@ -82,6 +80,7 @@ static const struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, .xpo_recvfrom = svc_rdma_recvfrom, .xpo_sendto = svc_rdma_sendto, + .xpo_result_payload = svc_rdma_result_payload, .xpo_release_rqst = svc_rdma_release_rqst, .xpo_detach = svc_rdma_detach, .xpo_free = svc_rdma_free, @@ -120,8 +119,7 @@ static void qp_event_handler(struct ib_event *event, void *context) case IB_EVENT_QP_ACCESS_ERR: case IB_EVENT_DEVICE_FATAL: default: - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); + svc_xprt_deferred_close(xprt); break; } } @@ -138,10 +136,9 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); - INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); - INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); + init_llist_head(&cma_xprt->sc_send_ctxts); init_llist_head(&cma_xprt->sc_recv_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); + init_llist_head(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); @@ -211,7 +208,12 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, newxprt->sc_ord = param->initiator_depth; sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; - svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + newxprt->sc_xprt.xpt_remotelen = svc_addr_len(sa); + memcpy(&newxprt->sc_xprt.xpt_remote, sa, + newxprt->sc_xprt.xpt_remotelen); + snprintf(newxprt->sc_xprt.xpt_remotebuf, + sizeof(newxprt->sc_xprt.xpt_remotebuf) - 1, "%pISc", sa); + /* The remote port is arbitrary and not under the control of the * client ULP. Set it to a fixed value so that the DRC continues * to be effective after a reconnect. @@ -233,72 +235,58 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, svc_xprt_enqueue(&listen_xprt->sc_xprt); } -/* - * Handles events generated on the listening endpoint. These events will be - * either be incoming connect requests or adapter removal events. +/** + * svc_rdma_listen_handler - Handle CM events generated on a listening endpoint + * @cma_id: the server's listener rdma_cm_id + * @event: details of the event + * + * Return values: + * %0: Do not destroy @cma_id + * %1: Destroy @cma_id (never returned here) + * + * NB: There is never a DEVICE_REMOVAL event for INADDR_ANY listeners. 
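The return-value contract documented above is easy to miss: an rdma_cm event handler that returns nonzero asks the core to destroy the cm_id, so a listener must always return 0. A skeletal sketch of that dispatch shape, with stub types standing in for the rdma_cm structures:

#include <stdio.h>

/* Stand-ins for rdma_cm types; illustrative only. */
enum cm_event_type { CM_CONNECT_REQUEST, CM_DISCONNECTED, CM_OTHER };
struct cm_id { const char *name; };
struct cm_event { enum cm_event_type event; };

/* Return 0 to keep @id alive; nonzero tells the core to destroy it. */
static int listen_handler(struct cm_id *id, struct cm_event *event)
{
	switch (event->event) {
	case CM_CONNECT_REQUEST:
		printf("%s: accepting new connection\n", id->name);
		break;
	default:
		/* Ignore everything else; a listener is never
		 * destroyed from inside its own handler. */
		break;
	}
	return 0;
}

int main(void)
{
	struct cm_id id = { "listener" };
	struct cm_event ev = { CM_CONNECT_REQUEST };

	return listen_handler(&id, &ev);
}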
*/ -static int rdma_listen_handler(struct rdma_cm_id *cma_id, - struct rdma_cm_event *event) +static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) { - struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr; - - trace_svcrdma_cm_event(event, sap); - switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: - dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " - "event = %s (%d)\n", cma_id, cma_id->context, - rdma_event_msg(event->event), event->event); handle_connect_req(cma_id, &event->param.conn); break; default: - /* NB: No device removal upcall for INADDR_ANY listeners */ - dprintk("svcrdma: Unexpected event on listening endpoint %p, " - "event = %s (%d)\n", cma_id, - rdma_event_msg(event->event), event->event); break; } - return 0; } -static int rdma_cma_handler(struct rdma_cm_id *cma_id, - struct rdma_cm_event *event) +/** + * svc_rdma_cma_handler - Handle CM events on client connections + * @cma_id: the server's listener rdma_cm_id + * @event: details of the event + * + * Return values: + * %0: Do not destroy @cma_id + * %1: Destroy @cma_id (never returned here) + */ +static int svc_rdma_cma_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) { - struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.dst_addr; struct svcxprt_rdma *rdma = cma_id->context; struct svc_xprt *xprt = &rdma->sc_xprt; - trace_svcrdma_cm_event(event, sap); - switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: - /* Accept complete */ - svc_xprt_get(xprt); - dprintk("svcrdma: Connection completed on DTO xprt=%p, " - "cm_id=%p\n", xprt, cma_id); clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); + + /* Handle any requests that were received while + * CONN_PENDING was set. */ svc_xprt_enqueue(xprt); break; case RDMA_CM_EVENT_DISCONNECTED: - dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", - xprt, cma_id); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); - break; case RDMA_CM_EVENT_DEVICE_REMOVAL: - dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " - "event = %s (%d)\n", cma_id, xprt, - rdma_event_msg(event->event), event->event); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); + svc_xprt_deferred_close(xprt); break; default: - dprintk("svcrdma: Unexpected event on DTO endpoint %p, " - "event = %s (%d)\n", cma_id, - rdma_event_msg(event->event), event->event); break; } return 0; @@ -316,22 +304,18 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct svcxprt_rdma *cma_xprt; int ret; - dprintk("svcrdma: Creating RDMA listener\n"); - if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) { - dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family); + if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6) return ERR_PTR(-EAFNOSUPPORT); - } cma_xprt = svc_rdma_create_xprt(serv, net); if (!cma_xprt) return ERR_PTR(-ENOMEM); set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); - listen_id = rdma_create_id(net, rdma_listen_handler, cma_xprt, + listen_id = rdma_create_id(net, svc_rdma_listen_handler, cma_xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(listen_id)) { ret = PTR_ERR(listen_id); - dprintk("svcrdma: rdma_create_id failed = %d\n", ret); goto err0; } @@ -340,23 +324,17 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, */ #if IS_ENABLED(CONFIG_IPV6) ret = rdma_set_afonly(listen_id, 1); - if (ret) { - dprintk("svcrdma: 
rdma_set_afonly failed = %d\n", ret); + if (ret) goto err1; - } #endif ret = rdma_bind_addr(listen_id, sa); - if (ret) { - dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); + if (ret) goto err1; - } cma_xprt->sc_cm_id = listen_id; ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); - if (ret) { - dprintk("svcrdma: rdma_listen failed = %d\n", ret); + if (ret) goto err1; - } /* * We need to use the address from the cm_id in case the @@ -412,9 +390,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (!newxprt) return NULL; - dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", - newxprt, newxprt->sc_cm_id); - dev = newxprt->sc_cm_id->device; newxprt->sc_port_num = newxprt->sc_cm_id->port_num; @@ -429,11 +404,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = svcrdma_max_requests; newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; - rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; + newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH; + rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + + newxprt->sc_recv_batch; if (rq_depth > dev->attrs.max_qp_wr) { pr_warn("svcrdma: reducing receive depth to %d\n", dev->attrs.max_qp_wr); rq_depth = dev->attrs.max_qp_wr; + newxprt->sc_recv_batch = 1; newxprt->sc_max_requests = rq_depth - 2; newxprt->sc_max_bc_requests = 2; } @@ -450,21 +428,17 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_pd = ib_alloc_pd(dev, 0); if (IS_ERR(newxprt->sc_pd)) { - dprintk("svcrdma: error creating PD for connect request\n"); + trace_svcrdma_pd_err(newxprt, PTR_ERR(newxprt->sc_pd)); goto errout; } newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth, IB_POLL_WORKQUEUE); - if (IS_ERR(newxprt->sc_sq_cq)) { - dprintk("svcrdma: error creating SQ CQ for connect request\n"); + if (IS_ERR(newxprt->sc_sq_cq)) goto errout; - } newxprt->sc_rq_cq = ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE); - if (IS_ERR(newxprt->sc_rq_cq)) { - dprintk("svcrdma: error creating RQ CQ for connect request\n"); + if (IS_ERR(newxprt->sc_rq_cq)) goto errout; - } memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; @@ -488,7 +462,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); if (ret) { - dprintk("svcrdma: failed to create QP, ret=%d\n", ret); + trace_svcrdma_qp_err(newxprt, ret); goto errout; } newxprt->sc_qp = newxprt->sc_cm_id->qp; @@ -496,15 +470,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) newxprt->sc_snd_w_inv = false; if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) && - !rdma_ib_or_roce(dev, newxprt->sc_port_num)) + !rdma_ib_or_roce(dev, newxprt->sc_port_num)) { + trace_svcrdma_fabric_err(newxprt, -EINVAL); goto errout; + } if (!svc_rdma_post_recvs(newxprt)) goto errout; - /* Swap out the handler */ - newxprt->sc_cm_id->event_handler = rdma_cma_handler; - /* Construct RDMA-CM private message */ pmsg.cp_magic = rpcrdma_cmp_magic; pmsg.cp_version = RPCRDMA_CMP_VERSION; @@ -519,15 +492,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) conn_param.initiator_depth = min_t(int, newxprt->sc_ord, dev->attrs.max_qp_init_rd_atom); if (!conn_param.initiator_depth) { - dprintk("svcrdma: invalid ORD setting\n"); ret = -EINVAL; + trace_svcrdma_initdepth_err(newxprt, ret); 
goto errout; } conn_param.private_data = &pmsg; conn_param.private_data_len = sizeof(pmsg); + rdma_lock_handler(newxprt->sc_cm_id); + newxprt->sc_cm_id->event_handler = svc_rdma_cma_handler; ret = rdma_accept(newxprt->sc_cm_id, &conn_param); - if (ret) + rdma_unlock_handler(newxprt->sc_cm_id); + if (ret) { + trace_svcrdma_accept_err(newxprt, ret); goto errout; + } #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) dprintk("svcrdma: new connection %p accepted:\n", newxprt); @@ -542,12 +520,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk(" ord : %d\n", conn_param.initiator_depth); #endif - trace_svcrdma_xprt_accept(&newxprt->sc_xprt); return &newxprt->sc_xprt; errout: - dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); - trace_svcrdma_xprt_fail(&newxprt->sc_xprt); /* Take a reference in case the DTO handler runs */ svc_xprt_get(&newxprt->sc_xprt); if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) @@ -558,28 +533,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) return NULL; } -static void svc_rdma_release_rqst(struct svc_rqst *rqstp) -{ -} - -/* - * When connected, an svc_xprt has at least two references: - * - * - A reference held by the cm_id between the ESTABLISHED and - * DISCONNECTED events. If the remote peer disconnected first, this - * reference could be gone. - * - * - A reference held by the svc_recv code that called this function - * as part of close processing. - * - * At a minimum one references should still be held. - */ static void svc_rdma_detach(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - /* Disconnect and flush posted WQE */ rdma_disconnect(rdma->sc_cm_id); } @@ -587,21 +545,13 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - struct svc_xprt *xprt = &rdma->sc_xprt; - - trace_svcrdma_xprt_free(xprt); + /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); svc_rdma_flush_recv_queues(rdma); - /* Final put of backchannel client transport */ - if (xprt->xpt_bc_xprt) { - xprt_put(xprt->xpt_bc_xprt); - xprt->xpt_bc_xprt = NULL; - } - svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_send_ctxts_destroy(rdma); svc_rdma_recv_ctxts_destroy(rdma); @@ -652,7 +602,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) static void svc_rdma_secure_port(struct svc_rqst *rqstp) { - set_bit(RQ_SECURE, &rqstp->rq_flags); + __set_bit(RQ_SECURE, &rqstp->rq_flags); } static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 3cfeba68ee9a..10bb2b929c6d 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -60,19 +60,16 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - /* * tunables */ -unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; +static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; int xprt_rdma_pad_optimize; +static struct xprt_class xprt_rdma; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -238,26 +235,35 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpcrdma_xprt *r_xprt = container_of(work, struct 
rpcrdma_xprt, rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; + unsigned int pflags = current->flags; int rc; - rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (atomic_read(&xprt->swapper)) + current->flags |= PF_MEMALLOC; + rc = rpcrdma_xprt_connect(r_xprt); xprt_clear_connecting(xprt); - if (r_xprt->rx_ep.rep_connected > 0) { + if (!rc) { + xprt->connect_cookie++; xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; xprt_set_connected(xprt); rc = -EAGAIN; - } + } else + rpcrdma_xprt_disconnect(r_xprt); + xprt_unlock_connect(xprt, r_xprt); xprt_wake_pending_tasks(xprt, rc); + current_restore_flags(pflags, PF_MEMALLOC); } /** * xprt_rdma_inject_disconnect - inject a connection fault * @xprt: transport context * - * If @xprt is connected, disconnect it to simulate spurious connection - * loss. + * If @xprt is connected, disconnect it to simulate spurious + * connection loss. Caller must hold @xprt's send lock to + * ensure that data structures and hardware resources are + * stable during the rdma_disconnect() call. */ static void xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) @@ -265,7 +271,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_op_inject_dsc(r_xprt); - rdma_disconnect(r_xprt->rx_ia.ri_id); + rdma_disconnect(r_xprt->rx_ep->re_id); } /** @@ -280,13 +286,10 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - trace_xprtrdma_op_destroy(r_xprt); - cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - rpcrdma_ep_destroy(r_xprt); + rpcrdma_xprt_disconnect(r_xprt); rpcrdma_buffer_destroy(&r_xprt->rx_buf); - rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); xprt_free(xprt); @@ -316,10 +319,15 @@ xprt_setup_rdma(struct xprt_create *args) if (args->addrlen > sizeof(xprt->addr)) return ERR_PTR(-EBADF); + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-EIO); + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, xprt_rdma_slot_table_entries); - if (!xprt) + if (!xprt) { + module_put(THIS_MODULE); return ERR_PTR(-ENOMEM); + } xprt->timeout = &xprt_rdma_default_timeout; xprt->connect_timeout = xprt->timeout->to_initval; @@ -339,6 +347,7 @@ xprt_setup_rdma(struct xprt_create *args) /* Ensure xprt->addr holds valid server TCP (not RDMA) * address, for any side protocols which peek at it */ xprt->prot = IPPROTO_TCP; + xprt->xprt_class = &xprt_rdma; xprt->addrlen = args->addrlen; memcpy(&xprt->addr, sap, xprt->addrlen); @@ -347,43 +356,20 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_format_addresses(xprt, sap); new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt); - if (rc) - goto out1; - - rc = rpcrdma_ep_create(new_xprt); - if (rc) - goto out2; - rc = rpcrdma_buffer_create(new_xprt); - if (rc) - goto out3; - - if (!try_module_get(THIS_MODULE)) - goto out4; + if (rc) { + xprt_rdma_free_addresses(xprt); + xprt_free(xprt); + module_put(THIS_MODULE); + return ERR_PTR(rc); + } INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); + xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; - dprintk("RPC: %s: %s:%s\n", __func__, - xprt->address_strings[RPC_DISPLAY_ADDR], - xprt->address_strings[RPC_DISPLAY_PORT]); - trace_xprtrdma_create(new_xprt); return xprt; - -out4: - rpcrdma_buffer_destroy(&new_xprt->rx_buf); - rc = -ENODEV; -out3: - rpcrdma_ep_destroy(new_xprt); -out2: - rpcrdma_ia_close(&new_xprt->rx_ia); -out1: - 
trace_xprtrdma_op_destroy(new_xprt); - xprt_rdma_free_addresses(xprt); - xprt_free(xprt); - return ERR_PTR(rc); } /** @@ -398,26 +384,9 @@ out1: void xprt_rdma_close(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - might_sleep(); - trace_xprtrdma_op_close(r_xprt); + rpcrdma_xprt_disconnect(r_xprt); - /* Prevent marshaling and sending of new requests */ - xprt_clear_connected(xprt); - - if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) { - rpcrdma_ia_remove(ia); - goto out; - } - - if (ep->rep_connected == -ENODEV) - return; - rpcrdma_ep_disconnect(ep, ia); - -out: xprt->reestablish_timeout = 0; ++xprt->connect_cookie; xprt_disconnect_done(xprt); @@ -445,9 +414,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); snprintf(buf, sizeof(buf), "%4hx", port); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); - - trace_xprtrdma_op_setport(container_of(xprt, struct rpcrdma_xprt, - rx_xprt)); } /** @@ -517,16 +483,18 @@ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned long delay; + WARN_ON_ONCE(!xprt_lock_connect(xprt, task, r_xprt)); + delay = 0; - if (r_xprt->rx_ep.rep_connected != 0) { + if (ep && ep->re_connect_status != 0) { delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } trace_xprtrdma_op_connect(r_xprt, delay); - queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker, - delay); + queue_delayed_work(system_long_wq, &r_xprt->rx_connect_worker, delay); } /** @@ -552,9 +520,8 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) return; out_sleep: - set_bit(XPRT_CONGESTED, &xprt->state); - rpc_sleep_on(&xprt->backlog, task, NULL); - task->tk_status = -EAGAIN; + task->tk_status = -ENOMEM; + xprt_add_backlog(xprt, task); } /** @@ -569,10 +536,11 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst) struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, rx_xprt); - memset(rqst, 0, sizeof(*rqst)); - rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); - if (unlikely(!rpc_wake_up_next(&xprt->backlog))) - clear_bit(XPRT_CONGESTED, &xprt->state); + rpcrdma_reply_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); + if (!xprt_wake_up_backlog(xprt, rqst)) { + memset(rqst, 0, sizeof(*rqst)); + rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); + } } static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt, @@ -602,11 +570,7 @@ xprt_rdma_allocate(struct rpc_task *task) struct rpc_rqst *rqst = task->tk_rqstp; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - gfp_t flags; - - flags = RPCRDMA_DEF_GFP; - if (RPC_IS_SWAPPER(task)) - flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; + gfp_t flags = rpc_task_gfp_mask(); if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize, flags)) @@ -617,11 +581,9 @@ xprt_rdma_allocate(struct rpc_task *task) rqst->rq_buffer = rdmab_data(req->rl_sendbuf); rqst->rq_rbuffer = rdmab_data(req->rl_recvbuf); - trace_xprtrdma_op_allocate(task, req); return 0; out_fail: - trace_xprtrdma_op_allocate(task, NULL); return -ENOMEM; } @@ -635,13 +597,12 @@ static void xprt_rdma_free(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; - struct rpcrdma_xprt *r_xprt = 
rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - trace_xprtrdma_op_free(task, req); - - if (!list_empty(&req->rl_registered)) - frwr_unmap_sync(r_xprt, req); + if (unlikely(!list_empty(&req->rl_registered))) { + trace_xprtrdma_mrs_zap(task); + frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req); + } /* XXX: If the RPC is completing because of a signal and * not because a reply was received, we ought to ensure @@ -694,7 +655,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst) goto drop_connection; rqst->rq_xtime = ktime_get(); - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + if (frwr_send(r_xprt, req)) goto drop_connection; rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; @@ -806,6 +767,7 @@ static struct xprt_class xprt_rdma = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_RDMA, .setup = xprt_setup_rdma, + .netid = { "rdma", "rdma6", "" }, }; void xprt_rdma_cleanup(void) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 353f61ac8d51..44b87e4274b4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -63,17 +63,6 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -/* - * Globals/Macros - */ - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - -/* - * internal functions - */ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt); static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, @@ -84,9 +73,10 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_ep_get(struct rpcrdma_ep *ep); +static int rpcrdma_ep_put(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * -rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, - gfp_t flags); +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction); static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); @@ -96,72 +86,86 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); */ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rdma_cm_id *id = ep->re_id; + + /* Wait for rpcrdma_post_recvs() to leave its critical + * section. + */ + if (atomic_inc_return(&ep->re_receiving) > 1) + wait_for_completion(&ep->re_done); /* Flush Receives, then wait for deferred Reply work * to complete. */ - ib_drain_rq(ia->ri_id->qp); + ib_drain_rq(id->qp); /* Deferred Reply processing might have scheduled * local invalidations. */ - ib_drain_sq(ia->ri_id->qp); + ib_drain_sq(id->qp); + + rpcrdma_ep_put(ep); +} + +/* Ensure xprt_force_disconnect() is invoked exactly once when a + * connection is closed or lost. (The important thing is it needs + * to be invoked "at least" once). 
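The helper below gets its exactly-once behavior from atomic_add_unless(&v, 1, 1), which increments only while the counter is still 0, so a single caller wins the race. A C11 user-space sketch of the same one-shot latch, assuming the counter only ever holds 0 or 1 as it does here:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int force_disconnect; /* 0 = not yet fired */

/* Equivalent of atomic_add_unless(&v, 1, 1) for a 0/1 counter:
 * succeeds exactly once. */
static int latch_once(atomic_int *v)
{
	int expected = 0;

	return atomic_compare_exchange_strong(v, &expected, 1);
}

static void force_disconnect_once(void)
{
	if (latch_once(&force_disconnect))
		printf("disconnect propagated to the RPC layer\n");
	else
		printf("already done, skipping\n");
}

int main(void)
{
	force_disconnect_once(); /* fires */
	force_disconnect_once(); /* no-op */
	return 0;
}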
+ */ +void rpcrdma_force_disconnect(struct rpcrdma_ep *ep) +{ + if (atomic_add_unless(&ep->re_force_disconnect, 1, 1)) + xprt_force_disconnect(ep->re_xprt); } /** - * rpcrdma_qp_event_handler - Handle one QP event (error notification) - * @event: details of the event - * @context: ep that owns QP where event occurred + * rpcrdma_flush_disconnect - Disconnect on flushed completion + * @r_xprt: transport to disconnect + * @wc: work completion entry * - * Called from the RDMA provider (device driver) possibly in an interrupt - * context. + * Must be called in process context. */ -static void -rpcrdma_qp_event_handler(struct ib_event *event, void *context) +void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc) { - struct rpcrdma_ep *ep = context; - struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, - rx_ep); - - trace_xprtrdma_qp_event(r_xprt, event); + if (wc->status != IB_WC_SUCCESS) + rpcrdma_force_disconnect(r_xprt->rx_ep); } /** * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: completion queue - * @wc: completed WR + * @wc: WCE for a completed Send WR * */ -static void -rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) +static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_sendctx *sc = container_of(cqe, struct rpcrdma_sendctx, sc_cqe); + struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_send(sc, wc); - rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc); + trace_xprtrdma_wc_send(wc, &sc->sc_cid); + rpcrdma_sendctx_put_locked(r_xprt, sc); + rpcrdma_flush_disconnect(r_xprt, wc); } /** * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed Receive WR * */ -static void -rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) +static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, rr_cqe); - struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_receive(wc); - --r_xprt->rx_ep.rep_receive_count; + trace_xprtrdma_wc_receive(wc, &rep->rr_cid); + --r_xprt->rx_ep->re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -178,35 +182,33 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) return; out_flushed: - rpcrdma_rep_destroy(rep); + rpcrdma_flush_disconnect(r_xprt, wc); + rpcrdma_rep_put(&r_xprt->rx_buf, rep); } -static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, +static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep, struct rdma_conn_param *param) { const struct rpcrdma_connect_private *pmsg = param->private_data; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ - r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; rsize = RPCRDMA_V1_DEF_INLINE_SIZE; wsize = RPCRDMA_V1_DEF_INLINE_SIZE; if (pmsg && pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { - r_xprt->rx_ia.ri_implicit_roundup = true; rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } - if (rsize < ep->rep_inline_recv) - ep->rep_inline_recv 
= rsize; - if (wsize < ep->rep_inline_send) - ep->rep_inline_send = wsize; + if (rsize < ep->re_inline_recv) + ep->re_inline_recv = rsize; + if (wsize < ep->re_inline_send) + ep->re_inline_send = wsize; - rpcrdma_set_max_header_sizes(r_xprt); + rpcrdma_set_max_header_sizes(ep); } /** @@ -220,116 +222,100 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct rpcrdma_xprt *r_xprt = id->context; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr; + struct rpcrdma_ep *ep = id->context; might_sleep(); - trace_xprtrdma_cm_event(r_xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: - ia->ri_async_rc = 0; - complete(&ia->ri_done); + ep->re_async_rc = 0; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_ADDR_ERROR: - ia->ri_async_rc = -EPROTO; - complete(&ia->ri_done); + ep->re_async_rc = -EPROTO; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_ROUTE_ERROR: - ia->ri_async_rc = -ENETUNREACH; - complete(&ia->ri_done); + ep->re_async_rc = -ENETUNREACH; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: removing device %s for %s:%s\n", - ia->ri_id->device->name, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); -#endif - init_completion(&ia->ri_remove_done); - set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); - ep->rep_connected = -ENODEV; - xprt_force_disconnect(xprt); - wait_for_completion(&ia->ri_remove_done); - - ia->ri_id = NULL; - /* Return 1 to ensure the core destroys the id. 
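To recap the rpcrdma_update_cm_private() hunk above: inline thresholds may only shrink, since the peer's advertised receive size caps what this side can send inline, and vice versa. A small sketch of that clamping, with a made-up private-message struct in place of rpcrdma_connect_private:

#include <stdio.h>

/* Illustrative stand-in for the CM private message. */
struct cm_private {
	unsigned int send_size; /* peer's max inline send */
	unsigned int recv_size; /* peer's max inline receive */
};

struct endpoint {
	unsigned int inline_send;
	unsigned int inline_recv;
};

static void update_from_private(struct endpoint *ep,
				const struct cm_private *pmsg)
{
	/* Thresholds may only shrink: honor the smaller of the local
	 * default and what the peer advertised. */
	if (pmsg->send_size < ep->inline_recv)
		ep->inline_recv = pmsg->send_size;
	if (pmsg->recv_size < ep->inline_send)
		ep->inline_send = pmsg->recv_size;
}

int main(void)
{
	struct endpoint ep = { 1024, 1024 };
	struct cm_private pmsg = { 4096, 512 };

	update_from_private(&ep, &pmsg);
	printf("inline send %u, inline recv %u\n",
	       ep.inline_send, ep.inline_recv);
	return 0;
}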
*/ - return 1; + pr_info("rpcrdma: removing device %s for %pISpc\n", + ep->re_id->device->name, sap); + fallthrough; + case RDMA_CM_EVENT_ADDR_CHANGE: + ep->re_connect_status = -ENODEV; + goto disconnected; case RDMA_CM_EVENT_ESTABLISHED: - ++xprt->connect_cookie; - ep->rep_connected = 1; - rpcrdma_update_cm_private(r_xprt, &event->param.conn); - trace_xprtrdma_inline_thresh(r_xprt); - wake_up_all(&ep->rep_connect_wait); + rpcrdma_ep_get(ep); + ep->re_connect_status = 1; + rpcrdma_update_cm_private(ep, &event->param.conn); + trace_xprtrdma_inline_thresh(ep); + wake_up_all(&ep->re_connect_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: - ep->rep_connected = -ENOTCONN; - goto disconnected; + ep->re_connect_status = -ENOTCONN; + goto wake_connect_worker; case RDMA_CM_EVENT_UNREACHABLE: - ep->rep_connected = -ENETUNREACH; - goto disconnected; + ep->re_connect_status = -ENETUNREACH; + goto wake_connect_worker; case RDMA_CM_EVENT_REJECTED: - dprintk("rpcrdma: connection to %s:%s rejected: %s\n", - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - rdma_reject_msg(id, event->status)); - ep->rep_connected = -ECONNREFUSED; + ep->re_connect_status = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) - ep->rep_connected = -EAGAIN; - goto disconnected; + ep->re_connect_status = -ENOTCONN; +wake_connect_worker: + wake_up_all(&ep->re_connect_wait); + return 0; case RDMA_CM_EVENT_DISCONNECTED: - ep->rep_connected = -ECONNABORTED; + ep->re_connect_status = -ECONNABORTED; disconnected: - xprt_force_disconnect(xprt); - wake_up_all(&ep->rep_connect_wait); - break; + rpcrdma_force_disconnect(ep); + return rpcrdma_ep_put(ep); default: break; } - dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - ia->ri_id->device->name, rdma_event_msg(event->event)); return 0; } -static struct rdma_cm_id * -rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) +static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_ep *ep) { unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rdma_cm_id *id; int rc; - init_completion(&ia->ri_done); + init_completion(&ep->re_done); - id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, - xprt, RDMA_PS_TCP, IB_QPT_RC); + id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) return id; - ia->ri_async_rc = -ETIMEDOUT; - rc = rdma_resolve_addr(id, NULL, - (struct sockaddr *)&xprt->rx_xprt.addr, + ep->re_async_rc = -ETIMEDOUT; + rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr, RDMA_RESOLVE_TIMEOUT); if (rc) goto out; - rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); if (rc < 0) goto out; - rc = ia->ri_async_rc; + rc = ep->re_async_rc; if (rc) goto out; - ia->ri_async_rc = -ETIMEDOUT; + ep->re_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); if (rc) goto out; - rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); if (rc < 0) goto out; - rc = ia->ri_async_rc; + rc = ep->re_async_rc; if (rc) goto out; @@ -340,423 +326,248 @@ out: return ERR_PTR(rc); } -/* - * Exported functions. - */ - -/** - * rpcrdma_ia_open - Open and initialize an Interface Adapter. 
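The connect-side handler above folds each CM event into a signed connect status (1 for connected, a negative errno otherwise) and wakes the thread sleeping in the connect path. A compact user-space model of that mapping, using a pthread condition variable where the kernel uses a wait queue:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

enum cm_event { EV_ESTABLISHED, EV_UNREACHABLE, EV_REJECTED, EV_DISCONNECTED };

struct endpoint {
	pthread_mutex_t lock;
	pthread_cond_t connect_wait;
	int connect_status; /* 0 = pending, 1 = connected, <0 = errno */
};

static void cm_handler(struct endpoint *ep, enum cm_event ev)
{
	pthread_mutex_lock(&ep->lock);
	switch (ev) {
	case EV_ESTABLISHED:  ep->connect_status = 1; break;
	case EV_UNREACHABLE:  ep->connect_status = -ENETUNREACH; break;
	case EV_REJECTED:     ep->connect_status = -ECONNREFUSED; break;
	case EV_DISCONNECTED: ep->connect_status = -ECONNABORTED; break;
	}
	pthread_cond_broadcast(&ep->connect_wait); /* like wake_up_all() */
	pthread_mutex_unlock(&ep->lock);
}

int main(void)
{
	struct endpoint ep = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};

	cm_handler(&ep, EV_REJECTED);
	printf("connect_status = %d\n", ep.connect_status);
	return 0;
}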
- * @xprt: transport with IA to (re)initialize - * - * Returns 0 on success, negative errno if an appropriate - * Interface Adapter could not be found and opened. - */ -int -rpcrdma_ia_open(struct rpcrdma_xprt *xprt) +static void rpcrdma_ep_destroy(struct kref *kref) { - struct rpcrdma_ia *ia = &xprt->rx_ia; - int rc; + struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref); - ia->ri_id = rpcrdma_create_id(xprt, ia); - if (IS_ERR(ia->ri_id)) { - rc = PTR_ERR(ia->ri_id); - goto out_err; + if (ep->re_id->qp) { + rdma_destroy_qp(ep->re_id); + ep->re_id->qp = NULL; } - ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0); - if (IS_ERR(ia->ri_pd)) { - rc = PTR_ERR(ia->ri_pd); - pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); - goto out_err; - } + if (ep->re_attr.recv_cq) + ib_free_cq(ep->re_attr.recv_cq); + ep->re_attr.recv_cq = NULL; + if (ep->re_attr.send_cq) + ib_free_cq(ep->re_attr.send_cq); + ep->re_attr.send_cq = NULL; - return 0; + if (ep->re_pd) + ib_dealloc_pd(ep->re_pd); + ep->re_pd = NULL; -out_err: - rpcrdma_ia_close(ia); - return rc; + kfree(ep); + module_put(THIS_MODULE); } -/** - * rpcrdma_ia_remove - Handle device driver unload - * @ia: interface adapter being removed - * - * Divest transport H/W resources associated with this adapter, - * but allow it to be restored later. - * - * Caller must hold the transport send lock. - */ -void -rpcrdma_ia_remove(struct rpcrdma_ia *ia) +static noinline void rpcrdma_ep_get(struct rpcrdma_ep *ep) { - struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, - rx_ia); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - - /* This is similar to rpcrdma_ep_destroy, but: - * - Don't cancel the connect worker. - * - Don't call rpcrdma_ep_disconnect, which waits - * for another conn upcall, which will deadlock. - * - rdma_disconnect is unneeded, the underlying - * connection is already gone. - */ - if (ia->ri_id->qp) { - rpcrdma_xprt_drain(r_xprt); - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - ib_free_cq(ep->rep_attr.recv_cq); - ep->rep_attr.recv_cq = NULL; - ib_free_cq(ep->rep_attr.send_cq); - ep->rep_attr.send_cq = NULL; - - /* The ULP is responsible for ensuring all DMA - * mappings and MRs are gone. - */ - rpcrdma_reps_unmap(r_xprt); - rpcrdma_reqs_reset(r_xprt); - rpcrdma_mrs_destroy(r_xprt); - rpcrdma_sendctxs_destroy(r_xprt); - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; - - /* Allow waiters to continue */ - complete(&ia->ri_remove_done); - - trace_xprtrdma_remove(r_xprt); + kref_get(&ep->re_kref); } -/** - * rpcrdma_ia_close - Clean up/close an IA. - * @ia: interface adapter to close - * +/* Returns: + * %0 if @ep still has a positive kref count, or + * %1 if @ep was destroyed successfully. */ -void -rpcrdma_ia_close(struct rpcrdma_ia *ia) +static noinline int rpcrdma_ep_put(struct rpcrdma_ep *ep) { - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { - if (ia->ri_id->qp) - rdma_destroy_qp(ia->ri_id); - rdma_destroy_id(ia->ri_id); - } - ia->ri_id = NULL; - - /* If the pd is still busy, xprtrdma missed freeing a resource */ - if (ia->ri_pd && !IS_ERR(ia->ri_pd)) - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; + return kref_put(&ep->re_kref, rpcrdma_ep_destroy); } -/** - * rpcrdma_ep_create - Create unconnected endpoint - * @r_xprt: transport to instantiate - * - * Returns zero on success, or a negative errno. 
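The replacement for the old IA open/close pair is the kref-managed endpoint above: the final rpcrdma_ep_put() invokes rpcrdma_ep_destroy(). A user-space sketch of the same reference-counting pattern in C11 atomics (kref semantics only, not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct endpoint {
	atomic_int refcount;
	/* ... queue pair, CQs, and PD would live here ... */
};

static struct endpoint *ep_create(void)
{
	struct endpoint *ep = calloc(1, sizeof(*ep));

	if (ep)
		atomic_init(&ep->refcount, 1); /* creator's reference */
	return ep;
}

static void ep_get(struct endpoint *ep)
{
	atomic_fetch_add(&ep->refcount, 1);
}

/* Returns 1 if this put destroyed the endpoint, 0 otherwise. */
static int ep_put(struct endpoint *ep)
{
	if (atomic_fetch_sub(&ep->refcount, 1) == 1) {
		/* release: tear down hardware resources, then free */
		free(ep);
		return 1;
	}
	return 0;
}

int main(void)
{
	struct endpoint *ep = ep_create();

	ep_get(ep);                             /* e.g. outstanding Receives */
	printf("first put: %d\n", ep_put(ep));  /* 0: still referenced */
	printf("last put:  %d\n", ep_put(ep));  /* 1: destroyed */
	return 0;
}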
- */ -int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) +static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; - struct ib_cq *sendcq, *recvcq; + struct rpcrdma_connect_private *pmsg; + struct ib_device *device; + struct rdma_cm_id *id; + struct rpcrdma_ep *ep; int rc; - ep->rep_max_requests = r_xprt->rx_xprt.max_reqs; - ep->rep_inline_send = xprt_rdma_max_inline_write; - ep->rep_inline_recv = xprt_rdma_max_inline_read; + ep = kzalloc(sizeof(*ep), XPRTRDMA_GFP_FLAGS); + if (!ep) + return -ENOTCONN; + ep->re_xprt = &r_xprt->rx_xprt; + kref_init(&ep->re_kref); - rc = frwr_query_device(r_xprt, ia->ri_id->device); - if (rc) - return rc; - r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests); - - ep->rep_attr.event_handler = rpcrdma_qp_event_handler; - ep->rep_attr.qp_context = ep; - ep->rep_attr.srq = NULL; - ep->rep_attr.cap.max_inline_data = 0; - ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; - ep->rep_attr.qp_type = IB_QPT_RC; - ep->rep_attr.port_num = ~0; - - dprintk("RPC: %s: requested max: dtos: send %d recv %d; " - "iovs: send %d recv %d\n", - __func__, - ep->rep_attr.cap.max_send_wr, - ep->rep_attr.cap.max_recv_wr, - ep->rep_attr.cap.max_send_sge, - ep->rep_attr.cap.max_recv_sge); - - ep->rep_send_batch = ep->rep_max_requests >> 3; - ep->rep_send_count = ep->rep_send_batch; - init_waitqueue_head(&ep->rep_connect_wait); - ep->rep_receive_count = 0; - - sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt, - ep->rep_attr.cap.max_send_wr + 1, - IB_POLL_WORKQUEUE); - if (IS_ERR(sendcq)) { - rc = PTR_ERR(sendcq); - goto out1; + id = rpcrdma_create_id(r_xprt, ep); + if (IS_ERR(id)) { + kfree(ep); + return PTR_ERR(id); } + __module_get(THIS_MODULE); + device = id->device; + ep->re_id = id; + reinit_completion(&ep->re_done); + + ep->re_max_requests = r_xprt->rx_xprt.max_reqs; + ep->re_inline_send = xprt_rdma_max_inline_write; + ep->re_inline_recv = xprt_rdma_max_inline_read; + rc = frwr_query_device(ep, device); + if (rc) + goto out_destroy; - recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL, - ep->rep_attr.cap.max_recv_wr + 1, - IB_POLL_WORKQUEUE); - if (IS_ERR(recvcq)) { - rc = PTR_ERR(recvcq); - goto out2; + r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests); + + ep->re_attr.srq = NULL; + ep->re_attr.cap.max_inline_data = 0; + ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + ep->re_attr.qp_type = IB_QPT_RC; + ep->re_attr.port_num = ~0; + + ep->re_send_batch = ep->re_max_requests >> 3; + ep->re_send_count = ep->re_send_batch; + init_waitqueue_head(&ep->re_connect_wait); + + ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt, + ep->re_attr.cap.max_send_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->re_attr.send_cq)) { + rc = PTR_ERR(ep->re_attr.send_cq); + ep->re_attr.send_cq = NULL; + goto out_destroy; } - ep->rep_attr.send_cq = sendcq; - ep->rep_attr.recv_cq = recvcq; + ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt, + ep->re_attr.cap.max_recv_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->re_attr.recv_cq)) { + rc = PTR_ERR(ep->re_attr.recv_cq); + ep->re_attr.recv_cq = NULL; + goto out_destroy; + } + ep->re_receive_count = 0; /* Initialize cma parameters */ - memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); + memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma)); /* Prepare RDMA-CM private message */ + pmsg = &ep->re_cm_private; pmsg->cp_magic = rpcrdma_cmp_magic; pmsg->cp_version = RPCRDMA_CMP_VERSION; 
pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; - pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send); - pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv); - ep->rep_remote_cma.private_data = pmsg; - ep->rep_remote_cma.private_data_len = sizeof(*pmsg); + pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv); + ep->re_remote_cma.private_data = pmsg; + ep->re_remote_cma.private_data_len = sizeof(*pmsg); /* Client offers RDMA Read but does not initiate */ - ep->rep_remote_cma.initiator_depth = 0; - ep->rep_remote_cma.responder_resources = - min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom); + ep->re_remote_cma.initiator_depth = 0; + ep->re_remote_cma.responder_resources = + min_t(int, U8_MAX, device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. RPC layer handles re-establishing * transport connection and retransmission. */ - ep->rep_remote_cma.retry_count = 6; + ep->re_remote_cma.retry_count = 6; /* RPC-over-RDMA handles its own flow control. In addition, * make all RNR NAKs visible so we know that RPC-over-RDMA * flow control is working correctly (no NAKs should be seen). */ - ep->rep_remote_cma.flow_control = 0; - ep->rep_remote_cma.rnr_retry_count = 0; - - return 0; - -out2: - ib_free_cq(sendcq); -out1: - return rc; -} - -/** - * rpcrdma_ep_destroy - Disconnect and destroy endpoint. - * @r_xprt: transport instance to shut down - * - */ -void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - if (ia->ri_id && ia->ri_id->qp) { - rpcrdma_ep_disconnect(ep, ia); - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - - if (ep->rep_attr.recv_cq) - ib_free_cq(ep->rep_attr.recv_cq); - if (ep->rep_attr.send_cq) - ib_free_cq(ep->rep_attr.send_cq); -} - -/* Re-establish a connection after a device removal event. - * Unlike a normal reconnection, a fresh PD and a new set - * of MRs and buffers is needed. - */ -static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, - struct ib_qp_init_attr *qp_init_attr) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - int rc, err; - - trace_xprtrdma_reinsert(r_xprt); - - rc = -EHOSTUNREACH; - if (rpcrdma_ia_open(r_xprt)) - goto out1; - - rc = -ENOMEM; - err = rpcrdma_ep_create(r_xprt); - if (err) { - pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); - goto out2; - } - memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr)); - - rc = -ENETUNREACH; - err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr); - if (err) { - pr_err("rpcrdma: rdma_create_qp returned %d\n", err); - goto out3; - } - return 0; - -out3: - rpcrdma_ep_destroy(r_xprt); -out2: - rpcrdma_ia_close(ia); -out1: - return rc; -} - -static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, - struct ib_qp_init_attr *qp_init_attr) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rdma_cm_id *id, *old; - int err, rc; - - rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia); + ep->re_remote_cma.flow_control = 0; + ep->re_remote_cma.rnr_retry_count = 0; - rc = -EHOSTUNREACH; - id = rpcrdma_create_id(r_xprt, ia); - if (IS_ERR(id)) - goto out; - - /* As long as the new ID points to the same device as the - * old ID, we can reuse the transport's existing PD and all - * previously allocated MRs. Also, the same device means - * the transport's previous DMA mappings are still valid. 
- * - * This is a sanity check only. There should be no way these - * point to two different devices here. - */ - old = id; - rc = -ENETUNREACH; - if (ia->ri_id->device != id->device) { - pr_err("rpcrdma: can't reconnect on different device!\n"); + ep->re_pd = ib_alloc_pd(device, 0); + if (IS_ERR(ep->re_pd)) { + rc = PTR_ERR(ep->re_pd); + ep->re_pd = NULL; goto out_destroy; } - err = rdma_create_qp(id, ia->ri_pd, qp_init_attr); - if (err) + rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr); + if (rc) goto out_destroy; - /* Atomically replace the transport's ID and QP. */ - rc = 0; - old = ia->ri_id; - ia->ri_id = id; - rdma_destroy_qp(old); + r_xprt->rx_ep = ep; + return 0; out_destroy: - rdma_destroy_id(old); -out: + rpcrdma_ep_put(ep); + rdma_destroy_id(id); return rc; } -/* - * Connect unconnected endpoint. +/** + * rpcrdma_xprt_connect - Connect an unconnected transport + * @r_xprt: controlling transport instance + * + * Returns 0 on success or a negative errno. */ -int -rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, - rx_ia); struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct ib_qp_init_attr qp_init_attr; + struct rpcrdma_ep *ep; int rc; -retry: - memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr)); - switch (ep->rep_connected) { - case 0: - rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr); - if (rc) { - rc = -ENETUNREACH; - goto out_noupdate; - } - break; - case -ENODEV: - rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr); - if (rc) - goto out_noupdate; - break; - default: - rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr); - if (rc) - goto out; - } + rc = rpcrdma_ep_create(r_xprt); + if (rc) + return rc; + ep = r_xprt->rx_ep; - ep->rep_connected = 0; xprt_clear_connected(xprt); - rpcrdma_reset_cwnd(r_xprt); - rpcrdma_post_recvs(r_xprt, true); - rc = rpcrdma_sendctxs_create(r_xprt); - if (rc) - goto out; + /* Bump the ep's reference count while there are + * outstanding Receives. + */ + rpcrdma_ep_get(ep); + rpcrdma_post_recvs(r_xprt, 1, true); - rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); + rc = rdma_connect(ep->re_id, &ep->re_remote_cma); if (rc) goto out; if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; - wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); - if (ep->rep_connected <= 0) { - if (ep->rep_connected == -EAGAIN) - goto retry; - rc = ep->rep_connected; + wait_event_interruptible(ep->re_connect_wait, + ep->re_connect_status != 0); + if (ep->re_connect_status <= 0) { + rc = ep->re_connect_status; + goto out; + } + + rc = rpcrdma_sendctxs_create(r_xprt); + if (rc) { + rc = -ENOTCONN; goto out; } rc = rpcrdma_reqs_setup(r_xprt); if (rc) { - rpcrdma_ep_disconnect(ep, ia); + rc = -ENOTCONN; goto out; } rpcrdma_mrs_create(r_xprt); + frwr_wp_create(r_xprt); out: - if (rc) - ep->rep_connected = rc; - -out_noupdate: trace_xprtrdma_connect(r_xprt, rc); return rc; } /** - * rpcrdma_ep_disconnect - Disconnect underlying transport - * @ep: endpoint to disconnect - * @ia: associated interface adapter + * rpcrdma_xprt_disconnect - Disconnect underlying transport + * @r_xprt: controlling transport instance * * Caller serializes. Either the transport send lock is held, * or we're being called to destroy the transport. + * + * On return, @r_xprt is completely divested of all hardware + * resources and prepared for the next ->connect operation. 
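The ordering implied by the kdoc above matters: disconnect first so no new completions arrive, drain the queue pair so already-posted work finishes, and only then unmap and free software state, destroying the cm_id last once the endpoint reference count permits. A skeletal sketch of that sequence, with stub helpers standing in for the real verbs and buffer calls in the function below:

#include <stdio.h>

/* Stub helpers; each one names the step it stands in for. */
static void cm_disconnect(void)    { puts("rdma_disconnect"); }
static void drain_qp(void)         { puts("drain send/recv queues"); }
static void unmap_receives(void)   { puts("DMA-unmap posted receive buffers"); }
static void reset_requests(void)   { puts("reset per-RPC send buffers"); }
static void destroy_mrs(void)      { puts("release memory regions"); }
static void destroy_sendctxs(void) { puts("free send contexts"); }
static int  ep_put_last(void)      { return 1; /* pretend last reference */ }
static void destroy_cm_id(void)    { puts("rdma_destroy_id"); }

static void xprt_disconnect(void)
{
	cm_disconnect();   /* stop the flow of new completions */
	drain_qp();        /* wait out everything already posted */
	unmap_receives();  /* now safe: the device no longer touches memory */
	reset_requests();
	destroy_mrs();
	destroy_sendctxs();
	if (ep_put_last()) /* drop the connect-time reference */
		destroy_cm_id();
}

int main(void)
{
	xprt_disconnect();
	return 0;
}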
*/ -void -rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, - rx_ep); + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rdma_cm_id *id; int rc; - /* returns without wait if ID is not connected */ - rc = rdma_disconnect(ia->ri_id); - if (!rc) - wait_event_interruptible(ep->rep_connect_wait, - ep->rep_connected != 1); - else - ep->rep_connected = rc; + if (!ep) + return; + + id = ep->re_id; + rc = rdma_disconnect(id); trace_xprtrdma_disconnect(r_xprt, rc); rpcrdma_xprt_drain(r_xprt); + rpcrdma_reps_unmap(r_xprt); rpcrdma_reqs_reset(r_xprt); rpcrdma_mrs_destroy(r_xprt); rpcrdma_sendctxs_destroy(r_xprt); + + if (rpcrdma_ep_put(ep)) + rdma_destroy_id(id); + + r_xprt->rx_ep = NULL; } /* Fixed-size circular FIFO queue. This implementation is wait-free and @@ -793,12 +604,15 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) { struct rpcrdma_sendctx *sc; - sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge), - GFP_KERNEL); + sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge), + XPRTRDMA_GFP_FLAGS); if (!sc) return NULL; sc->sc_cqe.done = rpcrdma_wc_send; + sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id; + sc->sc_cid.ci_completion_id = + atomic_inc_return(&ep->re_completion_ids); return sc; } @@ -813,14 +627,14 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * the ->send_request call to fail temporarily before too many * Sends are posted. */ - i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS; - buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); + i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS; + buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), XPRTRDMA_GFP_FLAGS); if (!buf->rb_sc_ctxs) return -ENOMEM; buf->rb_sc_last = i - 1; for (i = 0; i <= buf->rb_sc_last; i++) { - sc = rpcrdma_sendctx_create(&r_xprt->rx_ep); + sc = rpcrdma_sendctx_create(r_xprt->rx_ep); if (!sc) return -ENOMEM; @@ -924,25 +738,26 @@ static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct ib_device *device = ep->re_id->device; unsigned int count; - for (count = 0; count < ia->ri_max_rdma_segs; count++) { + /* Try to allocate enough to perform one full-sized I/O */ + for (count = 0; count < ep->re_max_rdma_segs; count++) { struct rpcrdma_mr *mr; int rc; - mr = kzalloc(sizeof(*mr), GFP_NOFS); + mr = kzalloc_node(sizeof(*mr), XPRTRDMA_GFP_FLAGS, + ibdev_to_node(device)); if (!mr) break; - rc = frwr_init_mr(ia, mr); + rc = frwr_mr_init(r_xprt, mr); if (rc) { kfree(mr); break; } - mr->mr_xprt = r_xprt; - spin_lock(&buf->rb_lock); rpcrdma_mr_push(mr, &buf->rb_mrs); list_add(&mr->mr_all, &buf->rb_all_mrs); @@ -973,43 +788,38 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; - /* If there is no underlying device, it's no use to - * wake the refresh worker. + /* If there is no underlying connection, it's no use + * to wake the refresh worker. */ - if (ep->rep_connected != -ENODEV) { - /* The work is scheduled on a WQ_MEM_RECLAIM - * workqueue in order to prevent MR allocation - * from recursing into NFS during direct reclaim. 
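The send contexts above sit in a fixed-size circular queue indexed by free-running counters, which is what keeps the get and put paths cheap. A minimal single-producer/single-consumer ring in C11 that illustrates the same wait-free shape (the kernel queue has its own, stricter concurrency rules):

#include <stdatomic.h>
#include <stdio.h>

#define QSIZE 8 /* power of two so indices can wrap with a mask */

struct sendctx_queue {
	void *slots[QSIZE];
	atomic_uint head; /* next slot the consumer will take */
	atomic_uint tail; /* next slot the producer will fill */
};

/* Single producer: returns 0 if the queue is full. */
static int q_put(struct sendctx_queue *q, void *sc)
{
	unsigned int tail = atomic_load_explicit(&q->tail,
						 memory_order_relaxed);

	if (tail - atomic_load_explicit(&q->head,
					memory_order_acquire) == QSIZE)
		return 0;
	q->slots[tail & (QSIZE - 1)] = sc;
	atomic_store_explicit(&q->tail, tail + 1, memory_order_release);
	return 1;
}

/* Single consumer: returns NULL if the queue is empty. */
static void *q_get(struct sendctx_queue *q)
{
	unsigned int head = atomic_load_explicit(&q->head,
						 memory_order_relaxed);
	void *sc;

	if (head == atomic_load_explicit(&q->tail, memory_order_acquire))
		return NULL;
	sc = q->slots[head & (QSIZE - 1)];
	atomic_store_explicit(&q->head, head + 1, memory_order_release);
	return sc;
}

int main(void)
{
	static struct sendctx_queue q;
	int a = 1, b = 2;

	q_put(&q, &a);
	q_put(&q, &b);
	printf("%d\n", *(int *)q_get(&q));
	printf("%d\n", *(int *)q_get(&q));
	return 0;
}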
- */ - queue_work(xprtiod_workqueue, &buf->rb_refresh_worker); - } + if (ep->re_connect_status != 1) + return; + queue_work(system_highpri_wq, &buf->rb_refresh_worker); } /** * rpcrdma_req_create - Allocate an rpcrdma_req object * @r_xprt: controlling r_xprt * @size: initial size, in bytes, of send and receive buffers - * @flags: GFP flags passed to memory allocators * * Returns an allocated and fully initialized rpcrdma_req or NULL. */ -struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, - gfp_t flags) +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, + size_t size) { struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_req *req; - req = kzalloc(sizeof(*req), flags); + req = kzalloc(sizeof(*req), XPRTRDMA_GFP_FLAGS); if (req == NULL) goto out1; - req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags); + req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE); if (!req->rl_sendbuf) goto out2; - req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags); + req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE); if (!req->rl_recvbuf) goto out3; @@ -1042,10 +852,10 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Compute maximum header buffer size in bytes */ maxhdrsize = rpcrdma_fixed_maxsz + 3 + - r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz; + r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz; maxhdrsize *= sizeof(__be32); rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), - DMA_TO_DEVICE, GFP_KERNEL); + DMA_TO_DEVICE); if (!rb) goto out; @@ -1091,6 +901,8 @@ static void rpcrdma_req_reset(struct rpcrdma_req *req) rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); + + frwr_reset(req); } /* ASSUMPTION: the rb_allreqs list is stable for the duration, @@ -1107,27 +919,28 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt) rpcrdma_req_reset(req); } -/* No locking needed here. This function is called only by the - * Receive completion handler. - */ static noinline struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) { + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - rep = kzalloc(sizeof(*rep), GFP_KERNEL); + rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS); if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv, - DMA_FROM_DEVICE, GFP_KERNEL); + rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv, + DMA_FROM_DEVICE); if (!rep->rr_rdmabuf) goto out_free; if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) goto out_free_regbuf; + rep->rr_cid.ci_completion_id = + atomic_inc_return(&r_xprt->rx_ep->re_completion_ids); + xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; @@ -1137,7 +950,10 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; rep->rr_temp = temp; - list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps); + + spin_lock(&buf->rb_lock); + list_add(&rep->rr_all, &buf->rb_all_reps); + spin_unlock(&buf->rb_lock); return rep; out_free_regbuf: @@ -1148,16 +964,23 @@ out: return NULL; } -/* No locking needed here. This function is invoked only by the - * Receive completion handler, or during transport shutdown. 
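Both the receive contexts above and the send contexts earlier are stamped with a completion ID drawn from an atomic counter on the endpoint, so trace events can tie a completion back to the work request that produced it. A tiny sketch of that tagging scheme, with invented field names:

#include <stdatomic.h>
#include <stdio.h>

struct completion_id {
	unsigned int queue_id;      /* which CQ this WR belongs to */
	unsigned int completion_id; /* unique per endpoint */
};

struct endpoint {
	atomic_uint next_completion_id;
};

static void cid_init(struct completion_id *cid, struct endpoint *ep,
		     unsigned int queue_id)
{
	cid->queue_id = queue_id;
	/* atomic_inc_return() analog: fetch-and-add, then report
	 * the incremented value. */
	cid->completion_id =
		atomic_fetch_add(&ep->next_completion_id, 1) + 1;
}

int main(void)
{
	struct endpoint ep = { 0 };
	struct completion_id recv_cid, send_cid;

	cid_init(&recv_cid, &ep, 7 /* recv CQ id, made up */);
	cid_init(&send_cid, &ep, 3 /* send CQ id, made up */);
	printf("recv cq %u cid %u, send cq %u cid %u\n",
	       recv_cid.queue_id, recv_cid.completion_id,
	       send_cid.queue_id, send_cid.completion_id);
	return 0;
}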
- */ -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +static void rpcrdma_rep_free(struct rpcrdma_rep *rep) { - list_del(&rep->rr_all); rpcrdma_regbuf_free(rep->rr_rdmabuf); kfree(rep); } +static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +{ + struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf; + + spin_lock(&buf->rb_lock); + list_del(&rep->rr_all); + spin_unlock(&buf->rb_lock); + + rpcrdma_rep_free(rep); +} + static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) { struct llist_node *node; @@ -1169,12 +992,21 @@ static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) return llist_entry(node, struct rpcrdma_rep, rr_node); } -static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, - struct rpcrdma_rep *rep) +/** + * rpcrdma_rep_put - Release rpcrdma_rep back to free list + * @buf: buffer pool + * @rep: rep to release + * + */ +void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep) { llist_add(&rep->rr_node, &buf->rb_free_reps); } +/* Caller must ensure the QP is quiescent (RQ is drained) before + * invoking this function, to guarantee rb_all_reps is not + * changing. + */ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; @@ -1182,7 +1014,7 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) list_for_each_entry(rep, &buf->rb_all_reps, rr_all) { rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); - rep->rr_temp = true; + rep->rr_temp = true; /* Mark this rep for destruction */ } } @@ -1190,8 +1022,18 @@ static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_rep *rep; - while ((rep = rpcrdma_rep_get_locked(buf)) != NULL) - rpcrdma_rep_destroy(rep); + spin_lock(&buf->rb_lock); + while ((rep = list_first_entry_or_null(&buf->rb_all_reps, + struct rpcrdma_rep, + rr_all)) != NULL) { + list_del(&rep->rr_all); + spin_unlock(&buf->rb_lock); + + rpcrdma_rep_free(rep); + + spin_lock(&buf->rb_lock); + } + spin_unlock(&buf->rb_lock); } /** @@ -1219,8 +1061,8 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) for (i = 0; i < r_xprt->rx_xprt.max_reqs; i++) { struct rpcrdma_req *req; - req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2, - GFP_KERNEL); + req = rpcrdma_req_create(r_xprt, + RPCRDMA_V1_DEF_INLINE_SIZE * 2); if (!req) goto out; list_add(&req->rl_list, &buf->rb_send_bufs); @@ -1254,7 +1096,7 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req) list_del(&mr->mr_all); spin_unlock(&buf->rb_lock); - frwr_release_mr(mr); + frwr_mr_release(mr); } rpcrdma_regbuf_free(req->rl_recvbuf); @@ -1285,7 +1127,7 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt) list_del(&mr->mr_all); spin_unlock(&buf->rb_lock); - frwr_release_mr(mr); + frwr_mr_release(mr); spin_lock(&buf->rb_lock); } @@ -1335,22 +1177,17 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) } /** - * rpcrdma_mr_put - DMA unmap an MR and release it - * @mr: MR to release + * rpcrdma_reply_put - Put reply buffers back into pool + * @buffers: buffer pool + * @req: object to return * */ -void rpcrdma_mr_put(struct rpcrdma_mr *mr) +void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) { - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; + if (req->rl_reply) { + rpcrdma_rep_put(buffers, req->rl_reply); + req->rl_reply = NULL; } - - rpcrdma_mr_push(mr, 
&mr->mr_req->rl_free_mrs); } /** @@ -1381,26 +1218,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) */ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) { - if (req->rl_reply) - rpcrdma_rep_put(buffers, req->rl_reply); - req->rl_reply = NULL; + rpcrdma_reply_put(buffers, req); spin_lock(&buffers->rb_lock); list_add(&req->rl_list, &buffers->rb_send_bufs); spin_unlock(&buffers->rb_lock); } -/** - * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list - * @rep: rep to release - * - * Used after error conditions. - */ -void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) -{ - rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep); -} - /* Returns a pointer to a rpcrdma_regbuf object, or NULL. * * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for @@ -1408,15 +1232,14 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * or Replies they may be registered externally via frwr_map. */ static struct rpcrdma_regbuf * -rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, - gfp_t flags) +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) { struct rpcrdma_regbuf *rb; - rb = kmalloc(sizeof(*rb), flags); + rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS); if (!rb) return NULL; - rb->rg_data = kmalloc(size, flags); + rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS); if (!rb->rg_data) { kfree(rb); return NULL; @@ -1463,7 +1286,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_regbuf *rb) { - struct ib_device *device = r_xprt->rx_ia.ri_id->device; + struct ib_device *device = r_xprt->rx_ep->re_id->device; if (rb->rg_direction == DMA_NONE) return false; @@ -1476,7 +1299,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, } rb->rg_device = device; - rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey; + rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey; return true; } @@ -1502,61 +1325,32 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) } /** - * rpcrdma_ep_post - Post WRs to a transport's Send Queue - * @ia: transport's device information - * @ep: transport's RDMA endpoint information - * @req: rpcrdma_req containing the Send WR to post - * - * Returns 0 if the post was successful, otherwise -ENOTCONN - * is returned. 
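
__rpcrdma_regbuf_dma_map() above now pulls the device and PD from the endpoint rather than the removed rpcrdma_ia. The core of mapping a plain kernel buffer for the HCA looks roughly like this (illustrative helper, not this file's API):

    #include <rdma/ib_verbs.h>

    /* DMA-map a buffer and describe it in an SGE; memory mapped this
     * way is covered by the PD's local DMA lkey. */
    static bool example_map_sge(struct ib_device *device, struct ib_pd *pd,
                                void *buf, u32 len, struct ib_sge *sge)
    {
            sge->addr = ib_dma_map_single(device, buf, len,
                                          DMA_BIDIRECTIONAL);
            if (ib_dma_mapping_error(device, sge->addr))
                    return false;
            sge->length = len;
            sge->lkey = pd->local_dma_lkey;
            return true;
    }
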
- */ -int -rpcrdma_ep_post(struct rpcrdma_ia *ia, - struct rpcrdma_ep *ep, - struct rpcrdma_req *req) -{ - struct ib_send_wr *send_wr = &req->rl_wr; - int rc; - - if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) { - send_wr->send_flags |= IB_SEND_SIGNALED; - ep->rep_send_count = ep->rep_send_batch; - } else { - send_wr->send_flags &= ~IB_SEND_SIGNALED; - --ep->rep_send_count; - } - - rc = frwr_send(ia, req); - trace_xprtrdma_post_send(req, rc); - if (rc) - return -ENOTCONN; - return 0; -} - -/** * rpcrdma_post_recvs - Refill the Receive Queue * @r_xprt: controlling transport instance - * @temp: mark Receive buffers to be deleted after use + * @needed: current credit grant + * @temp: mark Receive buffers to be deleted after one use * */ -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_recv_wr *wr, *bad_wr; struct rpcrdma_rep *rep; - int needed, count, rc; + int count, rc; rc = 0; count = 0; - needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); - if (likely(ep->rep_receive_count > needed)) + if (likely(ep->re_receive_count > needed)) goto out; - needed -= ep->rep_receive_count; + needed -= ep->re_receive_count; if (!temp) needed += RPCRDMA_MAX_RECV_BATCH; + if (atomic_inc_return(&ep->re_receiving) > 1) + goto out; + /* fast path: all needed reps can be found on the free list */ wr = NULL; while (needed) { @@ -1570,6 +1364,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!rep) break; + rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; trace_xprtrdma_post_recv(rep); rep->rr_recv_wr.next = wr; wr = &rep->rr_recv_wr; @@ -1579,20 +1374,24 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!wr) goto out; - rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, + rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); -out: - trace_xprtrdma_post_recvs(r_xprt, count, rc); if (rc) { + trace_xprtrdma_post_recvs_err(r_xprt, rc); for (wr = bad_wr; wr;) { struct rpcrdma_rep *rep; rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr); wr = wr->next; - rpcrdma_recv_buffer_put(rep); + rpcrdma_rep_put(buf, rep); --count; } } - ep->rep_receive_count += count; + if (atomic_dec_return(&ep->re_receiving) > 0) + complete(&ep->re_done); + +out: + trace_xprtrdma_post_recvs(r_xprt, count); + ep->re_receive_count += count; return; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 37d5080c250b..5e5ff6784ef5 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -53,6 +53,7 @@ #include <rdma/ib_verbs.h> /* RDMA verbs api */ #include <linux/sunrpc/clnt.h> /* rpc_xprt */ +#include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ @@ -65,48 +66,45 @@ #define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) /* - * Interface Adapter -- one per transport instance + * RDMA Endpoint -- connection endpoint details */ -struct rpcrdma_ia { - struct rdma_cm_id *ri_id; - struct ib_pd *ri_pd; - int ri_async_rc; - unsigned int ri_max_rdma_segs; - unsigned int ri_max_frwr_depth; - bool ri_implicit_roundup; - enum ib_mr_type ri_mrtype; - unsigned long ri_flags; - struct completion ri_done; - struct completion ri_remove_done; -}; - -enum { - 
RPCRDMA_IAF_REMOVING = 0, -}; - -/* - * RDMA Endpoint -- one per transport instance - */ - +struct rpcrdma_mr; struct rpcrdma_ep { - unsigned int rep_send_count; - unsigned int rep_send_batch; - unsigned int rep_max_inline_send; - unsigned int rep_max_inline_recv; - int rep_connected; - struct ib_qp_init_attr rep_attr; - wait_queue_head_t rep_connect_wait; - struct rpcrdma_connect_private rep_cm_private; - struct rdma_conn_param rep_remote_cma; - unsigned int rep_max_requests; /* depends on device */ - unsigned int rep_inline_send; /* negotiated */ - unsigned int rep_inline_recv; /* negotiated */ - int rep_receive_count; + struct kref re_kref; + struct rdma_cm_id *re_id; + struct ib_pd *re_pd; + unsigned int re_max_rdma_segs; + unsigned int re_max_fr_depth; + struct rpcrdma_mr *re_write_pad_mr; + enum ib_mr_type re_mrtype; + struct completion re_done; + unsigned int re_send_count; + unsigned int re_send_batch; + unsigned int re_max_inline_send; + unsigned int re_max_inline_recv; + int re_async_rc; + int re_connect_status; + atomic_t re_receiving; + atomic_t re_force_disconnect; + struct ib_qp_init_attr re_attr; + wait_queue_head_t re_connect_wait; + struct rpc_xprt *re_xprt; + struct rpcrdma_connect_private + re_cm_private; + struct rdma_conn_param re_remote_cma; + int re_receive_count; + unsigned int re_max_requests; /* depends on device */ + unsigned int re_inline_send; /* negotiated */ + unsigned int re_inline_recv; /* negotiated */ + + atomic_t re_completion_ids; + + char re_write_pad[XDR_UNIT]; }; -/* Pre-allocate extra Work Requests for handling backward receives - * and sends. This is a fixed value because the Work Queues are - * allocated when the forward channel is set up, long before the +/* Pre-allocate extra Work Requests for handling reverse-direction + * Receives and Sends. This is a fixed value because the Work Queues + * are allocated when the forward channel is set up, long before the * backchannel is provisioned. This value is two times * NFS4_DEF_CB_SLOT_TABLE_SIZE. */ @@ -151,7 +149,11 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb) return rb->rg_data; } -#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) +/* Do not use emergency memory reserves, and fail quickly if memory + * cannot be allocated easily. These flags may be used wherever there + * is robust logic to handle a failure to allocate. + */ +#define XPRTRDMA_GFP_FLAGS (__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) /* To ensure a transport can always make forward progress, * the number of RDMA segments allowed in header chunk lists @@ -189,6 +191,8 @@ enum { struct rpcrdma_rep { struct ib_cqe rr_cqe; + struct rpc_rdma_cid rr_cid; + __be32 rr_xid; __be32 rr_vers; __be32 rr_proc; @@ -220,6 +224,7 @@ enum { struct rpcrdma_req; struct rpcrdma_sendctx { struct ib_cqe sc_cqe; + struct rpc_rdma_cid sc_cid; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; @@ -231,29 +236,28 @@ struct rpcrdma_sendctx { * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). 
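
The XPRTRDMA_GFP_FLAGS comment above carries the key design note: these allocations may fail quickly and must never dip into emergency reserves, because every caller copes with NULL. A sketch of a caller under that contract (stub type and helper are hypothetical):

    #include <linux/gfp.h>
    #include <linux/list.h>
    #include <linux/slab.h>

    #define XPRTRDMA_GFP_FLAGS (__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN)

    struct mr_stub {
            struct list_head mr_list;
    };

    /* Fail fast under memory pressure; the transport simply posts
     * fewer resources now and lets a worker replenish them later. */
    static struct mr_stub *example_alloc_mr(void)
    {
            return kzalloc(sizeof(struct mr_stub), XPRTRDMA_GFP_FLAGS);
    }
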
*/ -struct rpcrdma_frwr { - struct ib_mr *fr_mr; - struct ib_cqe fr_cqe; - struct completion fr_linv_done; - union { - struct ib_reg_wr fr_regwr; - struct ib_send_wr fr_invwr; - }; -}; - struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; struct rpcrdma_req *mr_req; + + struct ib_mr *mr_ibmr; + struct ib_device *mr_device; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; - struct rpcrdma_frwr frwr; + struct ib_cqe mr_cqe; + struct completion mr_linv_done; + union { + struct ib_reg_wr mr_regwr; + struct ib_send_wr mr_invwr; + }; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; u32 mr_length; u64 mr_offset; struct list_head mr_all; + struct rpc_rdma_cid mr_cid; }; /* @@ -284,10 +288,11 @@ enum { RPCRDMA_MAX_IOV_SEGS, }; -struct rpcrdma_mr_seg { /* chunk descriptors */ - u32 mr_len; /* length of chunk or segment */ - struct page *mr_page; /* owning page, if any */ - char *mr_offset; /* kva if no page, else offset */ +/* Arguments for DMA mapping and registration */ +struct rpcrdma_mr_seg { + u32 mr_len; /* length of segment */ + struct page *mr_page; /* underlying struct page */ + u64 mr_offset; /* IN: page offset, OUT: iova */ }; /* The Send SGE array is provisioned to send a maximum size @@ -422,8 +427,7 @@ struct rpcrdma_stats { */ struct rpcrdma_xprt { struct rpc_xprt rx_xprt; - struct rpcrdma_ia rx_ia; - struct rpcrdma_ep rx_ep; + struct rpcrdma_ep *rx_ep; struct rpcrdma_buffer rx_buf; struct delayed_work rx_connect_worker; struct rpc_timeout rx_timeout; @@ -455,29 +459,20 @@ extern int xprt_rdma_pad_optimize; extern unsigned int xprt_rdma_memreg_strategy; /* - * Interface Adapter calls - xprtrdma/verbs.c - */ -int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); -void rpcrdma_ia_remove(struct rpcrdma_ia *ia); -void rpcrdma_ia_close(struct rpcrdma_ia *); - -/* * Endpoint calls - xprtrdma/verbs.c */ -int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt); -void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt); -int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_force_disconnect(struct rpcrdma_ep *ep); +void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc); +int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); +void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); -int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, - struct rpcrdma_req *); -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); /* * Buffer calls - xprtrdma/verbs.c */ -struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, - gfp_t flags); +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, + size_t size); int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); @@ -485,13 +480,13 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); -void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req); -void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); +void rpcrdma_rep_put(struct rpcrdma_buffer 
*buf, struct rpcrdma_rep *rep); +void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req); bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags); @@ -536,18 +531,18 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ void frwr_reset(struct rpcrdma_req *req); -int frwr_query_device(struct rpcrdma_xprt *r_xprt, - const struct ib_device *device); -int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); -void frwr_release_mr(struct rpcrdma_mr *mr); +int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device); +int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr); +void frwr_mr_release(struct rpcrdma_mr *mr); struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr); -int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req); +int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); +int frwr_wp_create(struct rpcrdma_xprt *r_xprt); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c @@ -569,9 +564,10 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype rtype); void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); -void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); +void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep); void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); +void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep); void rpcrdma_reply_handler(struct rpcrdma_rep *rep); static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d86c664ea6af..915b9902f673 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -54,9 +54,11 @@ #include <trace/events/sunrpc.h> +#include "socklib.h" #include "sunrpc.h" static void xs_close(struct rpc_xprt *xprt); +static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); @@ -90,6 +92,11 @@ static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; static struct ctl_table_header *sunrpc_table_header; +static struct xprt_class xs_local_transport; +static struct xprt_class xs_udp_transport; +static struct xprt_class xs_tcp_transport; +static struct xprt_class xs_bc_tcp_transport; + /* * FIXME: changing the UDP slot table size should also resize the UDP * socket buffers for existing UDP transports @@ -254,7 +261,7 @@ static void xs_format_common_peer_addresses(struct rpc_xprt *xprt) switch (sap->sa_family) { case AF_LOCAL: sun = xs_addr_un(xprt); - strlcpy(buf, sun->sun_path, sizeof(buf)); + strscpy(buf, sun->sun_path, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); break; @@ -421,9 +428,9 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, offset += want; } - want = xs_alloc_sparse_pages(buf, - min_t(size_t, count - offset, buf->page_len), - GFP_KERNEL); + want = xs_alloc_sparse_pages( + buf, min_t(size_t, count - offset, buf->page_len), + GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (seek < want) { ret = 
xs_read_bvec(sock, msg, flags, buf->bvec, xdr_buf_pagecount(buf), @@ -432,7 +439,8 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (ret <= 0) goto sock_err; xs_flush_bvec(buf->bvec, ret, seek + buf->page_base); - offset += ret - buf->page_base; + ret -= buf->page_base; + offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) @@ -495,8 +503,8 @@ xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, int flags, struct rpc_rqst *req) { struct xdr_buf *buf = &req->rq_private_buf; - size_t want, uninitialized_var(read); - ssize_t uninitialized_var(ret); + size_t want, read; + ssize_t ret; xs_read_header(transport, buf); @@ -556,6 +564,10 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) struct rpc_rqst *req; ssize_t ret; + /* Is this transport associated with the backchannel? */ + if (!xprt->bc_serv) + return -ESHUTDOWN; + /* Look up and lock the request corresponding to the given XID */ req = xprt_lookup_bc_request(xprt, transport->recv.xid); if (!req) { @@ -749,141 +761,19 @@ xs_stream_start_connect(struct sock_xprt *transport) #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) -static int xs_sendmsg(struct socket *sock, struct msghdr *msg, size_t seek) -{ - if (seek) - iov_iter_advance(&msg->msg_iter, seek); - return sock_sendmsg(sock, msg); -} - -static int xs_send_kvec(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t seek) -{ - iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len); - return xs_sendmsg(sock, msg, seek); -} - -static int xs_send_pagedata(struct socket *sock, struct msghdr *msg, struct xdr_buf *xdr, size_t base) -{ - int err; - - err = xdr_alloc_bvec(xdr, GFP_KERNEL); - if (err < 0) - return err; - - iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec, - xdr_buf_pagecount(xdr), - xdr->page_len + xdr->page_base); - return xs_sendmsg(sock, msg, base + xdr->page_base); -} - -#define xs_record_marker_len() sizeof(rpc_fraghdr) - -/* Common case: - * - stream transport - * - sending from byte 0 of the message - * - the message is wholly contained in @xdr's head iovec - */ -static int xs_send_rm_and_kvec(struct socket *sock, struct msghdr *msg, - rpc_fraghdr marker, struct kvec *vec, size_t base) -{ - struct kvec iov[2] = { - [0] = { - .iov_base = &marker, - .iov_len = sizeof(marker) - }, - [1] = *vec, - }; - size_t len = iov[0].iov_len + iov[1].iov_len; - - iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, len); - return xs_sendmsg(sock, msg, base); -} - -/** - * xs_sendpages - write pages directly to a socket - * @sock: socket to send on - * @addr: UDP only -- address of destination - * @addrlen: UDP only -- length of destination address - * @xdr: buffer containing this request - * @base: starting position in the buffer - * @rm: stream record marker field - * @sent_p: return the total number of bytes successfully queued for sending - * - */ -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, rpc_fraghdr rm, int *sent_p) -{ - struct msghdr msg = { - .msg_name = addr, - .msg_namelen = addrlen, - .msg_flags = XS_SENDMSG_FLAGS | MSG_MORE, - }; - unsigned int rmsize = rm ? 
sizeof(rm) : 0; - unsigned int remainder = rmsize + xdr->len - base; - unsigned int want; - int err = 0; - - if (unlikely(!sock)) - return -ENOTSOCK; - - want = xdr->head[0].iov_len + rmsize; - if (base < want) { - unsigned int len = want - base; - remainder -= len; - if (remainder == 0) - msg.msg_flags &= ~MSG_MORE; - if (rmsize) - err = xs_send_rm_and_kvec(sock, &msg, rm, - &xdr->head[0], base); - else - err = xs_send_kvec(sock, &msg, &xdr->head[0], base); - if (remainder == 0 || err != len) - goto out; - *sent_p += err; - base = 0; - } else - base -= want; - - if (base < xdr->page_len) { - unsigned int len = xdr->page_len - base; - remainder -= len; - if (remainder == 0) - msg.msg_flags &= ~MSG_MORE; - err = xs_send_pagedata(sock, &msg, xdr, base); - if (remainder == 0 || err != len) - goto out; - *sent_p += err; - base = 0; - } else - base -= xdr->page_len; - - if (base >= xdr->tail[0].iov_len) - return 0; - msg.msg_flags &= ~MSG_MORE; - err = xs_send_kvec(sock, &msg, &xdr->tail[0], base); -out: - if (err > 0) { - *sent_p += err; - err = 0; - } - return err; -} - /** * xs_nospace - handle transmit was incomplete * @req: pointer to RPC request + * @transport: pointer to struct sock_xprt * */ -static int xs_nospace(struct rpc_rqst *req) +static int xs_nospace(struct rpc_rqst *req, struct sock_xprt *transport) { - struct rpc_xprt *xprt = req->rq_xprt; - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct rpc_xprt *xprt = &transport->xprt; struct sock *sk = transport->inet; int ret = -EAGAIN; - dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", - req->rq_task->tk_pid, - req->rq_slen - transport->xmit.offset, - req->rq_slen); + trace_rpc_socket_nospace(req, transport); /* Protect against races with write_space */ spin_lock(&xprt->transport_lock); @@ -891,32 +781,50 @@ static int xs_nospace(struct rpc_rqst *req) /* Don't race with disconnect */ if (xprt_connected(xprt)) { /* wait for more buffer space */ + set_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; xprt_wait_for_buffer_space(xprt); } else ret = -ENOTCONN; spin_unlock(&xprt->transport_lock); + return ret; +} - /* Race breaker in case memory is freed before above code is called */ - if (ret == -EAGAIN) { - struct socket_wq *wq; +static int xs_sock_nospace(struct rpc_rqst *req) +{ + struct sock_xprt *transport = + container_of(req->rq_xprt, struct sock_xprt, xprt); + struct sock *sk = transport->inet; + int ret = -EAGAIN; - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags); - rcu_read_unlock(); + lock_sock(sk); + if (!sock_writeable(sk)) + ret = xs_nospace(req, transport); + release_sock(sk); + return ret; +} - sk->sk_write_space(sk); - } +static int xs_stream_nospace(struct rpc_rqst *req, bool vm_wait) +{ + struct sock_xprt *transport = + container_of(req->rq_xprt, struct sock_xprt, xprt); + struct sock *sk = transport->inet; + int ret = -EAGAIN; + + if (vm_wait) + return -ENOBUFS; + lock_sock(sk); + if (!sk_stream_memory_free(sk)) + ret = xs_nospace(req, transport); + release_sock(sk); return ret; } -static void -xs_stream_prepare_request(struct rpc_rqst *req) +static int xs_stream_prepare_request(struct rpc_rqst *req, struct xdr_buf *buf) { - xdr_free_bvec(&req->rq_rcv_buf); - req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_KERNEL); + return xdr_alloc_bvec(buf, rpc_task_gfp_mask()); } /* @@ -949,7 +857,7 @@ xs_stream_record_marker(struct xdr_buf *xdr) * EAGAIN: The 
socket was blocked, please call again later to * complete the request * ENOTCONN: Caller needs to invoke connect logic then call again - * other: Some other error occured, the request was not sent + * other: Some other error occurred, the request was not sent */ static int xs_local_send_request(struct rpc_rqst *req) { @@ -959,27 +867,30 @@ static int xs_local_send_request(struct rpc_rqst *req) struct xdr_buf *xdr = &req->rq_snd_buf; rpc_fraghdr rm = xs_stream_record_marker(xdr); unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen; + struct msghdr msg = { + .msg_flags = XS_SENDMSG_FLAGS, + }; + bool vm_wait; + unsigned int sent; int status; - int sent = 0; /* Close the stream if the previous transmission was incomplete */ if (xs_send_request_was_aborted(transport, req)) { - xs_close(xprt); + xprt_force_disconnect(xprt); return -ENOTCONN; } xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); + vm_wait = sk_stream_is_writeable(transport->inet) ? true : false; + req->rq_xtime = ktime_get(); - status = xs_sendpages(transport->sock, NULL, 0, xdr, - transport->xmit.offset, rm, &sent); + status = xprt_sock_sendmsg(transport->sock, &msg, xdr, + transport->xmit.offset, rm, &sent); dprintk("RPC: %s(%u) = %d\n", __func__, xdr->len - transport->xmit.offset, status); - if (status == -EAGAIN && sock_writeable(transport->inet)) - status = -ENOBUFS; - if (likely(sent > 0) || status == 0) { transport->xmit.offset += sent; req->rq_bytes_sent = transport->xmit.offset; @@ -989,20 +900,19 @@ static int xs_local_send_request(struct rpc_rqst *req) return 0; } status = -EAGAIN; + vm_wait = false; } switch (status) { - case -ENOBUFS: - break; case -EAGAIN: - status = xs_nospace(req); + status = xs_stream_nospace(req, vm_wait); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); - /* fall through */ + fallthrough; case -EPIPE: - xs_close(xprt); + xprt_force_disconnect(xprt); status = -ENOTCONN; } @@ -1025,7 +935,12 @@ static int xs_udp_send_request(struct rpc_rqst *req) struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; - int sent = 0; + struct msghdr msg = { + .msg_name = xs_addr(xprt), + .msg_namelen = xprt->addrlen, + .msg_flags = XS_SENDMSG_FLAGS, + }; + unsigned int sent; int status; xs_pktdump("packet data:", @@ -1038,9 +953,11 @@ static int xs_udp_send_request(struct rpc_rqst *req) if (!xprt_request_get_cong(xprt, req)) return -EBADSLT; + status = xdr_alloc_bvec(xdr, rpc_task_gfp_mask()); + if (status < 0) + return status; req->rq_xtime = ktime_get(); - status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, - xdr, 0, 0, &sent); + status = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, 0, &sent); dprintk("RPC: xs_udp_send_request(%u) = %d\n", xdr->len, status); @@ -1067,7 +984,7 @@ process_status: /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(req); + status = xs_sock_nospace(req); break; case -ENETUNREACH: case -ENOBUFS: @@ -1106,9 +1023,12 @@ static int xs_tcp_send_request(struct rpc_rqst *req) struct xdr_buf *xdr = &req->rq_snd_buf; rpc_fraghdr rm = xs_stream_record_marker(xdr); unsigned int msglen = rm ? 
req->rq_slen + sizeof(rm) : req->rq_slen; - bool vm_wait = false; + struct msghdr msg = { + .msg_flags = XS_SENDMSG_FLAGS, + }; + bool vm_wait; + unsigned int sent; int status; - int sent; /* Close the stream if the previous transmission was incomplete */ if (xs_send_request_was_aborted(transport, req)) { @@ -1116,6 +1036,8 @@ static int xs_tcp_send_request(struct rpc_rqst *req) kernel_sock_shutdown(transport->sock, SHUT_RDWR); return -ENOTCONN; } + if (!transport->inet) + return -ENOTCONN; xs_pktdump("packet data:", req->rq_svec->iov_base, @@ -1124,14 +1046,19 @@ static int xs_tcp_send_request(struct rpc_rqst *req) if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state)) xs_tcp_set_socket_timeouts(xprt, transport->sock); + xs_set_srcport(transport, transport->sock); + /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have * called sendmsg(). */ req->rq_xtime = ktime_get(); - while (1) { - sent = 0; - status = xs_sendpages(transport->sock, NULL, 0, xdr, - transport->xmit.offset, rm, &sent); + tcp_sock_set_cork(transport->inet, true); + + vm_wait = sk_stream_is_writeable(transport->inet) ? true : false; + + do { + status = xprt_sock_sendmsg(transport->sock, &msg, xdr, + transport->xmit.offset, rm, &sent); dprintk("RPC: xs_tcp_send_request(%u) = %d\n", xdr->len - transport->xmit.offset, status); @@ -1143,36 +1070,17 @@ static int xs_tcp_send_request(struct rpc_rqst *req) if (likely(req->rq_bytes_sent >= msglen)) { req->rq_xmit_bytes_sent += transport->xmit.offset; transport->xmit.offset = 0; + if (atomic_long_read(&xprt->xmit_queuelen) == 1) + tcp_sock_set_cork(transport->inet, false); return 0; } WARN_ON_ONCE(sent == 0 && status == 0); - if (status == -EAGAIN ) { - /* - * Return EAGAIN if we're sure we're hitting the - * socket send buffer limits. - */ - if (test_bit(SOCK_NOSPACE, &transport->sock->flags)) - break; - /* - * Did we hit a memory allocation failure? - */ - if (sent == 0) { - status = -ENOBUFS; - if (vm_wait) - break; - /* Retry, knowing now that we're below the - * socket send buffer limit - */ - vm_wait = true; - } - continue; - } - if (status < 0) - break; - vm_wait = false; - } + if (sent > 0) + vm_wait = false; + + } while (status == 0); switch (status) { case -ENOTSOCK: @@ -1180,7 +1088,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req) /* Should we call xs_close() here? 
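
The tcp_sock_set_cork() pair above keeps the record marker and message body from going out as separate segments, and the socket is uncorked only when this request is the last one on the transmit queue. The idiom in outline, with hypothetical arguments:

    #include <linux/atomic.h>
    #include <linux/tcp.h>
    #include <net/sock.h>

    /* Cork while queueing a multi-part record, then uncork when no
     * other request is pending so the tail segment is pushed now. */
    static void example_send_record(struct sock *sk,
                                    atomic_long_t *xmit_queuelen)
    {
            tcp_sock_set_cork(sk, true);

            /* ... sendmsg() calls that emit marker + payload ... */

            if (atomic_long_read(xmit_queuelen) == 1)
                    tcp_sock_set_cork(sk, false);
    }
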
*/ break; case -EAGAIN: - status = xs_nospace(req); + status = xs_stream_nospace(req, vm_wait); break; case -ECONNRESET: case -ECONNREFUSED: @@ -1221,6 +1129,7 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state); + clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); } static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr) @@ -1231,6 +1140,7 @@ static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr) static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { + xprt->connect_cookie++; smp_mb__before_atomic(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); @@ -1250,14 +1160,13 @@ static void xs_error_report(struct sock *sk) struct sock_xprt *transport; struct rpc_xprt *xprt; - read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) - goto out; + return; transport = container_of(xprt, struct sock_xprt, xprt); transport->xprt_err = -sk->sk_err; if (transport->xprt_err == 0) - goto out; + return; dprintk("RPC: xs_error_report client %p, error=%d...\n", xprt, -transport->xprt_err); trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err); @@ -1265,8 +1174,6 @@ static void xs_error_report(struct sock *sk) /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */ smp_mb__before_atomic(); xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); - out: - read_unlock_bh(&sk->sk_callback_lock); } static void xs_reset_transport(struct sock_xprt *transport) @@ -1278,6 +1185,16 @@ static void xs_reset_transport(struct sock_xprt *transport) if (sk == NULL) return; + /* + * Make sure we're calling this in a context from which it is safe + * to call __fput_sync(). In practice that means rpciod and the + * system workqueue. 
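
xs_reset_transport() now closes its socket file with __fput_sync() behind a PF_WQ_WORKER check, since the synchronous final-fput work is only safe from a workqueue worker. A sketch of that guard, assuming the caller has a deferral path for other contexts:

    #include <linux/file.h>
    #include <linux/sched.h>

    /* __fput_sync() runs the final fput work in this context, so
     * refuse (and let the caller defer) outside a workqueue worker. */
    static bool example_close_now(struct file *filp)
    {
            if (WARN_ON_ONCE(!(current->flags & PF_WQ_WORKER)))
                    return false;   /* caller must defer the close */
            __fput_sync(filp);
            return true;
    }
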
+ */ + if (!(current->flags & PF_WQ_WORKER)) { + WARN_ON_ONCE(1); + set_bit(XPRT_CLOSE_WAIT, &xprt->state); + return; + } if (atomic_read(&transport->xprt.swapper)) sk_clear_memalloc(sk); @@ -1285,7 +1202,7 @@ static void xs_reset_transport(struct sock_xprt *transport) kernel_sock_shutdown(sock, SHUT_RDWR); mutex_lock(&transport->recv_mutex); - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); transport->inet = NULL; transport->sock = NULL; transport->file = NULL; @@ -1294,14 +1211,14 @@ static void xs_reset_transport(struct sock_xprt *transport) xs_restore_old_callbacks(transport, sk); xprt_clear_connected(xprt); - write_unlock_bh(&sk->sk_callback_lock); xs_sock_reset_connection_flags(xprt); /* Reset stream record info */ xs_stream_reset_connect(transport); + release_sock(sk); mutex_unlock(&transport->recv_mutex); trace_rpc_socket_close(xprt, sock); - fput(filp); + __fput_sync(filp); xprt_disconnect_done(xprt); } @@ -1430,7 +1347,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport) if (sk == NULL) goto out; for (;;) { - skb = skb_recv_udp(sk, 0, 1, &err); + skb = skb_recv_udp(sk, MSG_DONTWAIT, &err); if (skb == NULL) break; xs_udp_data_read_skb(&transport->xprt, sk, skb); @@ -1453,7 +1370,7 @@ static void xs_udp_data_receive_workfn(struct work_struct *work) } /** - * xs_data_ready - "data ready" callback for UDP sockets + * xs_data_ready - "data ready" callback for sockets * @sk: socket with data to read * */ @@ -1461,12 +1378,13 @@ static void xs_data_ready(struct sock *sk) { struct rpc_xprt *xprt; - read_lock_bh(&sk->sk_callback_lock); - dprintk("RPC: xs_data_ready...\n"); xprt = xprt_from_sock(sk); if (xprt != NULL) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + trace_xs_data_ready(xprt); + transport->old_data_ready(sk); /* Any data means we had a useful conversation, so * then we don't need to delay the next reconnect @@ -1476,7 +1394,6 @@ static void xs_data_ready(struct sock *sk) if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) queue_work(xprtiod_workqueue, &transport->recv_worker); } - read_unlock_bh(&sk->sk_callback_lock); } /* @@ -1496,6 +1413,26 @@ static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** + * xs_local_state_change - callback to handle AF_LOCAL socket state changes + * @sk: socket whose state has changed + * + */ +static void xs_local_state_change(struct sock *sk) +{ + struct rpc_xprt *xprt; + struct sock_xprt *transport; + + if (!(xprt = xprt_from_sock(sk))) + return; + transport = container_of(xprt, struct sock_xprt, xprt); + if (sk->sk_shutdown & SHUTDOWN_MASK) { + clear_bit(XPRT_CONNECTED, &xprt->state); + /* Trigger the socket release */ + xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); + } +} + +/** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed * @@ -1505,9 +1442,8 @@ static void xs_tcp_state_change(struct sock *sk) struct rpc_xprt *xprt; struct sock_xprt *transport; - read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) - goto out; + return; dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); dprintk("RPC: state %x conn %d dead %d zapped %d sk_shutdown %d\n", sk->sk_state, xprt_connected(xprt), @@ -1545,7 +1481,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->connect_cookie++; clear_bit(XPRT_CONNECTED, &xprt->state); xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); - /* fall through */ + fallthrough; case TCP_CLOSING: /* * If the 
server closed down the connection, make sure that @@ -1568,13 +1504,10 @@ static void xs_tcp_state_change(struct sock *sk) /* Trigger the socket release */ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); } - out: - read_unlock_bh(&sk->sk_callback_lock); } static void xs_write_space(struct sock *sk) { - struct socket_wq *wq; struct sock_xprt *transport; struct rpc_xprt *xprt; @@ -1585,15 +1518,10 @@ static void xs_write_space(struct sock *sk) if (unlikely(!(xprt = xprt_from_sock(sk)))) return; transport = container_of(xprt, struct sock_xprt, xprt); - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) - goto out; - + if (!test_and_clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state)) + return; xs_run_error_worker(transport, XPRT_SOCK_WAKE_WRITE); sk->sk_write_pending--; -out: - rcu_read_unlock(); } /** @@ -1608,13 +1536,9 @@ out: */ static void xs_udp_write_space(struct sock *sk) { - read_lock_bh(&sk->sk_callback_lock); - /* from net/core/sock.c:sock_def_write_space */ if (sock_writeable(sk)) xs_write_space(sk); - - read_unlock_bh(&sk->sk_callback_lock); } /** @@ -1629,13 +1553,9 @@ static void xs_udp_write_space(struct sock *sk) */ static void xs_tcp_write_space(struct sock *sk) { - read_lock_bh(&sk->sk_callback_lock); - /* from net/core/stream.c:sk_stream_write_space */ if (sk_stream_is_writeable(sk)) xs_write_space(sk); - - read_unlock_bh(&sk->sk_callback_lock); } static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) @@ -1699,25 +1619,10 @@ static int xs_get_random_port(void) if (max < min) return -EADDRINUSE; range = max - min + 1; - rand = (unsigned short) prandom_u32() % range; + rand = prandom_u32_max(range); return rand + min; } -/** - * xs_set_reuseaddr_port - set the socket's port and address reuse options - * @sock: socket - * - * Note that this function has to be called on all sockets that share the - * same port, and it must be called before binding. - */ -static void xs_sock_set_reuseport(struct socket *sock) -{ - int opt = 1; - - kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, - (char *)&opt, sizeof(opt)); -} - static unsigned short xs_sock_getport(struct socket *sock) { struct sockaddr_storage buf; @@ -1765,6 +1670,36 @@ static int xs_get_srcport(struct sock_xprt *transport) return port; } +static unsigned short xs_sock_srcport(struct rpc_xprt *xprt) +{ + struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); + unsigned short ret = 0; + mutex_lock(&sock->recv_mutex); + if (sock->sock) + ret = xs_sock_getport(sock->sock); + mutex_unlock(&sock->recv_mutex); + return ret; +} + +static int xs_sock_srcaddr(struct rpc_xprt *xprt, char *buf, size_t buflen) +{ + struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); + union { + struct sockaddr sa; + struct sockaddr_storage st; + } saddr; + int ret = -ENOTCONN; + + mutex_lock(&sock->recv_mutex); + if (sock->sock) { + ret = kernel_getsockname(sock->sock, &saddr.sa); + if (ret >= 0) + ret = snprintf(buf, buflen, "%pISc", &saddr.sa); + } + mutex_unlock(&sock->recv_mutex); + return ret; +} + static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port) { if (transport->srcport != 0) @@ -1791,7 +1726,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) * This ensures that we can continue to establish TCP * connections even when all local ephemeral ports are already * a part of some TCP connection. This makes no difference - * for UDP sockets, but also doens't harm them. 
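
xs_get_random_port() above switches to prandom_u32_max(), which replaces the open-coded modulo with the helper's cheaper reduction. The port pick reduces to, with caller-supplied bounds:

    #include <linux/prandom.h>

    /* prandom_u32_max(n) returns a value in [0, n), so this yields a
     * port in [min, max]. */
    static unsigned short example_random_port(unsigned short min,
                                              unsigned short max)
    {
            return min + prandom_u32_max(max - min + 1);
    }
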
+ * for UDP sockets, but also doesn't harm them. * * If we're asking for any reserved port (i.e. port == 0 && * transport->xprt.resvport == 1) xs_get_srcport above will @@ -1806,7 +1741,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) err = kernel_bind(sock, (struct sockaddr *)&myaddr, transport->xprt.addrlen); if (err == 0) { - transport->srcport = port; + if (transport->xprt.reuseport) + transport->srcport = port; break; } last = port; @@ -1839,15 +1775,15 @@ static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port) } #ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key xs_key[2]; -static struct lock_class_key xs_slock_key[2]; +static struct lock_class_key xs_key[3]; +static struct lock_class_key xs_slock_key[3]; static inline void xs_reclassify_socketu(struct socket *sock) { struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC", - &xs_slock_key[1], "sk_lock-AF_LOCAL-RPC", &xs_key[1]); + &xs_slock_key[0], "sk_lock-AF_LOCAL-RPC", &xs_key[0]); } static inline void xs_reclassify_socket4(struct socket *sock) @@ -1855,7 +1791,7 @@ static inline void xs_reclassify_socket4(struct socket *sock) struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC", - &xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]); + &xs_slock_key[1], "sk_lock-AF_INET-RPC", &xs_key[1]); } static inline void xs_reclassify_socket6(struct socket *sock) @@ -1863,7 +1799,7 @@ static inline void xs_reclassify_socket6(struct socket *sock) struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC", - &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]); + &xs_slock_key[2], "sk_lock-AF_INET6-RPC", &xs_key[2]); } static inline void xs_reclassify_socket(int family, struct socket *sock) @@ -1910,7 +1846,7 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, xs_reclassify_socket(family, sock); if (reuseport) - xs_sock_set_reuseport(sock); + sock_set_reuseport(sock->sk); err = xs_bind(transport, sock); if (err) { @@ -1937,14 +1873,14 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, if (!transport->inet) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; - sock_set_flag(sk, SOCK_FASYNC); + sk->sk_state_change = xs_local_state_change; sk->sk_error_report = xs_error_report; xprt_clear_connected(xprt); @@ -1953,7 +1889,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, transport->sock = sock; transport->inet = sk; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } xs_stream_start_connect(transport); @@ -1970,7 +1906,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) struct rpc_xprt *xprt = &transport->xprt; struct file *filp; struct socket *sock; - int status = -EIO; + int status; status = __sock_create(xprt->xprt_net, AF_LOCAL, SOCK_STREAM, 0, &sock, 1); @@ -2001,6 +1937,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; xprt_set_connected(xprt); + break; case -ENOBUFS: break; case -ENOENT: @@ -2028,7 +1965,10 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); int ret; - if (RPC_IS_ASYNC(task)) { + if (transport->file) + goto force_disconnect; + + if (RPC_IS_ASYNC(task)) { /* * We want the 
AF_LOCAL connect to be resolved in the * filesystem namespace of the process making the rpc @@ -2038,20 +1978,25 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) * we'll need to figure out how to pass a namespace to * connect. */ - task->tk_rpc_status = -ENOTCONN; - rpc_exit(task, -ENOTCONN); - return; + rpc_task_set_rpc_status(task, -ENOTCONN); + goto out_wake; } ret = xs_local_setup_socket(transport); if (ret && !RPC_IS_SOFTCONN(task)) msleep_interruptible(15000); + return; +force_disconnect: + xprt_force_disconnect(xprt); +out_wake: + xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, -ENOTCONN); } #if IS_ENABLED(CONFIG_SUNRPC_SWAP) /* - * Note that this should be called with XPRT_LOCKED held (or when we otherwise - * know that we have exclusive access to the socket), to guard against - * races with xs_reset_transport. + * Note that this should be called with XPRT_LOCKED held, or recv_mutex + * held, or when we otherwise know that we have exclusive access to the + * socket, to guard against races with xs_reset_transport. */ static void xs_set_memalloc(struct rpc_xprt *xprt) { @@ -2080,13 +2025,11 @@ xs_enable_swap(struct rpc_xprt *xprt) { struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt); - if (atomic_inc_return(&xprt->swapper) != 1) - return 0; - if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) - return -ERESTARTSYS; - if (xs->inet) + mutex_lock(&xs->recv_mutex); + if (atomic_inc_return(&xprt->swapper) == 1 && + xs->inet) sk_set_memalloc(xs->inet); - xprt_release_xprt(xprt, NULL); + mutex_unlock(&xs->recv_mutex); return 0; } @@ -2102,13 +2045,11 @@ xs_disable_swap(struct rpc_xprt *xprt) { struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt); - if (!atomic_dec_and_test(&xprt->swapper)) - return; - if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) - return; - if (xs->inet) + mutex_lock(&xs->recv_mutex); + if (atomic_dec_and_test(&xprt->swapper) && + xs->inet) sk_clear_memalloc(xs->inet); - xprt_release_xprt(xprt, NULL); + mutex_unlock(&xs->recv_mutex); } #else static void xs_set_memalloc(struct rpc_xprt *xprt) @@ -2134,14 +2075,13 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) if (!transport->inet) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; - sock_set_flag(sk, SOCK_FASYNC); xprt_set_connected(xprt); @@ -2151,7 +2091,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_set_memalloc(xprt); - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } xs_udp_do_set_buffer_size(xprt); @@ -2165,7 +2105,10 @@ static void xs_udp_setup_socket(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; struct socket *sock; int status = -EIO; + unsigned int pflags = current->flags; + if (atomic_read(&xprt->swapper)) + current->flags |= PF_MEMALLOC; sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, SOCK_DGRAM, IPPROTO_UDP, false); @@ -2185,6 +2128,7 @@ out: xprt_clear_connecting(xprt); xprt_unlock_connect(xprt, transport); xprt_wake_pending_tasks(xprt, status); + current_restore_flags(pflags, PF_MEMALLOC); } /** @@ -2202,13 +2146,20 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) if (sock == NULL) return; + if (!xprt->reuseport) { + xs_close(xprt); + return; + } switch (skst) { - default: + case TCP_FIN_WAIT1: + case 
TCP_FIN_WAIT2: + break; + case TCP_ESTABLISHED: + case TCP_CLOSE_WAIT: kernel_sock_shutdown(sock, SHUT_RDWR); trace_rpc_socket_shutdown(xprt, sock); break; - case TCP_CLOSE: - case TCP_TIME_WAIT: + default: xs_reset_transport(transport); } } @@ -2219,7 +2170,6 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); unsigned int keepidle; unsigned int keepcnt; - unsigned int opt_on = 1; unsigned int timeo; spin_lock(&xprt->transport_lock); @@ -2231,18 +2181,13 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, spin_unlock(&xprt->transport_lock); /* TCP Keepalive options */ - kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&opt_on, sizeof(opt_on)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); + sock_set_keepalive(sock->sk); + tcp_sock_set_keepidle(sock->sk, keepidle); + tcp_sock_set_keepintvl(sock->sk, keepidle); + tcp_sock_set_keepcnt(sock->sk, keepcnt); /* TCP user timeout (see RFC5482) */ - kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (char *)&timeo, sizeof(timeo)); + tcp_sock_set_user_timeout(sock->sk, timeo); } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, @@ -2276,11 +2221,9 @@ static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - int ret = -ENOTCONN; if (!transport->inet) { struct sock *sk = sock->sk; - unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC; /* Avoid temporary address, they are bad for long-lived * connections such as NFS mounts. @@ -2289,12 +2232,15 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) * knowledge about the normal duration of connections, * MAY override this as appropriate. */ - kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES, - (char *)&addr_pref, sizeof(addr_pref)); + if (xs_addr(xprt)->sa_family == PF_INET6) { + ip6_sock_set_addr_preferences(sk, + IPV6_PREFER_SRC_PUBLIC); + } xs_tcp_set_socket_timeouts(xprt, sock); + tcp_sock_set_nodelay(sk); - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); xs_save_old_callbacks(transport, sk); @@ -2302,12 +2248,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_data_ready = xs_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; - sock_set_flag(sk, SOCK_FASYNC); sk->sk_error_report = xs_error_report; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); - tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; xprt_clear_connected(xprt); @@ -2315,11 +2259,11 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) transport->sock = sock; transport->inet = sk; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } if (!xprt_bound(xprt)) - goto out; + return -ENOTCONN; xs_set_memalloc(xprt); @@ -2327,22 +2271,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* Tell the socket layer to start connecting... 
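
The keepalive block above trades raw kernel_setsockopt() calls for the typed helpers provided for in-kernel socket users. The same sequence stands alone as follows, with illustrative values:

    #include <linux/net.h>
    #include <linux/tcp.h>
    #include <net/sock.h>

    /* Enable keepalive probing and a hard RFC 5482 user timeout on a
     * kernel-created TCP socket. */
    static void example_tcp_timeouts(struct socket *sock)
    {
            sock_set_keepalive(sock->sk);
            tcp_sock_set_keepidle(sock->sk, 60);         /* seconds */
            tcp_sock_set_keepintvl(sock->sk, 60);        /* seconds */
            tcp_sock_set_keepcnt(sock->sk, 5);           /* probes */
            tcp_sock_set_user_timeout(sock->sk, 300000); /* msec */
    }
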
*/ set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); - ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); - switch (ret) { - case 0: - xs_set_srcport(transport, sock); - /* fall through */ - case -EINPROGRESS: - /* SYN_SENT! */ - if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) - xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; - break; - case -EADDRNOTAVAIL: - /* Source port number is unavailable. Try a new one! */ - transport->srcport = 0; - } -out: - return ret; + return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); } /** @@ -2357,14 +2286,22 @@ static void xs_tcp_setup_socket(struct work_struct *work) container_of(work, struct sock_xprt, connect_worker.work); struct socket *sock = transport->sock; struct rpc_xprt *xprt = &transport->xprt; - int status = -EIO; + int status; + unsigned int pflags = current->flags; - if (!sock) { - sock = xs_create_sock(xprt, transport, - xs_addr(xprt)->sa_family, SOCK_STREAM, - IPPROTO_TCP, true); + if (atomic_read(&xprt->swapper)) + current->flags |= PF_MEMALLOC; + + if (xprt_connected(xprt)) + goto out; + if (test_and_clear_bit(XPRT_SOCK_CONNECT_SENT, + &transport->sock_state) || + !sock) { + xs_reset_transport(transport); + sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, + SOCK_STREAM, IPPROTO_TCP, true); if (IS_ERR(sock)) { - status = PTR_ERR(sock); + xprt_wake_pending_tasks(xprt, PTR_ERR(sock)); goto out; } } @@ -2381,21 +2318,20 @@ static void xs_tcp_setup_socket(struct work_struct *work) xprt, -status, xprt_connected(xprt), sock->sk->sk_state); switch (status) { - default: - printk("%s: connect returned unhandled error %d\n", - __func__, status); - /* fall through */ - case -EADDRNOTAVAIL: - /* We're probably in TIME_WAIT. Get rid of existing socket, - * and retry - */ - xs_tcp_force_close(xprt); - break; case 0: case -EINPROGRESS: + /* SYN_SENT! */ + set_bit(XPRT_SOCK_CONNECT_SENT, &transport->sock_state); + if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + fallthrough; case -EALREADY: - xprt_unlock_connect(xprt, transport); - return; + goto out_unlock; + case -EADDRNOTAVAIL: + /* Source port number is unavailable. Try a new one! */ + transport->srcport = 0; + status = -EAGAIN; + break; case -EINVAL: /* Happens, for instance, if the user specified a link * local IPv6 address without a scope-id. @@ -2407,20 +2343,23 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: - /* - * xs_tcp_force_close() wakes tasks with -EIO. - * We need to wake them first to ensure the - * correct error code. - */ - xprt_wake_pending_tasks(xprt, status); - xs_tcp_force_close(xprt); - goto out; + break; + default: + printk("%s: connect returned unhandled error %d\n", + __func__, status); + status = -EAGAIN; } - status = -EAGAIN; + + /* xs_tcp_force_close() wakes tasks with a fixed error code. + * We need to wake them first to ensure the correct error code. 
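
xs_tcp_setup_socket() now runs with PF_MEMALLOC whenever the transport backs swap, so reconnecting under memory pressure can still allocate, and the task flags are restored exactly as found on exit. The save/restore idiom, with a stub transport type:

    #include <linux/atomic.h>
    #include <linux/sched.h>

    struct xprt_stub {
            atomic_t swapper;
    };

    /* Borrow access to memory reserves only while servicing a
     * swap-backed transport, then restore the inherited flags. */
    static void example_connect_worker(struct xprt_stub *xprt)
    {
            unsigned int pflags = current->flags;

            if (atomic_read(&xprt->swapper))
                    current->flags |= PF_MEMALLOC;

            /* ... create the socket and connect ... */

            current_restore_flags(pflags, PF_MEMALLOC);
    }
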
+ */ + xprt_wake_pending_tasks(xprt, status); + xs_tcp_force_close(xprt); out: xprt_clear_connecting(xprt); +out_unlock: xprt_unlock_connect(xprt, transport); - xprt_wake_pending_tasks(xprt, status); + current_restore_flags(pflags, PF_MEMALLOC); } /** @@ -2446,11 +2385,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) if (transport->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p for %lu " - "seconds\n", - xprt, xprt->reestablish_timeout / HZ); - - /* Start by resetting any existing state */ - xs_reset_transport(transport); + "seconds\n", xprt, xprt->reestablish_timeout / HZ); delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO); @@ -2511,7 +2446,7 @@ static void xs_error_handle(struct work_struct *work) } /** - * xs_local_print_stats - display AF_LOCAL socket-specifc stats + * xs_local_print_stats - display AF_LOCAL socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * @@ -2540,7 +2475,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) } /** - * xs_udp_print_stats - display UDP socket-specifc stats + * xs_udp_print_stats - display UDP socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * @@ -2564,7 +2499,7 @@ static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) } /** - * xs_tcp_print_stats - display TCP socket-specifc stats + * xs_tcp_print_stats - display TCP socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * @@ -2612,7 +2547,7 @@ static int bc_malloc(struct rpc_task *task) return -EINVAL; } - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (!page) return -ENOMEM; @@ -2636,50 +2571,40 @@ static void bc_free(struct rpc_task *task) free_page((unsigned long)buf); } -/* - * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex - * held. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request. - */ static int bc_sendto(struct rpc_rqst *req) { - int len; - struct xdr_buf *xbufp = &req->rq_snd_buf; + struct xdr_buf *xdr = &req->rq_snd_buf; struct sock_xprt *transport = container_of(req->rq_xprt, struct sock_xprt, xprt); - unsigned long headoff; - unsigned long tailoff; - struct page *tailpage; struct msghdr msg = { - .msg_flags = MSG_MORE + .msg_flags = 0, }; rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | - (u32)xbufp->len); - struct kvec iov = { - .iov_base = &marker, - .iov_len = sizeof(marker), - }; + (u32)xdr->len); + unsigned int sent = 0; + int err; req->rq_xtime = ktime_get(); - - len = kernel_sendmsg(transport->sock, &msg, &iov, 1, iov.iov_len); - if (len != iov.iov_len) - return -EAGAIN; - - tailpage = NULL; - if (xbufp->tail[0].iov_len) - tailpage = virt_to_page(xbufp->tail[0].iov_base); - tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK; - headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK; - len = svc_send_common(transport->sock, xbufp, - virt_to_page(xbufp->head[0].iov_base), headoff, - tailpage, tailoff); - if (len != xbufp->len) + err = xdr_alloc_bvec(xdr, rpc_task_gfp_mask()); + if (err < 0) + return err; + err = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, marker, &sent); + xdr_free_bvec(xdr); + if (err < 0 || sent != (xdr->len + sizeof(marker))) return -EAGAIN; - return len; + return sent; } -/* - * The send routine. 
-/*
- * The send routine. Borrows from svc_send
+/**
+ * bc_send_request - Send a backchannel Call on a TCP socket
+ * @req: rpc_rqst containing Call message to be sent
+ *
+ * xpt_mutex ensures @rqstp's whole message is written to the socket
+ * without interruption.
+ *
+ * Return values:
+ *   %0 if the message was sent successfully
+ *   %ENOTCONN if the message was not sent
  */
 static int bc_send_request(struct rpc_rqst *req)
 {
@@ -2714,6 +2639,7 @@ static int bc_send_request(struct rpc_rqst *req)
 
 static void bc_close(struct rpc_xprt *xprt)
 {
+       xprt_disconnect_done(xprt);
 }
 
 /*
@@ -2758,6 +2684,8 @@ static const struct rpc_xprt_ops xs_udp_ops = {
        .rpcbind = rpcb_getport_async,
        .set_port = xs_set_port,
        .connect = xs_connect,
+       .get_srcaddr = xs_sock_srcaddr,
+       .get_srcport = xs_sock_srcport,
        .buf_alloc = rpc_malloc,
        .buf_free = rpc_free,
        .send_request = xs_udp_send_request,
@@ -2780,6 +2708,8 @@ static const struct rpc_xprt_ops xs_tcp_ops = {
        .rpcbind = rpcb_getport_async,
        .set_port = xs_set_port,
        .connect = xs_connect,
+       .get_srcaddr = xs_sock_srcaddr,
+       .get_srcport = xs_sock_srcport,
        .buf_alloc = rpc_malloc,
        .buf_free = rpc_free,
        .prepare_request = xs_stream_prepare_request,
@@ -2914,6 +2844,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
        transport = container_of(xprt, struct sock_xprt, xprt);
 
        xprt->prot = 0;
+       xprt->xprt_class = &xs_local_transport;
        xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
        xprt->bind_timeout = XS_BIND_TO;
@@ -2937,9 +2868,6 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
                }
                xprt_set_bound(xprt);
                xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL);
-               ret = ERR_PTR(xs_local_setup_socket(transport));
-               if (ret)
-                       goto out_err;
                break;
        default:
                ret = ERR_PTR(-EAFNOSUPPORT);
@@ -2983,6 +2911,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
        transport = container_of(xprt, struct sock_xprt, xprt);
 
        xprt->prot = IPPROTO_UDP;
+       xprt->xprt_class = &xs_udp_transport;
        /* XXX: header size can vary due to auth type, IPv6, etc. */
        xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
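Each xs_setup_*() variant above now records a back-pointer from the freshly created rpc_xprt to the xprt_class that produced it. A hypothetical consumer, not part of this patch, could then report a transport's registered name without consulting a separate lookup table:

#include <linux/sunrpc/xprt.h>

/* Hypothetical helper built on the new xprt->xprt_class back-pointer. */
static const char *example_xprt_name(const struct rpc_xprt *xprt)
{
        return xprt->xprt_class ? xprt->xprt_class->name : "(unregistered)";
}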
 
@@ -3063,6 +2992,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
        transport = container_of(xprt, struct sock_xprt, xprt);
 
        xprt->prot = IPPROTO_TCP;
+       xprt->xprt_class = &xs_tcp_transport;
        xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
        xprt->bind_timeout = XS_BIND_TO;
@@ -3136,6 +3066,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
        transport = container_of(xprt, struct sock_xprt, xprt);
 
        xprt->prot = IPPROTO_TCP;
+       xprt->xprt_class = &xs_bc_tcp_transport;
        xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
        xprt->timeout = &xs_tcp_default_timeout;
@@ -3203,6 +3134,7 @@ static struct xprt_class xs_local_transport = {
        .owner = THIS_MODULE,
        .ident = XPRT_TRANSPORT_LOCAL,
        .setup = xs_setup_local,
+       .netid = { "" },
 };
 
 static struct xprt_class xs_udp_transport = {
@@ -3211,6 +3143,7 @@
        .owner = THIS_MODULE,
        .ident = XPRT_TRANSPORT_UDP,
        .setup = xs_setup_udp,
+       .netid = { "udp", "udp6", "" },
 };
 
 static struct xprt_class xs_tcp_transport = {
@@ -3219,6 +3152,7 @@
        .owner = THIS_MODULE,
        .ident = XPRT_TRANSPORT_TCP,
        .setup = xs_setup_tcp,
+       .netid = { "tcp", "tcp6", "" },
 };
 
 static struct xprt_class xs_bc_tcp_transport = {
@@ -3227,6 +3161,7 @@
        .owner = THIS_MODULE,
        .ident = XPRT_TRANSPORT_BC_TCP,
        .setup = xs_setup_bc_tcp,
+       .netid = { "" },
 };
 
 /**
@@ -3263,24 +3198,6 @@ void cleanup_socket_xprt(void)
        xprt_unregister_transport(&xs_bc_tcp_transport);
 }
 
-static int param_set_uint_minmax(const char *val,
-               const struct kernel_param *kp,
-               unsigned int min, unsigned int max)
-{
-       unsigned int num;
-       int ret;
-
-       if (!val)
-               return -EINVAL;
-       ret = kstrtouint(val, 0, &num);
-       if (ret)
-               return ret;
-       if (num < min || num > max)
-               return -EINVAL;
-       *((unsigned int *)kp->arg) = num;
-       return 0;
-}
-
 static int param_set_portnr(const char *val, const struct kernel_param *kp)
 {
        return param_set_uint_minmax(val, kp,