aboutsummaryrefslogtreecommitdiffstats
path: root/net/sunrpc
diff options
context:
space:
mode:
Diffstat (limited to 'net/sunrpc')
-rw-r--r--net/sunrpc/Kconfig1
-rw-r--r--net/sunrpc/Makefile2
-rw-r--r--net/sunrpc/addr.c44
-rw-r--r--net/sunrpc/auth.c14
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c225
-rw-r--r--net/sunrpc/auth_gss/auth_gss_internal.h45
-rw-r--r--net/sunrpc/auth_gss/gss_generic_token.c6
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c294
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_keys.c6
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c126
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seqnum.c91
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c119
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c15
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.c20
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c3
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c299
-rw-r--r--net/sunrpc/auth_gss/trace.c3
-rw-r--r--net/sunrpc/auth_unix.c16
-rw-r--r--net/sunrpc/backchannel_rqst.c40
-rw-r--r--net/sunrpc/cache.c239
-rw-r--r--net/sunrpc/clnt.c675
-rw-r--r--net/sunrpc/debugfs.c82
-rw-r--r--net/sunrpc/fail.h25
-rw-r--r--net/sunrpc/rpc_pipe.c15
-rw-r--r--net/sunrpc/rpcb_clnt.c146
-rw-r--r--net/sunrpc/sched.c297
-rw-r--r--net/sunrpc/socklib.c138
-rw-r--r--net/sunrpc/socklib.h15
-rw-r--r--net/sunrpc/stats.c2
-rw-r--r--net/sunrpc/sunrpc.h21
-rw-r--r--net/sunrpc/sunrpc_syms.c12
-rw-r--r--net/sunrpc/svc.c504
-rw-r--r--net/sunrpc/svc_xprt.c259
-rw-r--r--net/sunrpc/svcauth.c35
-rw-r--r--net/sunrpc/svcauth_unix.c108
-rw-r--r--net/sunrpc/svcsock.c754
-rw-r--r--net/sunrpc/sysctl.c46
-rw-r--r--net/sunrpc/sysfs.c626
-rw-r--r--net/sunrpc/sysfs.h42
-rw-r--r--net/sunrpc/xdr.c1205
-rw-r--r--net/sunrpc/xprt.c440
-rw-r--r--net/sunrpc/xprtmultipath.c150
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c28
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c469
-rw-r--r--net/sunrpc/xprtrdma/module.c1
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c302
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c199
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c170
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_pcl.c306
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c683
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c863
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c1016
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c174
-rw-r--r--net/sunrpc/xprtrdma/transport.c142
-rw-r--r--net/sunrpc/xprtrdma/verbs.c913
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h156
-rw-r--r--net/sunrpc/xprtsock.c733
60 files changed, 7742 insertions, 5622 deletions
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 3bcf985507be..bbbb5af0af13 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -21,7 +21,6 @@ config RPCSEC_GSS_KRB5
depends on SUNRPC && CRYPTO
depends on CRYPTO_MD5 && CRYPTO_DES && CRYPTO_CBC && CRYPTO_CTS
depends on CRYPTO_ECB && CRYPTO_HMAC && CRYPTO_SHA1 && CRYPTO_AES
- depends on CRYPTO_ARC4
default y
select SUNRPC_GSS
help
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 9488600451e8..1c8de397d6ad 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
auth.o auth_null.o auth_unix.o \
svc.o svcsock.o svcauth.o svcauth_unix.o \
addr.o rpcb_clnt.o timer.o xdr.o \
- sunrpc_syms.o cache.o rpc_pipe.o \
+ sunrpc_syms.o cache.o rpc_pipe.o sysfs.o \
svc_xprt.o \
xprtmultipath.o
sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index 8b4d72b1a066..d435bffc6199 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -82,11 +82,11 @@ static size_t rpc_ntop6(const struct sockaddr *sap,
rc = snprintf(scopebuf, sizeof(scopebuf), "%c%u",
IPV6_SCOPE_DELIMITER, sin6->sin6_scope_id);
- if (unlikely((size_t)rc > sizeof(scopebuf)))
+ if (unlikely((size_t)rc >= sizeof(scopebuf)))
return 0;
len += rc;
- if (unlikely(len > buflen))
+ if (unlikely(len >= buflen))
return 0;
strcat(buf, scopebuf);
@@ -162,8 +162,10 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
const size_t buflen, const char *delim,
struct sockaddr_in6 *sin6)
{
- char *p;
+ char p[IPV6_SCOPE_ID_LEN + 1];
size_t len;
+ u32 scope_id = 0;
+ struct net_device *dev;
if ((buf + buflen) == delim)
return 1;
@@ -175,29 +177,23 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
return 0;
len = (buf + buflen) - delim - 1;
- p = kmemdup_nul(delim + 1, len, GFP_KERNEL);
- if (p) {
- u32 scope_id = 0;
- struct net_device *dev;
-
- dev = dev_get_by_name(net, p);
- if (dev != NULL) {
- scope_id = dev->ifindex;
- dev_put(dev);
- } else {
- if (kstrtou32(p, 10, &scope_id) == 0) {
- kfree(p);
- return 0;
- }
- }
-
- kfree(p);
-
- sin6->sin6_scope_id = scope_id;
- return 1;
+ if (len > IPV6_SCOPE_ID_LEN)
+ return 0;
+
+ memcpy(p, delim + 1, len);
+ p[len] = 0;
+
+ dev = dev_get_by_name(net, p);
+ if (dev != NULL) {
+ scope_id = dev->ifindex;
+ dev_put(dev);
+ } else {
+ if (kstrtou32(p, 10, &scope_id) != 0)
+ return 0;
}
- return 0;
+ sin6->sin6_scope_id = scope_id;
+ return 1;
}
static size_t rpc_pton6(struct net *net, const char *buf, const size_t buflen,
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 5748ad0ba1bd..fb75a883503f 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -81,7 +81,7 @@ static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp)
unsigned int nbits;
nbits = *(unsigned int *)kp->arg;
- return sprintf(buffer, "%u", 1U << nbits);
+ return sprintf(buffer, "%u\n", 1U << nbits);
}
#define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int);
@@ -445,7 +445,7 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
* Enforce a 60 second garbage collection moratorium
* Note that the cred_unused list must be time-ordered.
*/
- if (!time_in_range(cred->cr_expire, expired, jiffies))
+ if (time_in_range(cred->cr_expire, expired, jiffies))
continue;
if (!rpcauth_unhash_cred(cred))
continue;
@@ -615,6 +615,8 @@ rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
};
struct rpc_cred *ret;
+ if (RPC_IS_ASYNC(task))
+ lookupflags |= RPCAUTH_LOOKUP_ASYNC;
ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags);
put_cred(acred.cred);
return ret;
@@ -631,6 +633,8 @@ rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags)
if (!acred.principal)
return NULL;
+ if (RPC_IS_ASYNC(task))
+ lookupflags |= RPCAUTH_LOOKUP_ASYNC;
return auth->au_ops->lookup_cred(auth, &acred, lookupflags);
}
@@ -654,7 +658,7 @@ rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags)
};
if (flags & RPC_TASK_ASYNC)
- lookupflags |= RPCAUTH_LOOKUP_NEW;
+ lookupflags |= RPCAUTH_LOOKUP_NEW | RPCAUTH_LOOKUP_ASYNC;
if (task->tk_op_cred)
/* Task must use exactly this rpc_cred */
new = get_rpccred(task->tk_op_cred);
@@ -666,7 +670,7 @@ rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags)
/* If machine cred couldn't be bound, try a root cred */
if (new)
;
- else if (cred == &machine_cred || (flags & RPC_TASK_ROOTCREDS))
+ else if (cred == &machine_cred)
new = rpcauth_bind_root_cred(task, lookupflags);
else if (flags & RPC_TASK_NULLCREDS)
new = authnull_ops.lookup_cred(NULL, NULL, 0);
@@ -870,7 +874,7 @@ int __init rpcauth_init_module(void)
err = rpc_init_authunix();
if (err < 0)
goto out1;
- err = register_shrinker(&rpc_cred_shrinker);
+ err = register_shrinker(&rpc_cred_shrinker, "sunrpc_cred");
if (err < 0)
goto out2;
return 0;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 24ca861815b1..7bb247c51e2f 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -20,6 +20,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/auth.h>
#include <linux/sunrpc/auth_gss.h>
+#include <linux/sunrpc/gss_krb5.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/gss_err.h>
#include <linux/workqueue.h>
@@ -28,6 +29,7 @@
#include <linux/uaccess.h>
#include <linux/hashtable.h>
+#include "auth_gss_internal.h"
#include "../netns.h"
#include <trace/events/rpcgss.h>
@@ -70,7 +72,8 @@ struct gss_auth {
struct gss_api_mech *mech;
enum rpc_gss_svc service;
struct rpc_clnt *client;
- struct net *net;
+ struct net *net;
+ netns_tracker ns_tracker;
/*
* There are two upcall pipes; dentry[1], named "gssd", is used
* for the new text-based upcall; dentry[0] is named after the
@@ -124,35 +127,6 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags);
}
-static const void *
-simple_get_bytes(const void *p, const void *end, void *res, size_t len)
-{
- const void *q = (const void *)((const char *)p + len);
- if (unlikely(q > end || q < p))
- return ERR_PTR(-EFAULT);
- memcpy(res, p, len);
- return q;
-}
-
-static inline const void *
-simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
-{
- const void *q;
- unsigned int len;
-
- p = simple_get_bytes(p, end, &len, sizeof(len));
- if (IS_ERR(p))
- return p;
- q = (const void *)((const char *)p + len);
- if (unlikely(q > end || q < p))
- return ERR_PTR(-EFAULT);
- dest->data = kmemdup(p, len, GFP_NOFS);
- if (unlikely(dest->data == NULL))
- return ERR_PTR(-ENOMEM);
- dest->len = len;
- return q;
-}
-
static struct gss_cl_ctx *
gss_cred_get_ctx(struct rpc_cred *cred)
{
@@ -172,7 +146,7 @@ gss_alloc_context(void)
{
struct gss_cl_ctx *ctx;
- ctx = kzalloc(sizeof(*ctx), GFP_NOFS);
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (ctx != NULL) {
ctx->gc_proc = RPC_GSS_PROC_DATA;
ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */
@@ -235,7 +209,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
p = ERR_PTR(-EFAULT);
goto err;
}
- ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_NOFS);
+ ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_KERNEL);
if (ret < 0) {
trace_rpcgss_import_ctx(ret);
p = ERR_PTR(ret);
@@ -253,7 +227,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
if (IS_ERR(p))
goto err;
done:
- trace_rpcgss_context(ctx->gc_expiry, now, timeout,
+ trace_rpcgss_context(window_size, ctx->gc_expiry, now, timeout,
ctx->gc_acceptor.len, ctx->gc_acceptor.data);
err:
return p;
@@ -537,7 +511,7 @@ gss_alloc_msg(struct gss_auth *gss_auth,
int vers;
int err = -ENOMEM;
- gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS);
+ gss_msg = kzalloc(sizeof(*gss_msg), GFP_KERNEL);
if (gss_msg == NULL)
goto err;
vers = get_pipe_version(gss_auth->net);
@@ -553,7 +527,7 @@ gss_alloc_msg(struct gss_auth *gss_auth,
gss_msg->auth = gss_auth;
kref_get(&gss_auth->kref);
if (service_name) {
- gss_msg->service_name = kstrdup_const(service_name, GFP_NOFS);
+ gss_msg->service_name = kstrdup_const(service_name, GFP_KERNEL);
if (!gss_msg->service_name) {
err = -ENOMEM;
goto err_put_pipe_version;
@@ -696,10 +670,12 @@ retry:
}
schedule();
}
- if (gss_msg->ctx)
+ if (gss_msg->ctx) {
+ trace_rpcgss_ctx_init(gss_cred);
gss_cred_set_ctx(cred, gss_msg->ctx);
- else
+ } else {
err = gss_msg->msg.errno;
+ }
spin_unlock(&pipe->lock);
out_intr:
finish_wait(&gss_msg->waitqueue, &wait);
@@ -727,7 +703,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
if (mlen > MSG_BUF_MAXSIZE)
goto out;
err = -ENOMEM;
- buf = kmalloc(mlen, GFP_NOFS);
+ buf = kmalloc(mlen, GFP_KERNEL);
if (!buf)
goto out;
@@ -1038,7 +1014,8 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
goto err_free;
}
gss_auth->client = clnt;
- gss_auth->net = get_net(rpc_net_ns(clnt));
+ gss_auth->net = get_net_track(rpc_net_ns(clnt), &gss_auth->ns_tracker,
+ GFP_KERNEL);
err = -EINVAL;
gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
if (!gss_auth->mech)
@@ -1050,14 +1027,14 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
goto err_put_mech;
auth = &gss_auth->rpc_auth;
auth->au_cslack = GSS_CRED_SLACK >> 2;
- auth->au_rslack = GSS_VERF_SLACK >> 2;
+ auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2;
auth->au_verfsize = GSS_VERF_SLACK >> 2;
auth->au_ralign = GSS_VERF_SLACK >> 2;
- auth->au_flags = 0;
+ __set_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags);
auth->au_ops = &authgss_ops;
auth->au_flavor = flavor;
if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
- auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
+ __set_bit(RPCAUTH_AUTH_DATATOUCH, &auth->au_flags);
refcount_set(&auth->au_count, 1);
kref_init(&gss_auth->kref);
@@ -1093,7 +1070,7 @@ err_destroy_credcache:
err_put_mech:
gss_mech_put(gss_auth->mech);
err_put_net:
- put_net(gss_auth->net);
+ put_net_track(gss_auth->net, &gss_auth->ns_tracker);
err_free:
kfree(gss_auth->target_name);
kfree(gss_auth);
@@ -1109,7 +1086,7 @@ gss_free(struct gss_auth *gss_auth)
gss_pipe_free(gss_auth->gss_pipe[0]);
gss_pipe_free(gss_auth->gss_pipe[1]);
gss_mech_put(gss_auth->mech);
- put_net(gss_auth->net);
+ put_net_track(gss_auth->net, &gss_auth->ns_tracker);
kfree(gss_auth->target_name);
kfree(gss_auth);
@@ -1243,7 +1220,7 @@ gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
struct gss_cred *new;
/* Make a copy of the cred so that we can reference count it */
- new = kzalloc(sizeof(*gss_cred), GFP_NOFS);
+ new = kzalloc(sizeof(*gss_cred), GFP_KERNEL);
if (new) {
struct auth_cred acred = {
.cred = gss_cred->gc_base.cr_cred,
@@ -1283,8 +1260,9 @@ gss_send_destroy_context(struct rpc_cred *cred)
if (new) {
ctx->gc_proc = RPC_GSS_PROC_DESTROY;
+ trace_rpcgss_ctx_destroy(gss_cred);
task = rpc_call_null(gss_auth->client, &new->gc_base,
- RPC_TASK_ASYNC|RPC_TASK_SOFT);
+ RPC_TASK_ASYNC);
if (!IS_ERR(task))
rpc_put_task(task);
@@ -1348,7 +1326,6 @@ gss_destroy_nullcred(struct rpc_cred *cred)
static void
gss_destroy_cred(struct rpc_cred *cred)
{
-
if (test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0)
gss_send_destroy_context(cred);
gss_destroy_nullcred(cred);
@@ -1363,10 +1340,11 @@ gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
/*
* Lookup RPCSEC_GSS cred for the current process
*/
-static struct rpc_cred *
-gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+static struct rpc_cred *gss_lookup_cred(struct rpc_auth *auth,
+ struct auth_cred *acred, int flags)
{
- return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
+ return rpcauth_lookup_credcache(auth, acred, flags,
+ rpc_task_gfp_mask());
}
static struct rpc_cred *
@@ -1612,6 +1590,7 @@ static int gss_renew_cred(struct rpc_task *task)
new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW);
if (IS_ERR(new))
return PTR_ERR(new);
+
task->tk_rqstp->rq_cred = new;
put_rpccred(oldcred);
return 0;
@@ -1691,7 +1670,7 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr)
if (!p)
goto validate_failed;
- seq = kmalloc(4, GFP_NOFS);
+ seq = kmalloc(4, GFP_KERNEL);
if (!seq)
goto validate_failed;
*seq = cpu_to_be32(task->tk_rqstp->rq_seqno);
@@ -1708,7 +1687,8 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr)
/* We leave it to unwrap to calculate au_rslack. For now we just
* calculate the length of the verifier: */
- cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2;
+ if (test_bit(RPCAUTH_AUTH_UPDATE_SLACK, &cred->cr_auth->au_flags))
+ cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2;
status = 0;
out:
gss_put_ctx(ctx);
@@ -1724,8 +1704,9 @@ bad_mic:
goto out;
}
-static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_rqst *rqstp = task->tk_rqstp;
struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf;
@@ -1799,11 +1780,11 @@ alloc_enc_pages(struct rpc_rqst *rqstp)
rqstp->rq_enc_pages
= kmalloc_array(rqstp->rq_enc_pages_num,
sizeof(struct page *),
- GFP_NOFS);
+ GFP_KERNEL);
if (!rqstp->rq_enc_pages)
goto out;
for (i=0; i < rqstp->rq_enc_pages_num; i++) {
- rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS);
+ rqstp->rq_enc_pages[i] = alloc_page(GFP_KERNEL);
if (rqstp->rq_enc_pages[i] == NULL)
goto out_free;
}
@@ -1816,8 +1797,9 @@ out:
return -EAGAIN;
}
-static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_rqst *rqstp = task->tk_rqstp;
struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
@@ -1877,7 +1859,7 @@ static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
else
iov = snd_buf->head;
p = iov->iov_base + iov->iov_len;
- pad = 3 - ((snd_buf->len - offset - 1) & 3);
+ pad = xdr_pad_size(snd_buf->len - offset);
memset(p, 0, pad);
iov->iov_len += pad;
snd_buf->len += pad;
@@ -1924,73 +1906,125 @@ out:
return status;
}
-static int
-gss_unwrap_resp_auth(struct rpc_cred *cred)
+/**
+ * gss_update_rslack - Possibly update RPC receive buffer size estimates
+ * @task: rpc_task for incoming RPC Reply being unwrapped
+ * @cred: controlling rpc_cred for @task
+ * @before: XDR words needed before each RPC Reply message
+ * @after: XDR words needed following each RPC Reply message
+ *
+ */
+static void gss_update_rslack(struct rpc_task *task, struct rpc_cred *cred,
+ unsigned int before, unsigned int after)
{
struct rpc_auth *auth = cred->cr_auth;
- auth->au_rslack = auth->au_verfsize;
- auth->au_ralign = auth->au_verfsize;
- return 0;
+ if (test_and_clear_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags)) {
+ auth->au_ralign = auth->au_verfsize + before;
+ auth->au_rslack = auth->au_verfsize + after;
+ trace_rpcgss_update_slack(task, auth);
+ }
}
static int
+gss_unwrap_resp_auth(struct rpc_task *task, struct rpc_cred *cred)
+{
+ gss_update_rslack(task, cred, 0, 0);
+ return 0;
+}
+
+/*
+ * RFC 2203, Section 5.3.2.2
+ *
+ * struct rpc_gss_integ_data {
+ * opaque databody_integ<>;
+ * opaque checksum<>;
+ * };
+ *
+ * struct rpc_gss_data_t {
+ * unsigned int seq_num;
+ * proc_req_arg_t arg;
+ * };
+ */
+static noinline_for_stack int
gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
struct xdr_stream *xdr)
{
- struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf;
- u32 data_offset, mic_offset, integ_len, maj_stat;
- struct rpc_auth *auth = cred->cr_auth;
+ struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf;
+ u32 len, offset, seqno, maj_stat;
struct xdr_netobj mic;
- __be32 *p;
+ int ret;
- p = xdr_inline_decode(xdr, 2 * sizeof(*p));
- if (unlikely(!p))
+ ret = -EIO;
+ mic.data = NULL;
+
+ /* opaque databody_integ<>; */
+ if (xdr_stream_decode_u32(xdr, &len))
goto unwrap_failed;
- integ_len = be32_to_cpup(p++);
- if (integ_len & 3)
+ if (len & 3)
goto unwrap_failed;
- data_offset = (u8 *)(p) - (u8 *)rcv_buf->head[0].iov_base;
- mic_offset = integ_len + data_offset;
- if (mic_offset > rcv_buf->len)
+ offset = rcv_buf->len - xdr_stream_remaining(xdr);
+ if (xdr_stream_decode_u32(xdr, &seqno))
goto unwrap_failed;
- if (be32_to_cpup(p) != rqstp->rq_seqno)
+ if (seqno != rqstp->rq_seqno)
goto bad_seqno;
+ if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len))
+ goto unwrap_failed;
- if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
+ /*
+ * The xdr_stream now points to the beginning of the
+ * upper layer payload, to be passed below to
+ * rpcauth_unwrap_resp_decode(). The checksum, which
+ * follows the upper layer payload in @rcv_buf, is
+ * located and parsed without updating the xdr_stream.
+ */
+
+ /* opaque checksum<>; */
+ offset += len;
+ if (xdr_decode_word(rcv_buf, offset, &len))
goto unwrap_failed;
- if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset))
+ offset += sizeof(__be32);
+ if (offset + len > rcv_buf->len)
goto unwrap_failed;
- maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
+ mic.len = len;
+ mic.data = kmalloc(len, GFP_KERNEL);
+ if (ZERO_OR_NULL_PTR(mic.data))
+ goto unwrap_failed;
+ if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len))
+ goto unwrap_failed;
+
+ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
if (maj_stat != GSS_S_COMPLETE)
goto bad_mic;
- auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len);
- auth->au_ralign = auth->au_verfsize + 2;
- return 0;
+ gss_update_rslack(task, cred, 2, 2 + 1 + XDR_QUADLEN(mic.len));
+ ret = 0;
+
+out:
+ kfree(mic.data);
+ return ret;
+
unwrap_failed:
trace_rpcgss_unwrap_failed(task);
- return -EIO;
+ goto out;
bad_seqno:
- trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(p));
- return -EIO;
+ trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno);
+ goto out;
bad_mic:
trace_rpcgss_verify_mic(task, maj_stat);
- return -EIO;
+ goto out;
}
-static int
+static noinline_for_stack int
gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
struct xdr_stream *xdr)
{
struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
struct kvec *head = rqstp->rq_rcv_buf.head;
- struct rpc_auth *auth = cred->cr_auth;
- unsigned int savedlen = rcv_buf->len;
u32 offset, opaque_len, maj_stat;
__be32 *p;
@@ -2001,9 +2035,9 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
offset = (u8 *)(p) - (u8 *)head->iov_base;
if (offset + opaque_len > rcv_buf->len)
goto unwrap_failed;
- rcv_buf->len = offset + opaque_len;
- maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf);
+ maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset,
+ offset + opaque_len, rcv_buf);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
if (maj_stat != GSS_S_COMPLETE)
@@ -2017,10 +2051,9 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
*/
xdr_init_decode(xdr, rcv_buf, p, rqstp);
- auth->au_rslack = auth->au_verfsize + 2 +
- XDR_QUADLEN(savedlen - rcv_buf->len);
- auth->au_ralign = auth->au_verfsize + 2 +
- XDR_QUADLEN(savedlen - rcv_buf->len);
+ gss_update_rslack(task, cred, 2 + ctx->gc_gss_ctx->align,
+ 2 + ctx->gc_gss_ctx->slack);
+
return 0;
unwrap_failed:
trace_rpcgss_unwrap_failed(task);
@@ -2090,7 +2123,7 @@ gss_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr)
goto out_decode;
switch (gss_cred->gc_service) {
case RPC_GSS_SVC_NONE:
- status = gss_unwrap_resp_auth(cred);
+ status = gss_unwrap_resp_auth(task, cred);
break;
case RPC_GSS_SVC_INTEGRITY:
status = gss_unwrap_resp_integ(task, cred, ctx, rqstp, xdr);
diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h
new file mode 100644
index 000000000000..c53b329092d4
--- /dev/null
+++ b/net/sunrpc/auth_gss/auth_gss_internal.h
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*
+ * linux/net/sunrpc/auth_gss/auth_gss_internal.h
+ *
+ * Internal definitions for RPCSEC_GSS client authentication
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ */
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/sunrpc/xdr.h>
+
+static inline const void *
+simple_get_bytes(const void *p, const void *end, void *res, size_t len)
+{
+ const void *q = (const void *)((const char *)p + len);
+ if (unlikely(q > end || q < p))
+ return ERR_PTR(-EFAULT);
+ memcpy(res, p, len);
+ return q;
+}
+
+static inline const void *
+simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
+{
+ const void *q;
+ unsigned int len;
+
+ p = simple_get_bytes(p, end, &len, sizeof(len));
+ if (IS_ERR(p))
+ return p;
+ q = (const void *)((const char *)p + len);
+ if (unlikely(q > end || q < p))
+ return ERR_PTR(-EFAULT);
+ if (len) {
+ dest->data = kmemdup(p, len, GFP_KERNEL);
+ if (unlikely(dest->data == NULL))
+ return ERR_PTR(-ENOMEM);
+ } else
+ dest->data = NULL;
+ dest->len = len;
+ return q;
+}
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
index fe97f3106536..4a4082bb22ad 100644
--- a/net/sunrpc/auth_gss/gss_generic_token.c
+++ b/net/sunrpc/auth_gss/gss_generic_token.c
@@ -222,10 +222,8 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
if (ret)
return ret;
- if (!ret) {
- *buf_in = buf;
- *body_size = toksize;
- }
+ *buf_in = buf;
+ *body_size = toksize;
return ret;
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 6f2d30d7b766..3ea58175e159 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -138,135 +138,6 @@ checksummer(struct scatterlist *sg, void *data)
return crypto_ahash_update(req);
}
-static int
-arcfour_hmac_md5_usage_to_salt(unsigned int usage, u8 salt[4])
-{
- unsigned int ms_usage;
-
- switch (usage) {
- case KG_USAGE_SIGN:
- ms_usage = 15;
- break;
- case KG_USAGE_SEAL:
- ms_usage = 13;
- break;
- default:
- return -EINVAL;
- }
- salt[0] = (ms_usage >> 0) & 0xff;
- salt[1] = (ms_usage >> 8) & 0xff;
- salt[2] = (ms_usage >> 16) & 0xff;
- salt[3] = (ms_usage >> 24) & 0xff;
-
- return 0;
-}
-
-static u32
-make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
- struct xdr_buf *body, int body_offset, u8 *cksumkey,
- unsigned int usage, struct xdr_netobj *cksumout)
-{
- struct scatterlist sg[1];
- int err = -1;
- u8 *checksumdata;
- u8 *rc4salt;
- struct crypto_ahash *md5;
- struct crypto_ahash *hmac_md5;
- struct ahash_request *req;
-
- if (cksumkey == NULL)
- return GSS_S_FAILURE;
-
- if (cksumout->len < kctx->gk5e->cksumlength) {
- dprintk("%s: checksum buffer length, %u, too small for %s\n",
- __func__, cksumout->len, kctx->gk5e->name);
- return GSS_S_FAILURE;
- }
-
- rc4salt = kmalloc_array(4, sizeof(*rc4salt), GFP_NOFS);
- if (!rc4salt)
- return GSS_S_FAILURE;
-
- if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) {
- dprintk("%s: invalid usage value %u\n", __func__, usage);
- goto out_free_rc4salt;
- }
-
- checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
- if (!checksumdata)
- goto out_free_rc4salt;
-
- md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(md5))
- goto out_free_cksum;
-
- hmac_md5 = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(hmac_md5))
- goto out_free_md5;
-
- req = ahash_request_alloc(md5, GFP_NOFS);
- if (!req)
- goto out_free_hmac_md5;
-
- ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
-
- err = crypto_ahash_init(req);
- if (err)
- goto out;
- sg_init_one(sg, rc4salt, 4);
- ahash_request_set_crypt(req, sg, NULL, 4);
- err = crypto_ahash_update(req);
- if (err)
- goto out;
-
- sg_init_one(sg, header, hdrlen);
- ahash_request_set_crypt(req, sg, NULL, hdrlen);
- err = crypto_ahash_update(req);
- if (err)
- goto out;
- err = xdr_process_buf(body, body_offset, body->len - body_offset,
- checksummer, req);
- if (err)
- goto out;
- ahash_request_set_crypt(req, NULL, checksumdata, 0);
- err = crypto_ahash_final(req);
- if (err)
- goto out;
-
- ahash_request_free(req);
- req = ahash_request_alloc(hmac_md5, GFP_NOFS);
- if (!req)
- goto out_free_hmac_md5;
-
- ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
-
- err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
- if (err)
- goto out;
-
- sg_init_one(sg, checksumdata, crypto_ahash_digestsize(md5));
- ahash_request_set_crypt(req, sg, checksumdata,
- crypto_ahash_digestsize(md5));
- err = crypto_ahash_digest(req);
- if (err)
- goto out;
-
- memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
- cksumout->len = kctx->gk5e->cksumlength;
-out:
- ahash_request_free(req);
-out_free_hmac_md5:
- crypto_free_ahash(hmac_md5);
-out_free_md5:
- crypto_free_ahash(md5);
-out_free_cksum:
- kfree(checksumdata);
-out_free_rc4salt:
- kfree(rc4salt);
- return err ? GSS_S_FAILURE : 0;
-}
-
/*
* checksum the plaintext data and hdrlen bytes of the token header
* The checksum is performed over the first 8 bytes of the
@@ -284,18 +155,13 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
u8 *checksumdata;
unsigned int checksumlen;
- if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR)
- return make_checksum_hmac_md5(kctx, header, hdrlen,
- body, body_offset,
- cksumkey, usage, cksumout);
-
if (cksumout->len < kctx->gk5e->cksumlength) {
dprintk("%s: checksum buffer length, %u, too small for %s\n",
__func__, cksumout->len, kctx->gk5e->name);
return GSS_S_FAILURE;
}
- checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
+ checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_KERNEL);
if (checksumdata == NULL)
return GSS_S_FAILURE;
@@ -303,7 +169,7 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
if (IS_ERR(tfm))
goto out_free_cksum;
- req = ahash_request_alloc(tfm, GFP_NOFS);
+ req = ahash_request_alloc(tfm, GFP_KERNEL);
if (!req)
goto out_free_ahash;
@@ -391,7 +257,7 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
return GSS_S_FAILURE;
}
- checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
+ checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_KERNEL);
if (!checksumdata)
return GSS_S_FAILURE;
@@ -399,7 +265,7 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
if (IS_ERR(tfm))
goto out_free_cksum;
- req = ahash_request_alloc(tfm, GFP_NOFS);
+ req = ahash_request_alloc(tfm, GFP_KERNEL);
if (!req)
goto out_free_ahash;
@@ -688,7 +554,7 @@ gss_krb5_cts_crypt(struct crypto_sync_skcipher *cipher, struct xdr_buf *buf,
WARN_ON(0);
return -ENOMEM;
}
- data = kmalloc(GSS_KRB5_MAX_BLOCKSIZE * 2, GFP_NOFS);
+ data = kmalloc(GSS_KRB5_MAX_BLOCKSIZE * 2, GFP_KERNEL);
if (!data)
return -ENOMEM;
@@ -851,8 +717,8 @@ out_err:
}
u32
-gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
- u32 *headskip, u32 *tailskip)
+gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, u32 len,
+ struct xdr_buf *buf, u32 *headskip, u32 *tailskip)
{
struct xdr_buf subbuf;
u32 ret = 0;
@@ -881,7 +747,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
/* create a segment skipping the header and leaving out the checksum */
xdr_buf_subsegment(buf, &subbuf, offset + GSS_KRB5_TOK_HDR_LEN,
- (buf->len - offset - GSS_KRB5_TOK_HDR_LEN -
+ (len - offset - GSS_KRB5_TOK_HDR_LEN -
kctx->gk5e->cksumlength));
nblocks = (subbuf.len + blocksize - 1) / blocksize;
@@ -926,7 +792,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
goto out_err;
/* Get the packet's hmac value */
- ret = read_bytes_from_xdr_buf(buf, buf->len - kctx->gk5e->cksumlength,
+ ret = read_bytes_from_xdr_buf(buf, len - kctx->gk5e->cksumlength,
pkt_hmac, kctx->gk5e->cksumlength);
if (ret)
goto out_err;
@@ -942,145 +808,3 @@ out_err:
ret = GSS_S_FAILURE;
return ret;
}
-
-/*
- * Compute Kseq given the initial session key and the checksum.
- * Set the key of the given cipher.
- */
-int
-krb5_rc4_setup_seq_key(struct krb5_ctx *kctx,
- struct crypto_sync_skcipher *cipher,
- unsigned char *cksum)
-{
- struct crypto_shash *hmac;
- struct shash_desc *desc;
- u8 Kseq[GSS_KRB5_MAX_KEYLEN];
- u32 zeroconstant = 0;
- int err;
-
- dprintk("%s: entered\n", __func__);
-
- hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0);
- if (IS_ERR(hmac)) {
- dprintk("%s: error %ld, allocating hash '%s'\n",
- __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
- return PTR_ERR(hmac);
- }
-
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_NOFS);
- if (!desc) {
- dprintk("%s: failed to allocate shash descriptor for '%s'\n",
- __func__, kctx->gk5e->cksum_name);
- crypto_free_shash(hmac);
- return -ENOMEM;
- }
-
- desc->tfm = hmac;
-
- /* Compute intermediate Kseq from session key */
- err = crypto_shash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = crypto_shash_digest(desc, (u8 *)&zeroconstant, 4, Kseq);
- if (err)
- goto out_err;
-
- /* Compute final Kseq from the checksum and intermediate Kseq */
- err = crypto_shash_setkey(hmac, Kseq, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = crypto_shash_digest(desc, cksum, 8, Kseq);
- if (err)
- goto out_err;
-
- err = crypto_sync_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = 0;
-
-out_err:
- kzfree(desc);
- crypto_free_shash(hmac);
- dprintk("%s: returning %d\n", __func__, err);
- return err;
-}
-
-/*
- * Compute Kcrypt given the initial session key and the plaintext seqnum.
- * Set the key of cipher kctx->enc.
- */
-int
-krb5_rc4_setup_enc_key(struct krb5_ctx *kctx,
- struct crypto_sync_skcipher *cipher,
- s32 seqnum)
-{
- struct crypto_shash *hmac;
- struct shash_desc *desc;
- u8 Kcrypt[GSS_KRB5_MAX_KEYLEN];
- u8 zeroconstant[4] = {0};
- u8 seqnumarray[4];
- int err, i;
-
- dprintk("%s: entered, seqnum %u\n", __func__, seqnum);
-
- hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0);
- if (IS_ERR(hmac)) {
- dprintk("%s: error %ld, allocating hash '%s'\n",
- __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
- return PTR_ERR(hmac);
- }
-
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_NOFS);
- if (!desc) {
- dprintk("%s: failed to allocate shash descriptor for '%s'\n",
- __func__, kctx->gk5e->cksum_name);
- crypto_free_shash(hmac);
- return -ENOMEM;
- }
-
- desc->tfm = hmac;
-
- /* Compute intermediate Kcrypt from session key */
- for (i = 0; i < kctx->gk5e->keylength; i++)
- Kcrypt[i] = kctx->Ksess[i] ^ 0xf0;
-
- err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = crypto_shash_digest(desc, zeroconstant, 4, Kcrypt);
- if (err)
- goto out_err;
-
- /* Compute final Kcrypt from the seqnum and intermediate Kcrypt */
- err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- seqnumarray[0] = (unsigned char) ((seqnum >> 24) & 0xff);
- seqnumarray[1] = (unsigned char) ((seqnum >> 16) & 0xff);
- seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff);
- seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff);
-
- err = crypto_shash_digest(desc, seqnumarray, 4, Kcrypt);
- if (err)
- goto out_err;
-
- err = crypto_sync_skcipher_setkey(cipher, Kcrypt,
- kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = 0;
-
-out_err:
- kzfree(desc);
- crypto_free_shash(hmac);
- dprintk("%s: returning %d\n", __func__, err);
- return err;
-}
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 3b7f721c023b..726c076950c0 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -228,11 +228,11 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
ret = 0;
err_free_raw:
- kzfree(rawkey);
+ kfree_sensitive(rawkey);
err_free_out:
- kzfree(outblockdata);
+ kfree_sensitive(outblockdata);
err_free_in:
- kzfree(inblockdata);
+ kfree_sensitive(inblockdata);
err_free_cipher:
crypto_free_sync_skcipher(cipher);
err_return:
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 75b3c2e9e8f8..1c092b05c2bb 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -21,6 +21,8 @@
#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/gss_krb5_enctypes.h>
+#include "auth_gss_internal.h"
+
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_AUTH
#endif
@@ -52,27 +54,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = {
},
#endif /* CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES */
/*
- * RC4-HMAC
- */
- {
- .etype = ENCTYPE_ARCFOUR_HMAC,
- .ctype = CKSUMTYPE_HMAC_MD5_ARCFOUR,
- .name = "rc4-hmac",
- .encrypt_name = "ecb(arc4)",
- .cksum_name = "hmac(md5)",
- .encrypt = krb5_encrypt,
- .decrypt = krb5_decrypt,
- .mk_key = NULL,
- .signalg = SGN_ALG_HMAC_MD5,
- .sealalg = SEAL_ALG_MICROSOFT_RC4,
- .keybytes = 16,
- .keylength = 16,
- .blocksize = 1,
- .conflen = 8,
- .cksumlength = 8,
- .keyed_cksum = 1,
- },
- /*
* 3DES
*/
{
@@ -164,35 +145,6 @@ get_gss_krb5_enctype(int etype)
return NULL;
}
-static const void *
-simple_get_bytes(const void *p, const void *end, void *res, int len)
-{
- const void *q = (const void *)((const char *)p + len);
- if (unlikely(q > end || q < p))
- return ERR_PTR(-EFAULT);
- memcpy(res, p, len);
- return q;
-}
-
-static const void *
-simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res)
-{
- const void *q;
- unsigned int len;
-
- p = simple_get_bytes(p, end, &len, sizeof(len));
- if (IS_ERR(p))
- return p;
- q = (const void *)((const char *)p + len);
- if (unlikely(q > end || q < p))
- return ERR_PTR(-EFAULT);
- res->data = kmemdup(p, len, GFP_NOFS);
- if (unlikely(res->data == NULL))
- return ERR_PTR(-ENOMEM);
- res->len = len;
- return q;
-}
-
static inline const void *
get_key(const void *p, const void *end,
struct krb5_ctx *ctx, struct crypto_sync_skcipher **res)
@@ -401,78 +353,6 @@ out_err:
return -EINVAL;
}
-/*
- * Note that RC4 depends on deriving keys using the sequence
- * number or the checksum of a token. Therefore, the final keys
- * cannot be calculated until the token is being constructed!
- */
-static int
-context_derive_keys_rc4(struct krb5_ctx *ctx)
-{
- struct crypto_shash *hmac;
- char sigkeyconstant[] = "signaturekey";
- int slen = strlen(sigkeyconstant) + 1; /* include null terminator */
- struct shash_desc *desc;
- int err;
-
- dprintk("RPC: %s: entered\n", __func__);
- /*
- * derive cksum (aka Ksign) key
- */
- hmac = crypto_alloc_shash(ctx->gk5e->cksum_name, 0, 0);
- if (IS_ERR(hmac)) {
- dprintk("%s: error %ld allocating hash '%s'\n",
- __func__, PTR_ERR(hmac), ctx->gk5e->cksum_name);
- err = PTR_ERR(hmac);
- goto out_err;
- }
-
- err = crypto_shash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength);
- if (err)
- goto out_err_free_hmac;
-
-
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), GFP_NOFS);
- if (!desc) {
- dprintk("%s: failed to allocate hash descriptor for '%s'\n",
- __func__, ctx->gk5e->cksum_name);
- err = -ENOMEM;
- goto out_err_free_hmac;
- }
-
- desc->tfm = hmac;
-
- err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum);
- kzfree(desc);
- if (err)
- goto out_err_free_hmac;
- /*
- * allocate hash, and skciphers for data and seqnum encryption
- */
- ctx->enc = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(ctx->enc)) {
- err = PTR_ERR(ctx->enc);
- goto out_err_free_hmac;
- }
-
- ctx->seq = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(ctx->seq)) {
- crypto_free_sync_skcipher(ctx->enc);
- err = PTR_ERR(ctx->seq);
- goto out_err_free_hmac;
- }
-
- dprintk("RPC: %s: returning success\n", __func__);
-
- err = 0;
-
-out_err_free_hmac:
- crypto_free_shash(hmac);
-out_err:
- dprintk("RPC: %s: returning %d\n", __func__, err);
- return err;
-}
-
static int
context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask)
{
@@ -649,8 +529,6 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
switch (ctx->enctype) {
case ENCTYPE_DES3_CBC_RAW:
return context_derive_keys_des3(ctx, gfp_mask);
- case ENCTYPE_ARCFOUR_HMAC:
- return context_derive_keys_rc4(ctx);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
return context_derive_keys_new(ctx, gfp_mask);
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index f1d280accf43..33061417ec97 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -214,7 +214,6 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
return gss_get_mic_v1(ctx, text, token);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index 507105127095..3200b971a814 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -39,42 +39,6 @@
# define RPCDBG_FACILITY RPCDBG_AUTH
#endif
-static s32
-krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
- unsigned char *cksum, unsigned char *buf)
-{
- struct crypto_sync_skcipher *cipher;
- unsigned char *plain;
- s32 code;
-
- dprintk("RPC: %s:\n", __func__);
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(cipher))
- return PTR_ERR(cipher);
-
- plain = kmalloc(8, GFP_NOFS);
- if (!plain)
- return -ENOMEM;
-
- plain[0] = (unsigned char) ((seqnum >> 24) & 0xff);
- plain[1] = (unsigned char) ((seqnum >> 16) & 0xff);
- plain[2] = (unsigned char) ((seqnum >> 8) & 0xff);
- plain[3] = (unsigned char) ((seqnum >> 0) & 0xff);
- plain[4] = direction;
- plain[5] = direction;
- plain[6] = direction;
- plain[7] = direction;
-
- code = krb5_rc4_setup_seq_key(kctx, cipher, cksum);
- if (code)
- goto out;
-
- code = krb5_encrypt(cipher, cksum, plain, buf, 8);
-out:
- kfree(plain);
- crypto_free_sync_skcipher(cipher);
- return code;
-}
s32
krb5_make_seq_num(struct krb5_ctx *kctx,
struct crypto_sync_skcipher *key,
@@ -85,11 +49,7 @@ krb5_make_seq_num(struct krb5_ctx *kctx,
unsigned char *plain;
s32 code;
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
- return krb5_make_rc4_seq_num(kctx, direction, seqnum,
- cksum, buf);
-
- plain = kmalloc(8, GFP_NOFS);
+ plain = kmalloc(8, GFP_KERNEL);
if (!plain)
return -ENOMEM;
@@ -108,50 +68,6 @@ krb5_make_seq_num(struct krb5_ctx *kctx,
return code;
}
-static s32
-krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
- unsigned char *buf, int *direction, s32 *seqnum)
-{
- struct crypto_sync_skcipher *cipher;
- unsigned char *plain;
- s32 code;
-
- dprintk("RPC: %s:\n", __func__);
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(cipher))
- return PTR_ERR(cipher);
-
- code = krb5_rc4_setup_seq_key(kctx, cipher, cksum);
- if (code)
- goto out;
-
- plain = kmalloc(8, GFP_NOFS);
- if (!plain) {
- code = -ENOMEM;
- goto out;
- }
-
- code = krb5_decrypt(cipher, cksum, buf, plain, 8);
- if (code)
- goto out_plain;
-
- if ((plain[4] != plain[5]) || (plain[4] != plain[6])
- || (plain[4] != plain[7])) {
- code = (s32)KG_BAD_SEQ;
- goto out_plain;
- }
-
- *direction = plain[4];
-
- *seqnum = ((plain[0] << 24) | (plain[1] << 16) |
- (plain[2] << 8) | (plain[3]));
-out_plain:
- kfree(plain);
-out:
- crypto_free_sync_skcipher(cipher);
- return code;
-}
-
s32
krb5_get_seq_num(struct krb5_ctx *kctx,
unsigned char *cksum,
@@ -164,10 +80,7 @@ krb5_get_seq_num(struct krb5_ctx *kctx,
dprintk("RPC: krb5_get_seq_num:\n");
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
- return krb5_get_rc4_seq_num(kctx, cksum, buf,
- direction, seqnum);
- plain = kmalloc(8, GFP_NOFS);
+ plain = kmalloc(8, GFP_KERNEL);
if (!plain)
return -ENOMEM;
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index aaab91cf24c8..ba04e3ec970a 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -218,7 +218,6 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
return gss_verify_mic_v1(ctx, message_buffer, read_token);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 6c1920eed771..48337687848c 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -130,14 +130,14 @@ gss_krb5_make_confounder(char *p, u32 conflen)
/* initialize to random value */
if (i == 0) {
- i = prandom_u32();
- i = (i << 32) | prandom_u32();
+ i = get_random_u32();
+ i = (i << 32) | get_random_u32();
}
switch (conflen) {
case 16:
*q++ = i++;
- /* fall through */
+ fallthrough;
case 8:
*q++ = i++;
break;
@@ -236,32 +236,17 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)))
return GSS_S_FAILURE;
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
- struct crypto_sync_skcipher *cipher;
- int err;
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name,
- 0, 0);
- if (IS_ERR(cipher))
- return GSS_S_FAILURE;
-
- krb5_rc4_setup_enc_key(kctx, cipher, seq_send);
-
- err = gss_encrypt_xdr_buf(cipher, buf,
- offset + headlen - conflen, pages);
- crypto_free_sync_skcipher(cipher);
- if (err)
- return GSS_S_FAILURE;
- } else {
- if (gss_encrypt_xdr_buf(kctx->enc, buf,
- offset + headlen - conflen, pages))
- return GSS_S_FAILURE;
- }
+ if (gss_encrypt_xdr_buf(kctx->enc, buf,
+ offset + headlen - conflen, pages))
+ return GSS_S_FAILURE;
return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
}
static u32
-gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
+gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, int len,
+ struct xdr_buf *buf, unsigned int *slack,
+ unsigned int *align)
{
int signalg;
int sealalg;
@@ -279,12 +264,13 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
u32 conflen = kctx->gk5e->conflen;
int crypt_offset;
u8 *cksumkey;
+ unsigned int saved_len = buf->len;
dprintk("RPC: gss_unwrap_kerberos\n");
ptr = (u8 *)buf->head[0].iov_base + offset;
if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr,
- buf->len - offset))
+ len - offset))
return GSS_S_DEFECTIVE_TOKEN;
if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) ||
@@ -313,36 +299,9 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) -
(unsigned char *)buf->head[0].iov_base;
- /*
- * Need plaintext seqnum to derive encryption key for arcfour-hmac
- */
- if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN,
- ptr + 8, &direction, &seqnum))
- return GSS_S_BAD_SIG;
-
- if ((kctx->initiate && direction != 0xff) ||
- (!kctx->initiate && direction != 0))
- return GSS_S_BAD_SIG;
-
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
- struct crypto_sync_skcipher *cipher;
- int err;
-
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name,
- 0, 0);
- if (IS_ERR(cipher))
- return GSS_S_FAILURE;
-
- krb5_rc4_setup_enc_key(kctx, cipher, seqnum);
-
- err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset);
- crypto_free_sync_skcipher(cipher);
- if (err)
- return GSS_S_DEFECTIVE_TOKEN;
- } else {
- if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset))
- return GSS_S_DEFECTIVE_TOKEN;
- }
+ buf->len = len;
+ if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset))
+ return GSS_S_DEFECTIVE_TOKEN;
if (kctx->gk5e->keyed_cksum)
cksumkey = kctx->cksum;
@@ -366,6 +325,14 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
/* do sequencing checks */
+ if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN,
+ ptr + 8, &direction, &seqnum))
+ return GSS_S_BAD_SIG;
+
+ if ((kctx->initiate && direction != 0xff) ||
+ (!kctx->initiate && direction != 0))
+ return GSS_S_BAD_SIG;
+
/* Copy the data back to the right position. XXX: Would probably be
* better to copy and encrypt at the same time. */
@@ -376,11 +343,15 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start;
memmove(orig_start, data_start, data_len);
buf->head[0].iov_len -= (data_start - orig_start);
- buf->len -= (data_start - orig_start);
+ buf->len = len - (data_start - orig_start);
if (gss_krb5_remove_padding(buf, blocksize))
return GSS_S_DEFECTIVE_TOKEN;
+ /* slack must include room for krb5 padding */
+ *slack = XDR_QUADLEN(saved_len - buf->len);
+ /* The GSS blob always precedes the RPC message payload */
+ *align = *slack;
return GSS_S_COMPLETE;
}
@@ -438,7 +409,7 @@ static u32
gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
struct xdr_buf *buf, struct page **pages)
{
- u8 *ptr, *plainhdr;
+ u8 *ptr;
time64_t now;
u8 flags = 0x00;
__be16 *be16ptr;
@@ -455,7 +426,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
return GSS_S_FAILURE;
/* construct gss token header */
- ptr = plainhdr = buf->head[0].iov_base + offset;
+ ptr = buf->head[0].iov_base + offset;
*ptr++ = (unsigned char) ((KG2_TOK_WRAP>>8) & 0xff);
*ptr++ = (unsigned char) (KG2_TOK_WRAP & 0xff);
@@ -486,7 +457,9 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
}
static u32
-gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
+gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len,
+ struct xdr_buf *buf, unsigned int *slack,
+ unsigned int *align)
{
time64_t now;
u8 *ptr;
@@ -532,7 +505,7 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
if (rrc != 0)
rotate_left(offset + 16, buf, rrc);
- err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf,
+ err = (*kctx->gk5e->decrypt_v2)(kctx, offset, len, buf,
&headskip, &tailskip);
if (err)
return GSS_S_FAILURE;
@@ -542,7 +515,7 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
* it against the original
*/
err = read_bytes_from_xdr_buf(buf,
- buf->len - GSS_KRB5_TOK_HDR_LEN - tailskip,
+ len - GSS_KRB5_TOK_HDR_LEN - tailskip,
decrypted_hdr, GSS_KRB5_TOK_HDR_LEN);
if (err) {
dprintk("%s: error %u getting decrypted_hdr\n", __func__, err);
@@ -568,18 +541,19 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
* Note that buf->head[0].iov_len may indicate the available
* head buffer space rather than that actually occupied.
*/
- movelen = min_t(unsigned int, buf->head[0].iov_len, buf->len);
+ movelen = min_t(unsigned int, buf->head[0].iov_len, len);
movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip;
- if (offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen >
- buf->head[0].iov_len)
- return GSS_S_FAILURE;
+ BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen >
+ buf->head[0].iov_len);
memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen);
buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip;
- buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip;
+ buf->len = len - (GSS_KRB5_TOK_HDR_LEN + headskip);
/* Trim off the trailing "extra count" and checksum blob */
- buf->len -= ec + GSS_KRB5_TOK_HDR_LEN + tailskip;
+ xdr_buf_trim(buf, ec + GSS_KRB5_TOK_HDR_LEN + tailskip);
+ *align = XDR_QUADLEN(GSS_KRB5_TOK_HDR_LEN + headskip);
+ *slack = *align + XDR_QUADLEN(ec + GSS_KRB5_TOK_HDR_LEN + tailskip);
return GSS_S_COMPLETE;
}
@@ -594,7 +568,6 @@ gss_wrap_kerberos(struct gss_ctx *gctx, int offset,
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
return gss_wrap_kerberos_v1(kctx, offset, buf, pages);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
@@ -603,7 +576,8 @@ gss_wrap_kerberos(struct gss_ctx *gctx, int offset,
}
u32
-gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf)
+gss_unwrap_kerberos(struct gss_ctx *gctx, int offset,
+ int len, struct xdr_buf *buf)
{
struct krb5_ctx *kctx = gctx->internal_ctx_id;
@@ -612,10 +586,11 @@ gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf)
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
- return gss_unwrap_kerberos_v1(kctx, offset, buf);
+ return gss_unwrap_kerberos_v1(kctx, offset, len, buf,
+ &gctx->slack, &gctx->align);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
- return gss_unwrap_kerberos_v2(kctx, offset, buf);
+ return gss_unwrap_kerberos_v2(kctx, offset, len, buf,
+ &gctx->slack, &gctx->align);
}
}
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index db550bfc2642..fae632da1058 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -37,6 +37,8 @@ gss_mech_free(struct gss_api_mech *gm)
for (i = 0; i < gm->gm_pf_num; i++) {
pf = &gm->gm_pfs[i];
+ if (pf->domain)
+ auth_domain_put(pf->domain);
kfree(pf->auth_domain_name);
pf->auth_domain_name = NULL;
}
@@ -59,6 +61,7 @@ make_auth_domain_name(char *name)
static int
gss_mech_svc_setup(struct gss_api_mech *gm)
{
+ struct auth_domain *dom;
struct pf_desc *pf;
int i, status;
@@ -68,10 +71,13 @@ gss_mech_svc_setup(struct gss_api_mech *gm)
status = -ENOMEM;
if (pf->auth_domain_name == NULL)
goto out;
- status = svcauth_gss_register_pseudoflavor(pf->pseudoflavor,
- pf->auth_domain_name);
- if (status)
+ dom = svcauth_gss_register_pseudoflavor(
+ pf->pseudoflavor, pf->auth_domain_name);
+ if (IS_ERR(dom)) {
+ status = PTR_ERR(dom);
goto out;
+ }
+ pf->domain = dom;
}
return 0;
out:
@@ -411,10 +417,11 @@ gss_wrap(struct gss_ctx *ctx_id,
u32
gss_unwrap(struct gss_ctx *ctx_id,
int offset,
+ int len,
struct xdr_buf *buf)
{
return ctx_id->mech_type->gm_ops
- ->gss_unwrap(ctx_id, offset, buf);
+ ->gss_unwrap(ctx_id, offset, len, buf);
}
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index 0349f455a862..f549e4c05def 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -98,6 +98,7 @@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt)
* done without the correct namespace:
*/
.flags = RPC_CLNT_CREATE_NOPING |
+ RPC_CLNT_CREATE_CONNECTED |
RPC_CLNT_CREATE_NO_IDLE_TIMEOUT
};
struct rpc_clnt *clnt;
@@ -160,7 +161,7 @@ static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn)
mutex_lock(&sn->gssp_lock);
clnt = sn->gssp_clnt;
if (clnt)
- atomic_inc(&clnt->cl_count);
+ refcount_inc(&clnt->cl_count);
mutex_unlock(&sn->gssp_lock);
return clnt;
}
@@ -200,7 +201,7 @@ static int gssp_call(struct net *net, struct rpc_message *msg)
static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg)
{
- int i;
+ unsigned int i;
for (i = 0; i < arg->npages && arg->pages[i]; i++)
__free_page(arg->pages[i]);
@@ -210,20 +211,25 @@ static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg)
static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg)
{
+ unsigned int i;
+
arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE);
arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL);
- /*
- * XXX: actual pages are allocated by xdr layer in
- * xdr_partial_copy_from_skb.
- */
if (!arg->pages)
return -ENOMEM;
+ for (i = 0; i < arg->npages; i++) {
+ arg->pages[i] = alloc_page(GFP_KERNEL);
+ if (!arg->pages[i]) {
+ gssp_free_receive_pages(arg);
+ return -ENOMEM;
+ }
+ }
return 0;
}
static char *gssp_stringify(struct xdr_netobj *netobj)
{
- return kstrndup(netobj->data, netobj->len, GFP_KERNEL);
+ return kmemdup_nul(netobj->data, netobj->len, GFP_KERNEL);
}
static void gssp_hostbased_service(char **principal)
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 2ff7b7083eba..d79f12c2550a 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -771,7 +771,6 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req,
xdr_inline_pages(&req->rq_rcv_buf,
PAGE_SIZE/2 /* pretty arbitrary */,
arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE);
- req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
done:
if (err)
dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err);
@@ -789,7 +788,7 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
scratch = alloc_page(GFP_KERNEL);
if (!scratch)
return -ENOMEM;
- xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE);
+ xdr_set_scratch_page(xdr, scratch);
/* res->status */
err = gssx_dec_status(xdr, &res->status);
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 65b67b257302..bcd74dddbe2d 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -55,10 +55,6 @@
#include "gss_rpc_upcall.h"
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_AUTH
-#endif
-
/* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests
* into replies.
*
@@ -184,6 +180,11 @@ static struct cache_head *rsi_alloc(void)
return NULL;
}
+static int rsi_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall_timeout(cd, h);
+}
+
static void rsi_request(struct cache_detail *cd,
struct cache_head *h,
char **bpp, int *blen)
@@ -193,6 +194,8 @@ static void rsi_request(struct cache_detail *cd,
qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len);
qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len);
(*bpp)[-1] = '\n';
+ WARN_ONCE(*blen < 0,
+ "RPCSEC/GSS credential too large - please use gssproxy\n");
}
static int rsi_parse(struct cache_detail *cd,
@@ -282,6 +285,7 @@ static const struct cache_detail rsi_cache_template = {
.hash_size = RSI_HASHMAX,
.name = "auth.rpcsec.init",
.cache_put = rsi_put,
+ .cache_upcall = rsi_upcall,
.cache_request = rsi_request,
.cache_parse = rsi_parse,
.match = rsi_match,
@@ -330,7 +334,7 @@ static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct r
struct gss_svc_seq_data {
/* highest seq number seen so far: */
- int sd_max;
+ u32 sd_max;
/* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of
* sd_win is nonzero iff sequence number i has been seen already: */
unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG];
@@ -428,6 +432,11 @@ rsc_alloc(void)
return NULL;
}
+static int rsc_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return -EINVAL;
+}
+
static int rsc_parse(struct cache_detail *cd,
char *mesg, int mlen)
{
@@ -554,6 +563,7 @@ static const struct cache_detail rsc_cache_template = {
.hash_size = RSC_HASHMAX,
.name = "auth.rpcsec.context",
.cache_put = rsc_put,
+ .cache_upcall = rsc_upcall,
.cache_parse = rsc_parse,
.match = rsc_match,
.init = rsc_init,
@@ -605,16 +615,29 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle)
return found;
}
-/* Implements sequence number algorithm as specified in RFC 2203. */
-static int
-gss_check_seq_num(struct rsc *rsci, int seq_num)
+/**
+ * gss_check_seq_num - GSS sequence number window check
+ * @rqstp: RPC Call to use when reporting errors
+ * @rsci: cached GSS context state (updated on return)
+ * @seq_num: sequence number to check
+ *
+ * Implements sequence number algorithm as specified in
+ * RFC 2203, Section 5.3.3.1. "Context Management".
+ *
+ * Return values:
+ * %true: @rqstp's GSS sequence number is inside the window
+ * %false: @rqstp's GSS sequence number is outside the window
+ */
+static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci,
+ u32 seq_num)
{
struct gss_svc_seq_data *sd = &rsci->seqdata;
+ bool result = false;
spin_lock(&sd->sd_lock);
if (seq_num > sd->sd_max) {
if (seq_num >= sd->sd_max + GSS_SEQ_WIN) {
- memset(sd->sd_win,0,sizeof(sd->sd_win));
+ memset(sd->sd_win, 0, sizeof(sd->sd_win));
sd->sd_max = seq_num;
} else while (sd->sd_max < seq_num) {
sd->sd_max++;
@@ -622,18 +645,26 @@ gss_check_seq_num(struct rsc *rsci, int seq_num)
}
__set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win);
goto ok;
- } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) {
- goto drop;
+ } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) {
+ goto toolow;
}
- /* sd_max - GSS_SEQ_WIN < seq_num <= sd_max */
if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win))
- goto drop;
+ goto alreadyseen;
+
ok:
+ result = true;
+out:
spin_unlock(&sd->sd_lock);
- return 1;
-drop:
- spin_unlock(&sd->sd_lock);
- return 0;
+ return result;
+
+toolow:
+ trace_rpcgss_svc_seqno_low(rqstp, seq_num,
+ sd->sd_max - GSS_SEQ_WIN,
+ sd->sd_max);
+ goto out;
+alreadyseen:
+ trace_rpcgss_svc_seqno_seen(rqstp, seq_num);
+ goto out;
}
static inline u32 round_up_to_quad(u32 i)
@@ -678,11 +709,11 @@ svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o)
/*
* Verify the checksum on the header and return SVC_OK on success.
* Otherwise, return SVC_DROP (in the case of a bad sequence number)
- * or return SVC_DENIED and indicate error in authp.
+ * or return SVC_DENIED and indicate error in rqstp->rq_auth_stat.
*/
static int
gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci,
- __be32 *rpcstart, struct rpc_gss_wire_cred *gc, __be32 *authp)
+ __be32 *rpcstart, struct rpc_gss_wire_cred *gc)
{
struct gss_ctx *ctx_id = rsci->mechctx;
struct xdr_buf rpchdr;
@@ -696,7 +727,7 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci,
iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart;
xdr_buf_from_iov(&iov, &rpchdr);
- *authp = rpc_autherr_badverf;
+ rqstp->rq_auth_stat = rpc_autherr_badverf;
if (argv->iov_len < 4)
return SVC_DENIED;
flavor = svc_getnl(argv);
@@ -708,21 +739,17 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci,
if (rqstp->rq_deferred) /* skip verification of revisited request */
return SVC_OK;
if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) {
- *authp = rpcsec_gsserr_credproblem;
+ rqstp->rq_auth_stat = rpcsec_gsserr_credproblem;
return SVC_DENIED;
}
if (gc->gc_seq > MAXSEQ) {
- dprintk("RPC: svcauth_gss: discarding request with "
- "large sequence number %d\n", gc->gc_seq);
- *authp = rpcsec_gsserr_ctxproblem;
+ trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq);
+ rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem;
return SVC_DENIED;
}
- if (!gss_check_seq_num(rsci, gc->gc_seq)) {
- dprintk("RPC: svcauth_gss: discarding request with "
- "old sequence number %d\n", gc->gc_seq);
+ if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq))
return SVC_DROP;
- }
return SVC_OK;
}
@@ -754,7 +781,7 @@ gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq)
svc_putnl(rqstp->rq_res.head, RPC_AUTH_GSS);
xdr_seq = kmalloc(4, GFP_KERNEL);
if (!xdr_seq)
- return -1;
+ return -ENOMEM;
*xdr_seq = htonl(seq);
iov.iov_base = xdr_seq;
@@ -803,7 +830,7 @@ u32 svcauth_gss_flavor(struct auth_domain *dom)
EXPORT_SYMBOL_GPL(svcauth_gss_flavor);
-int
+struct auth_domain *
svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name)
{
struct gss_domain *new;
@@ -820,21 +847,23 @@ svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name)
new->h.flavour = &svcauthops_gss;
new->pseudoflavor = pseudoflavor;
- stat = 0;
test = auth_domain_lookup(name, &new->h);
- if (test != &new->h) { /* Duplicate registration */
+ if (test != &new->h) {
+ pr_warn("svc: duplicate registration of gss pseudo flavour %s.\n",
+ name);
+ stat = -EADDRINUSE;
auth_domain_put(test);
- kfree(new->h.name);
- goto out_free_dom;
+ goto out_free_name;
}
- return 0;
+ return test;
+out_free_name:
+ kfree(new->h.name);
out_free_dom:
kfree(new);
out:
- return stat;
+ return ERR_PTR(stat);
}
-
EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor);
static inline int
@@ -858,18 +887,20 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
static int
unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
{
+ u32 integ_len, rseqno, maj_stat;
int stat = -EINVAL;
- u32 integ_len, maj_stat;
struct xdr_netobj mic;
struct xdr_buf integ_buf;
+ mic.data = NULL;
+
/* NFS READ normally uses splice to send data in-place. However
* the data in cache can change after the reply's MIC is computed
* but before the RPC reply is sent. To prevent the client from
* rejecting the server-computed MIC in this somewhat rare case,
* do not use splice with the GSS integrity service.
*/
- clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+ __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* Did we already verify the signature on the original pass through? */
if (rqstp->rq_deferred)
@@ -877,34 +908,44 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g
integ_len = svc_getnl(&buf->head[0]);
if (integ_len & 3)
- return stat;
+ goto unwrap_failed;
if (integ_len > buf->len)
- return stat;
- if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) {
- WARN_ON_ONCE(1);
- return stat;
- }
+ goto unwrap_failed;
+ if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len))
+ goto unwrap_failed;
+
/* copy out mic... */
if (read_u32_from_xdr_buf(buf, integ_len, &mic.len))
- return stat;
+ goto unwrap_failed;
if (mic.len > RPC_MAX_AUTH_SIZE)
- return stat;
+ goto unwrap_failed;
mic.data = kmalloc(mic.len, GFP_KERNEL);
if (!mic.data)
- return stat;
+ goto unwrap_failed;
if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len))
- goto out;
+ goto unwrap_failed;
maj_stat = gss_verify_mic(ctx, &integ_buf, &mic);
if (maj_stat != GSS_S_COMPLETE)
- goto out;
- if (svc_getnl(&buf->head[0]) != seq)
- goto out;
+ goto bad_mic;
+ rseqno = svc_getnl(&buf->head[0]);
+ if (rseqno != seq)
+ goto bad_seqno;
/* trim off the mic and padding at the end before returning */
- buf->len -= 4 + round_up_to_quad(mic.len);
+ xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4);
stat = 0;
out:
kfree(mic.data);
return stat;
+
+unwrap_failed:
+ trace_rpcgss_svc_unwrap_failed(rqstp);
+ goto out;
+bad_seqno:
+ trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno);
+ goto out;
+bad_mic:
+ trace_rpcgss_svc_mic(rqstp, maj_stat);
+ goto out;
}
static inline int
@@ -928,9 +969,10 @@ static int
unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
{
u32 priv_len, maj_stat;
- int pad, saved_len, remaining_len, offset;
+ int pad, remaining_len, offset;
+ u32 rseqno;
- clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+ __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
priv_len = svc_getnl(&buf->head[0]);
if (rqstp->rq_deferred) {
@@ -943,36 +985,42 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs
* not yet read from the head, so these two values are different: */
remaining_len = total_buf_len(buf);
if (priv_len > remaining_len)
- return -EINVAL;
+ goto unwrap_failed;
pad = remaining_len - priv_len;
buf->len -= pad;
fix_priv_head(buf, pad);
- /* Maybe it would be better to give gss_unwrap a length parameter: */
- saved_len = buf->len;
- buf->len = priv_len;
- maj_stat = gss_unwrap(ctx, 0, buf);
+ maj_stat = gss_unwrap(ctx, 0, priv_len, buf);
pad = priv_len - buf->len;
- buf->len = saved_len;
- buf->len -= pad;
/* The upper layers assume the buffer is aligned on 4-byte boundaries.
* In the krb5p case, at least, the data ends up offset, so we need to
* move it around. */
/* XXX: This is very inefficient. It would be better to either do
* this while we encrypt, or maybe in the receive code, if we can peak
* ahead and work out the service and mechanism there. */
- offset = buf->head[0].iov_len % 4;
+ offset = xdr_pad_size(buf->head[0].iov_len);
if (offset) {
buf->buflen = RPCSVC_MAXPAYLOAD;
xdr_shift_buf(buf, offset);
fix_priv_head(buf, pad);
}
if (maj_stat != GSS_S_COMPLETE)
- return -EINVAL;
+ goto bad_unwrap;
out_seq:
- if (svc_getnl(&buf->head[0]) != seq)
- return -EINVAL;
+ rseqno = svc_getnl(&buf->head[0]);
+ if (rseqno != seq)
+ goto bad_seqno;
return 0;
+
+unwrap_failed:
+ trace_rpcgss_svc_unwrap_failed(rqstp);
+ return -EINVAL;
+bad_seqno:
+ trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno);
+ return -EINVAL;
+bad_unwrap:
+ trace_rpcgss_svc_unwrap(rqstp, maj_stat);
+ return -EINVAL;
}
struct gss_svc_data {
@@ -992,6 +1040,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp)
struct rpc_gss_wire_cred *gc = &svcdata->clcred;
int stat;
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
+
/*
* A gss export can be specified either by:
* export *(sec=krb5,rw)
@@ -1007,6 +1057,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp)
stat = svcauth_unix_set_client(rqstp);
if (stat == SVC_DROP || stat == SVC_CLOSE)
return stat;
+
+ rqstp->rq_auth_stat = rpc_auth_ok;
return SVC_OK;
}
@@ -1096,16 +1148,16 @@ static void gss_free_in_token_pages(struct gssp_in_token *in_token)
}
static int gss_read_proxy_verf(struct svc_rqst *rqstp,
- struct rpc_gss_wire_cred *gc, __be32 *authp,
+ struct rpc_gss_wire_cred *gc,
struct xdr_netobj *in_handle,
struct gssp_in_token *in_token)
{
struct kvec *argv = &rqstp->rq_arg.head[0];
- unsigned int page_base, length;
- int pages, i, res;
- size_t inlen;
+ unsigned int length, pgto_offs, pgfrom_offs;
+ int pages, i, res, pgto, pgfrom;
+ size_t inlen, to_offs, from_offs;
- res = gss_read_common_verf(gc, argv, authp, in_handle);
+ res = gss_read_common_verf(gc, argv, &rqstp->rq_auth_stat, in_handle);
if (res)
return res;
@@ -1131,17 +1183,24 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp,
memcpy(page_address(in_token->pages[0]), argv->iov_base, length);
inlen -= length;
- i = 1;
- page_base = rqstp->rq_arg.page_base;
+ to_offs = length;
+ from_offs = rqstp->rq_arg.page_base;
while (inlen) {
- length = min_t(unsigned int, inlen, PAGE_SIZE);
- memcpy(page_address(in_token->pages[i]),
- page_address(rqstp->rq_arg.pages[i]) + page_base,
+ pgto = to_offs >> PAGE_SHIFT;
+ pgfrom = from_offs >> PAGE_SHIFT;
+ pgto_offs = to_offs & ~PAGE_MASK;
+ pgfrom_offs = from_offs & ~PAGE_MASK;
+
+ length = min_t(unsigned int, inlen,
+ min_t(unsigned int, PAGE_SIZE - pgto_offs,
+ PAGE_SIZE - pgfrom_offs));
+ memcpy(page_address(in_token->pages[pgto]) + pgto_offs,
+ page_address(rqstp->rq_arg.pages[pgfrom]) + pgfrom_offs,
length);
+ to_offs += length;
+ from_offs += length;
inlen -= length;
- page_base = 0;
- i++;
}
return 0;
}
@@ -1174,7 +1233,7 @@ gss_write_resv(struct kvec *resv, size_t size_limit,
* Otherwise, drop the request pending an answer to the upcall.
*/
static int svcauth_gss_legacy_init(struct svc_rqst *rqstp,
- struct rpc_gss_wire_cred *gc, __be32 *authp)
+ struct rpc_gss_wire_cred *gc)
{
struct kvec *argv = &rqstp->rq_arg.head[0];
struct kvec *resv = &rqstp->rq_res.head[0];
@@ -1183,7 +1242,7 @@ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp,
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
memset(&rsikey, 0, sizeof(rsikey));
- ret = gss_read_verf(gc, argv, authp,
+ ret = gss_read_verf(gc, argv, &rqstp->rq_auth_stat,
&rsikey.in_handle, &rsikey.in_token);
if (ret)
return ret;
@@ -1222,7 +1281,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd,
long long ctxh;
struct gss_api_mech *gm = NULL;
time64_t expiry;
- int status = -EINVAL;
+ int status;
memset(&rsci, 0, sizeof(rsci));
/* context handle */
@@ -1245,7 +1304,6 @@ static int gss_proxy_save_rsc(struct cache_detail *cd,
if (!ud->found_creds) {
/* userspace seem buggy, we should always get at least a
* mapping to nobody */
- dprintk("RPC: No creds found!\n");
goto out;
} else {
struct timespec64 boot;
@@ -1287,7 +1345,7 @@ out:
}
static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
- struct rpc_gss_wire_cred *gc, __be32 *authp)
+ struct rpc_gss_wire_cred *gc)
{
struct kvec *resv = &rqstp->rq_res.head[0];
struct xdr_netobj cli_handle;
@@ -1299,8 +1357,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
memset(&ud, 0, sizeof(ud));
- ret = gss_read_proxy_verf(rqstp, gc, authp,
- &ud.in_handle, &ud.in_token);
+ ret = gss_read_proxy_verf(rqstp, gc, &ud.in_handle, &ud.in_token);
if (ret)
return ret;
@@ -1311,8 +1368,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
if (status)
goto out;
- trace_rpcgss_accept_upcall(rqstp->rq_xid, ud.major_status,
- ud.minor_status);
+ trace_rpcgss_svc_accept_upcall(rqstp, ud.major_status, ud.minor_status);
switch (ud.major_status) {
case GSS_S_CONTINUE_NEEDED:
@@ -1320,31 +1376,23 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
break;
case GSS_S_COMPLETE:
status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle);
- if (status) {
- pr_info("%s: gss_proxy_save_rsc failed (%d)\n",
- __func__, status);
+ if (status)
goto out;
- }
cli_handle.data = (u8 *)&handle;
cli_handle.len = sizeof(handle);
break;
default:
- ret = SVC_CLOSE;
goto out;
}
/* Got an answer to the upcall; use it: */
if (gss_write_init_verf(sn->rsc_cache, rqstp,
- &cli_handle, &ud.major_status)) {
- pr_info("%s: gss_write_init_verf failed\n", __func__);
+ &cli_handle, &ud.major_status))
goto out;
- }
if (gss_write_resv(resv, PAGE_SIZE,
&cli_handle, &ud.out_token,
- ud.major_status, ud.minor_status)) {
- pr_info("%s: gss_write_resv failed\n", __func__);
+ ud.major_status, ud.minor_status))
goto out;
- }
ret = SVC_COMPLETE;
out:
@@ -1385,7 +1433,7 @@ static bool use_gss_proxy(struct net *net)
static ssize_t write_gssp(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct net *net = PDE_DATA(file_inode(file));
+ struct net *net = pde_data(file_inode(file));
char tbuf[20];
unsigned long i;
int res;
@@ -1413,7 +1461,7 @@ static ssize_t write_gssp(struct file *file, const char __user *buf,
static ssize_t read_gssp(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct net *net = PDE_DATA(file_inode(file));
+ struct net *net = pde_data(file_inode(file));
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
unsigned long p = *ppos;
char tbuf[10];
@@ -1482,7 +1530,7 @@ static void destroy_use_gss_proxy_proc_entry(struct net *net) {}
* response here and return SVC_COMPLETE.
*/
static int
-svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
+svcauth_gss_accept(struct svc_rqst *rqstp)
{
struct kvec *argv = &rqstp->rq_arg.head[0];
struct kvec *resv = &rqstp->rq_res.head[0];
@@ -1495,10 +1543,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
int ret;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
- dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n",
- argv->iov_len);
-
- *authp = rpc_autherr_badcred;
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
if (!svcdata)
svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL);
if (!svcdata)
@@ -1535,22 +1580,22 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0))
goto auth_err;
- *authp = rpc_autherr_badverf;
+ rqstp->rq_auth_stat = rpc_autherr_badverf;
switch (gc->gc_proc) {
case RPC_GSS_PROC_INIT:
case RPC_GSS_PROC_CONTINUE_INIT:
if (use_gss_proxy(SVC_NET(rqstp)))
- return svcauth_gss_proxy_init(rqstp, gc, authp);
+ return svcauth_gss_proxy_init(rqstp, gc);
else
- return svcauth_gss_legacy_init(rqstp, gc, authp);
+ return svcauth_gss_legacy_init(rqstp, gc);
case RPC_GSS_PROC_DATA:
case RPC_GSS_PROC_DESTROY:
/* Look up the context, and check the verifier: */
- *authp = rpcsec_gsserr_credproblem;
+ rqstp->rq_auth_stat = rpcsec_gsserr_credproblem;
rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx);
if (!rsci)
goto auth_err;
- switch (gss_verify_header(rqstp, rsci, rpcstart, gc, authp)) {
+ switch (gss_verify_header(rqstp, rsci, rpcstart, gc)) {
case SVC_OK:
break;
case SVC_DENIED:
@@ -1560,7 +1605,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
}
break;
default:
- *authp = rpc_autherr_rejectedcred;
+ rqstp->rq_auth_stat = rpc_autherr_rejectedcred;
goto auth_err;
}
@@ -1576,13 +1621,13 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
svc_putnl(resv, RPC_SUCCESS);
goto complete;
case RPC_GSS_PROC_DATA:
- *authp = rpcsec_gsserr_ctxproblem;
+ rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem;
svcdata->verf_start = resv->iov_base + resv->iov_len;
if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
goto auth_err;
rqstp->rq_cred = rsci->cred;
get_group_info(rsci->cred.cr_group_info);
- *authp = rpc_autherr_badcred;
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
switch (gc->gc_svc) {
case RPC_GSS_SVC_NONE:
break;
@@ -1614,6 +1659,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
GSS_C_QOP_DEFAULT,
gc->gc_svc);
ret = SVC_OK;
+ trace_rpcgss_svc_authenticate(rqstp, gc);
goto out;
}
garbage_args:
@@ -1680,7 +1726,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
goto out;
integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
integ_len = resbuf->len - integ_offset;
- BUG_ON(integ_len % 4);
+ if (integ_len & 3)
+ goto out;
*p++ = htonl(integ_len);
*p++ = htonl(gc->gc_seq);
if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) {
@@ -1704,7 +1751,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
resv->iov_len += XDR_QUADLEN(mic.len) << 2;
/* not strictly required: */
resbuf->len += XDR_QUADLEN(mic.len) << 2;
- BUG_ON(resv->iov_len > PAGE_SIZE);
+ if (resv->iov_len > PAGE_SIZE)
+ goto out_err;
out:
stat = 0;
out_err:
@@ -1740,9 +1788,11 @@ svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp)
* both the head and tail.
*/
if (resbuf->tail[0].iov_base) {
- BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base
- + PAGE_SIZE);
- BUG_ON(resbuf->tail[0].iov_base < resbuf->head[0].iov_base);
+ if (resbuf->tail[0].iov_base >=
+ resbuf->head[0].iov_base + PAGE_SIZE)
+ return -EINVAL;
+ if (resbuf->tail[0].iov_base < resbuf->head[0].iov_base)
+ return -EINVAL;
if (resbuf->tail[0].iov_len + resbuf->head[0].iov_len
+ 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE)
return -ENOMEM;
@@ -1780,11 +1830,14 @@ static int
svcauth_gss_release(struct svc_rqst *rqstp)
{
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
- struct rpc_gss_wire_cred *gc = &gsd->clcred;
+ struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
+ if (!gsd)
+ goto out;
+ gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1825,10 +1878,10 @@ out_err:
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
- if (gsd->rsci)
+ if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
- gsd->rsci = NULL;
-
+ gsd->rsci = NULL;
+ }
return stat;
}
@@ -1932,7 +1985,7 @@ gss_svc_init_net(struct net *net)
goto out2;
return 0;
out2:
- destroy_use_gss_proxy_proc_entry(net);
+ rsi_cache_destroy_net(net);
out1:
rsc_cache_destroy_net(net);
return rv;
diff --git a/net/sunrpc/auth_gss/trace.c b/net/sunrpc/auth_gss/trace.c
index 5576f1e66de9..76685abba60f 100644
--- a/net/sunrpc/auth_gss/trace.c
+++ b/net/sunrpc/auth_gss/trace.c
@@ -5,6 +5,9 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/auth_gss.h>
#include <linux/sunrpc/gss_err.h>
#define CREATE_TRACE_POINTS
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index e7df1f782b2e..1e091d3fa607 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -40,11 +40,19 @@ unx_destroy(struct rpc_auth *auth)
/*
* Lookup AUTH_UNIX creds for current process
*/
-static struct rpc_cred *
-unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+static struct rpc_cred *unx_lookup_cred(struct rpc_auth *auth,
+ struct auth_cred *acred, int flags)
{
- struct rpc_cred *ret = mempool_alloc(unix_pool, GFP_NOFS);
-
+ struct rpc_cred *ret;
+
+ ret = kmalloc(sizeof(*ret), rpc_task_gfp_mask());
+ if (!ret) {
+ if (!(flags & RPCAUTH_LOOKUP_ASYNC))
+ return ERR_PTR(-ENOMEM);
+ ret = mempool_alloc(unix_pool, GFP_NOWAIT);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+ }
rpcauth_init_cred(ret, acred, auth, &unix_credops);
ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
return ret;
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 195b40c5dae4..65a6c6429a53 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
(c) 2007 Network Appliance, Inc. All Rights Reserved.
(c) 2009 NetApp. All Rights Reserved.
-NetApp provides this source code under the GPL v2 License.
-The GPL v2 license is available at
-http://opensource.org/licenses/gpl-license.php.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
@@ -64,6 +50,17 @@ static void xprt_free_allocation(struct rpc_rqst *req)
kfree(req);
}
+static void xprt_bc_reinit_xdr_buf(struct xdr_buf *buf)
+{
+ buf->head[0].iov_len = PAGE_SIZE;
+ buf->tail[0].iov_len = 0;
+ buf->pages = NULL;
+ buf->page_len = 0;
+ buf->flags = 0;
+ buf->len = 0;
+ buf->buflen = PAGE_SIZE;
+}
+
static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
{
struct page *page;
@@ -75,9 +72,9 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
return 0;
}
-static
-struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags)
+static struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt)
{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
struct rpc_rqst *req;
/* Pre-allocate one backchannel rpc_rqst */
@@ -111,7 +108,7 @@ out_free:
* by the backchannel. This function can be called multiple times
* when creating new sessions that use the same rpc_xprt. The
* preallocated buffers are added to the pool of resources used by
- * the rpc_xprt. Anyone of these resources may be used used by an
+ * the rpc_xprt. Any one of these resources may be used by an
* incoming callback request. It's up to the higher levels in the
* stack to enforce that the maximum number of session slots is not
* being exceeded.
@@ -154,7 +151,7 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
INIT_LIST_HEAD(&tmp_list);
for (i = 0; i < min_reqs; i++) {
/* Pre-allocate one backchannel rpc_rqst */
- req = xprt_alloc_bc_req(xprt, GFP_KERNEL);
+ req = xprt_alloc_bc_req(xprt);
if (req == NULL) {
printk(KERN_ERR "Failed to create bc rpc_rqst\n");
goto out_free;
@@ -292,6 +289,9 @@ void xprt_free_bc_rqst(struct rpc_rqst *req)
*/
spin_lock_bh(&xprt->bc_pa_lock);
if (xprt_need_to_requeue(xprt)) {
+ xprt_bc_reinit_xdr_buf(&req->rq_snd_buf);
+ xprt_bc_reinit_xdr_buf(&req->rq_rcv_buf);
+ req->rq_rcv_buf.len = PAGE_SIZE;
list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
xprt->bc_alloc_count++;
atomic_inc(&xprt->bc_slot_count);
@@ -343,7 +343,7 @@ found:
break;
} else if (req)
break;
- new = xprt_alloc_bc_req(xprt, GFP_KERNEL);
+ new = xprt_alloc_bc_req(xprt);
} while (new);
return req;
}
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index bd843a81afa0..f075a9fb5ccc 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -32,13 +32,15 @@
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <trace/events/sunrpc.h>
+
#include "netns.h"
+#include "fail.h"
#define RPCDBG_FACILITY RPCDBG_CACHE
static bool cache_defer_req(struct cache_req *req, struct cache_head *item);
static void cache_revisit_request(struct cache_head *item);
-static bool cache_listeners_exist(struct cache_detail *detail);
static void cache_init(struct cache_head *h, struct cache_detail *detail)
{
@@ -65,13 +67,14 @@ static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail,
rcu_read_lock();
hlist_for_each_entry_rcu(tmp, head, cache_list) {
- if (detail->match(tmp, key)) {
- if (cache_is_expired(detail, tmp))
- continue;
- tmp = cache_get_rcu(tmp);
- rcu_read_unlock();
- return tmp;
- }
+ if (!detail->match(tmp, key))
+ continue;
+ if (test_bit(CACHE_VALID, &tmp->flags) &&
+ cache_is_expired(detail, tmp))
+ continue;
+ tmp = cache_get_rcu(tmp);
+ rcu_read_unlock();
+ return tmp;
}
rcu_read_unlock();
return NULL;
@@ -113,18 +116,21 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
spin_lock(&detail->hash_lock);
/* check if entry appeared while we slept */
- hlist_for_each_entry_rcu(tmp, head, cache_list) {
- if (detail->match(tmp, key)) {
- if (cache_is_expired(detail, tmp)) {
- sunrpc_begin_cache_remove_entry(tmp, detail);
- freeme = tmp;
- break;
- }
- cache_get(tmp);
- spin_unlock(&detail->hash_lock);
- cache_put(new, detail);
- return tmp;
+ hlist_for_each_entry_rcu(tmp, head, cache_list,
+ lockdep_is_held(&detail->hash_lock)) {
+ if (!detail->match(tmp, key))
+ continue;
+ if (test_bit(CACHE_VALID, &tmp->flags) &&
+ cache_is_expired(detail, tmp)) {
+ sunrpc_begin_cache_remove_entry(tmp, detail);
+ trace_cache_entry_expired(detail, tmp);
+ freeme = tmp;
+ break;
}
+ cache_get(tmp);
+ spin_unlock(&detail->hash_lock);
+ cache_put(new, detail);
+ return tmp;
}
hlist_add_head_rcu(&new->cache_list, head);
@@ -174,6 +180,25 @@ static void cache_fresh_unlocked(struct cache_head *head,
}
}
+static void cache_make_negative(struct cache_detail *detail,
+ struct cache_head *h)
+{
+ set_bit(CACHE_NEGATIVE, &h->flags);
+ trace_cache_entry_make_negative(detail, h);
+}
+
+static void cache_entry_update(struct cache_detail *detail,
+ struct cache_head *h,
+ struct cache_head *new)
+{
+ if (!test_bit(CACHE_NEGATIVE, &new->flags)) {
+ detail->update(h, new);
+ trace_cache_entry_update(detail, h);
+ } else {
+ cache_make_negative(detail, h);
+ }
+}
+
struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
struct cache_head *new, struct cache_head *old, int hash)
{
@@ -186,10 +211,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
if (!test_bit(CACHE_VALID, &old->flags)) {
spin_lock(&detail->hash_lock);
if (!test_bit(CACHE_VALID, &old->flags)) {
- if (test_bit(CACHE_NEGATIVE, &new->flags))
- set_bit(CACHE_NEGATIVE, &old->flags);
- else
- detail->update(old, new);
+ cache_entry_update(detail, old, new);
cache_fresh_locked(old, new->expiry_time, detail);
spin_unlock(&detail->hash_lock);
cache_fresh_unlocked(old, detail);
@@ -207,10 +229,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
detail->init(tmp, old);
spin_lock(&detail->hash_lock);
- if (test_bit(CACHE_NEGATIVE, &new->flags))
- set_bit(CACHE_NEGATIVE, &tmp->flags);
- else
- detail->update(tmp, new);
+ cache_entry_update(detail, tmp, new);
hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]);
detail->entries++;
cache_get(tmp);
@@ -224,13 +243,6 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
}
EXPORT_SYMBOL_GPL(sunrpc_cache_update);
-static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h)
-{
- if (cd->cache_upcall)
- return cd->cache_upcall(cd, h);
- return sunrpc_cache_pipe_upcall(cd, h);
-}
-
static inline int cache_is_valid(struct cache_head *h)
{
if (!test_bit(CACHE_VALID, &h->flags))
@@ -259,7 +271,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h
spin_lock(&detail->hash_lock);
rv = cache_is_valid(h);
if (rv == -EAGAIN) {
- set_bit(CACHE_NEGATIVE, &h->flags);
+ cache_make_negative(detail, h);
cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY,
detail);
rv = -ENOENT;
@@ -303,17 +315,14 @@ int cache_check(struct cache_detail *detail,
(h->expiry_time != 0 && age > refresh_age/2)) {
dprintk("RPC: Want update, refage=%lld, age=%lld\n",
refresh_age, age);
- if (!test_and_set_bit(CACHE_PENDING, &h->flags)) {
- switch (cache_make_upcall(detail, h)) {
- case -EINVAL:
- rv = try_to_negate_entry(detail, h);
- break;
- case -EAGAIN:
- cache_fresh_unlocked(h, detail);
- break;
- }
- } else if (!cache_listeners_exist(detail))
+ switch (detail->cache_upcall(detail, h)) {
+ case -EINVAL:
rv = try_to_negate_entry(detail, h);
+ break;
+ case -EAGAIN:
+ cache_fresh_unlocked(h, detail);
+ break;
+ }
}
if (rv == -EAGAIN) {
@@ -468,6 +477,7 @@ static int cache_clean(void)
continue;
sunrpc_begin_cache_remove_entry(ch, current_detail);
+ trace_cache_entry_expired(current_detail, ch);
rv = 1;
break;
}
@@ -490,16 +500,17 @@ static int cache_clean(void)
*/
static void do_cache_clean(struct work_struct *work)
{
- int delay = 5;
- if (cache_clean() == -1)
- delay = round_jiffies_relative(30*HZ);
+ int delay;
if (list_empty(&cache_list))
- delay = 0;
+ return;
- if (delay)
- queue_delayed_work(system_power_efficient_wq,
- &cache_cleaner, delay);
+ if (cache_clean() == -1)
+ delay = round_jiffies_relative(30*HZ);
+ else
+ delay = 5;
+
+ queue_delayed_work(system_power_efficient_wq, &cache_cleaner, delay);
}
@@ -521,7 +532,6 @@ void cache_purge(struct cache_detail *detail)
{
struct cache_head *ch = NULL;
struct hlist_head *head = NULL;
- struct hlist_node *tmp = NULL;
int i = 0;
spin_lock(&detail->hash_lock);
@@ -533,7 +543,9 @@ void cache_purge(struct cache_detail *detail)
dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name);
for (i = 0; i < detail->hash_size; i++) {
head = &detail->hash_table[i];
- hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
+ while (!hlist_empty(head)) {
+ ch = hlist_entry(head->first, struct cache_head,
+ cache_list);
sunrpc_begin_cache_remove_entry(ch, detail);
spin_unlock(&detail->hash_lock);
sunrpc_end_cache_remove_entry(ch, detail);
@@ -665,7 +677,7 @@ static void cache_limit_defers(void)
/* Consider removing either the first or the last */
if (cache_defer_cnt > DFR_MAX) {
- if (prandom_u32() & 1)
+ if (prandom_u32_max(2))
discard = list_entry(cache_defer_list.next,
struct cache_deferred_req, recent);
else
@@ -678,16 +690,30 @@ static void cache_limit_defers(void)
discard->revisit(discard, 1);
}
+#if IS_ENABLED(CONFIG_FAIL_SUNRPC)
+static inline bool cache_defer_immediately(void)
+{
+ return !fail_sunrpc.ignore_cache_wait &&
+ should_fail(&fail_sunrpc.attr, 1);
+}
+#else
+static inline bool cache_defer_immediately(void)
+{
+ return false;
+}
+#endif
+
/* Return true if and only if a deferred request is queued. */
static bool cache_defer_req(struct cache_req *req, struct cache_head *item)
{
struct cache_deferred_req *dreq;
- if (req->thread_wait) {
+ if (!cache_defer_immediately()) {
cache_wait_req(req, item);
if (!test_bit(CACHE_PENDING, &item->flags))
return false;
}
+
dreq = req->defer(req);
if (dreq == NULL)
return false;
@@ -768,7 +794,6 @@ void cache_clean_deferred(void *owner)
*/
static DEFINE_SPINLOCK(queue_lock);
-static DEFINE_MUTEX(queue_io_mutex);
struct cache_queue {
struct list_head list;
@@ -794,7 +819,7 @@ static int cache_request(struct cache_detail *detail,
detail->cache_request(detail, crq->item, &bp, &len);
if (len < 0)
- return -EAGAIN;
+ return -E2BIG;
return PAGE_SIZE - len;
}
@@ -896,44 +921,26 @@ static ssize_t cache_do_downcall(char *kaddr, const char __user *buf,
return ret;
}
-static ssize_t cache_slow_downcall(const char __user *buf,
- size_t count, struct cache_detail *cd)
-{
- static char write_buf[8192]; /* protected by queue_io_mutex */
- ssize_t ret = -EINVAL;
-
- if (count >= sizeof(write_buf))
- goto out;
- mutex_lock(&queue_io_mutex);
- ret = cache_do_downcall(write_buf, buf, count, cd);
- mutex_unlock(&queue_io_mutex);
-out:
- return ret;
-}
-
static ssize_t cache_downcall(struct address_space *mapping,
const char __user *buf,
size_t count, struct cache_detail *cd)
{
- struct page *page;
- char *kaddr;
+ char *write_buf;
ssize_t ret = -ENOMEM;
- if (count >= PAGE_SIZE)
- goto out_slow;
+ if (count >= 32768) { /* 32k is max userland buffer, lets check anyway */
+ ret = -EINVAL;
+ goto out;
+ }
- page = find_or_create_page(mapping, 0, GFP_KERNEL);
- if (!page)
- goto out_slow;
+ write_buf = kvmalloc(count + 1, GFP_KERNEL);
+ if (!write_buf)
+ goto out;
- kaddr = kmap(page);
- ret = cache_do_downcall(kaddr, buf, count, cd);
- kunmap(page);
- unlock_page(page);
- put_page(page);
+ ret = cache_do_downcall(write_buf, buf, count, cd);
+ kvfree(write_buf);
+out:
return ret;
-out_slow:
- return cache_slow_downcall(buf, count, cd);
}
static ssize_t cache_write(struct file *filp, const char __user *buf,
@@ -1195,20 +1202,12 @@ static bool cache_listeners_exist(struct cache_detail *detail)
*
* Each request is at most one page long.
*/
-int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
+static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
{
-
char *buf;
struct cache_request *crq;
int ret = 0;
- if (!detail->cache_request)
- return -EINVAL;
-
- if (!cache_listeners_exist(detail)) {
- warn_no_listener(detail);
- return -EINVAL;
- }
if (test_bit(CACHE_CLEANED, &h->flags))
/* Too late to make an upcall */
return -EAGAIN;
@@ -1231,6 +1230,7 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
if (test_bit(CACHE_PENDING, &h->flags)) {
crq->item = cache_get(h);
list_add_tail(&crq->q.list, &detail->queue);
+ trace_cache_entry_upcall(detail, h);
} else
/* Lost a race, no longer PENDING, so don't enqueue */
ret = -EAGAIN;
@@ -1242,8 +1242,27 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
}
return ret;
}
+
+int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
+{
+ if (test_and_set_bit(CACHE_PENDING, &h->flags))
+ return 0;
+ return cache_pipe_upcall(detail, h);
+}
EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall);
+int sunrpc_cache_pipe_upcall_timeout(struct cache_detail *detail,
+ struct cache_head *h)
+{
+ if (!cache_listeners_exist(detail)) {
+ warn_no_listener(detail);
+ trace_cache_entry_no_listener(detail, h);
+ return -EINVAL;
+ }
+ return sunrpc_cache_pipe_upcall(detail, h);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall_timeout);
+
/*
* parse a message from user-space and pass it
* to an appropriate cache
@@ -1415,10 +1434,10 @@ static int c_show(struct seq_file *m, void *p)
cache_get(cp);
if (cache_check(cd, cp, NULL))
/* cache_check does a cache_put on failure */
- seq_printf(m, "# ");
+ seq_puts(m, "# ");
else {
if (cache_is_expired(cd, cp))
- seq_printf(m, "# ");
+ seq_puts(m, "# ");
cache_put(cp, cd);
}
@@ -1533,7 +1552,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
static ssize_t cache_read_procfs(struct file *filp, char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = PDE_DATA(file_inode(filp));
+ struct cache_detail *cd = pde_data(file_inode(filp));
return cache_read(filp, buf, count, ppos, cd);
}
@@ -1541,14 +1560,14 @@ static ssize_t cache_read_procfs(struct file *filp, char __user *buf,
static ssize_t cache_write_procfs(struct file *filp, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = PDE_DATA(file_inode(filp));
+ struct cache_detail *cd = pde_data(file_inode(filp));
return cache_write(filp, buf, count, ppos, cd);
}
static __poll_t cache_poll_procfs(struct file *filp, poll_table *wait)
{
- struct cache_detail *cd = PDE_DATA(file_inode(filp));
+ struct cache_detail *cd = pde_data(file_inode(filp));
return cache_poll(filp, wait, cd);
}
@@ -1557,21 +1576,21 @@ static long cache_ioctl_procfs(struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
- struct cache_detail *cd = PDE_DATA(inode);
+ struct cache_detail *cd = pde_data(inode);
return cache_ioctl(inode, filp, cmd, arg, cd);
}
static int cache_open_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE_DATA(inode);
+ struct cache_detail *cd = pde_data(inode);
return cache_open(inode, filp, cd);
}
static int cache_release_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE_DATA(inode);
+ struct cache_detail *cd = pde_data(inode);
return cache_release(inode, filp, cd);
}
@@ -1588,14 +1607,14 @@ static const struct proc_ops cache_channel_proc_ops = {
static int content_open_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE_DATA(inode);
+ struct cache_detail *cd = pde_data(inode);
return content_open(inode, filp, cd);
}
static int content_release_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE_DATA(inode);
+ struct cache_detail *cd = pde_data(inode);
return content_release(inode, filp, cd);
}
@@ -1609,14 +1628,14 @@ static const struct proc_ops content_proc_ops = {
static int open_flush_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE_DATA(inode);
+ struct cache_detail *cd = pde_data(inode);
return open_flush(inode, filp, cd);
}
static int release_flush_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE_DATA(inode);
+ struct cache_detail *cd = pde_data(inode);
return release_flush(inode, filp, cd);
}
@@ -1624,7 +1643,7 @@ static int release_flush_procfs(struct inode *inode, struct file *filp)
static ssize_t read_flush_procfs(struct file *filp, char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = PDE_DATA(file_inode(filp));
+ struct cache_detail *cd = pde_data(file_inode(filp));
return read_flush(filp, buf, count, ppos, cd);
}
@@ -1633,7 +1652,7 @@ static ssize_t write_flush_procfs(struct file *filp,
const char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = PDE_DATA(file_inode(filp));
+ struct cache_detail *cd = pde_data(file_inode(filp));
return write_flush(filp, buf, count, ppos, cd);
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7324b21f923e..993acf38af87 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -41,16 +41,13 @@
#include <trace/events/sunrpc.h>
#include "sunrpc.h"
+#include "sysfs.h"
#include "netns.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_CALL
#endif
-#define dprint_status(t) \
- dprintk("RPC: %5u %s (status %d)\n", t->tk_pid, \
- __func__, t->tk_status)
-
/*
* All RPC clients are linked into this list
*/
@@ -79,6 +76,7 @@ static int rpc_encode_header(struct rpc_task *task,
static int rpc_decode_header(struct rpc_task *task,
struct xdr_stream *xdr);
static int rpc_ping(struct rpc_clnt *clnt);
+static int rpc_ping_noreply(struct rpc_clnt *clnt);
static void rpc_check_timeout(struct rpc_task *task);
static void rpc_register_client(struct rpc_clnt *clnt)
@@ -170,7 +168,7 @@ static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event)
case RPC_PIPEFS_MOUNT:
if (clnt->cl_pipedir_objects.pdh_dentry != NULL)
return 1;
- if (atomic_read(&clnt->cl_count) == 0)
+ if (refcount_read(&clnt->cl_count) == 0)
return 1;
break;
case RPC_PIPEFS_UMOUNT:
@@ -331,6 +329,7 @@ err_auth:
out:
if (pipefs_sb)
rpc_put_sb_net(net);
+ rpc_sysfs_client_destroy(clnt);
rpc_clnt_debugfs_unregister(clnt);
return err;
}
@@ -346,7 +345,7 @@ static int rpc_alloc_clid(struct rpc_clnt *clnt)
{
int clid;
- clid = ida_simple_get(&rpc_clids, 0, 0, GFP_KERNEL);
+ clid = ida_alloc(&rpc_clids, GFP_KERNEL);
if (clid < 0)
return clid;
clnt->cl_clid = clid;
@@ -355,7 +354,7 @@ static int rpc_alloc_clid(struct rpc_clnt *clnt)
static void rpc_free_clid(struct rpc_clnt *clnt)
{
- ida_simple_remove(&rpc_clids, clnt->cl_clid);
+ ida_free(&rpc_clids, clnt->cl_clid);
}
static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
@@ -370,10 +369,6 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
const char *nodename = args->nodename;
int err;
- /* sanity check the name before trying to print it */
- dprintk("RPC: creating %s client for %s (xprt %p)\n",
- program->name, args->servername, xprt);
-
err = rpciod_up();
if (err)
goto out_no_rpciod;
@@ -418,24 +413,28 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
}
rpc_clnt_set_transport(clnt, xprt, timeout);
+ xprt->main = true;
xprt_iter_init(&clnt->cl_xpi, xps);
xprt_switch_put(xps);
clnt->cl_rtt = &clnt->cl_rtt_default;
rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval);
- atomic_set(&clnt->cl_count, 1);
+ refcount_set(&clnt->cl_count, 1);
if (nodename == NULL)
nodename = utsname()->nodename;
/* save the nodename */
rpc_clnt_set_nodename(clnt, nodename);
+ rpc_sysfs_client_setup(clnt, xps, rpc_net_ns(clnt));
err = rpc_client_register(clnt, args->authflavor, args->client_name);
if (err)
goto out_no_path;
if (parent)
- atomic_inc(&parent->cl_count);
+ refcount_inc(&parent->cl_count);
+
+ trace_rpc_clnt_new(clnt, xprt, program->name, args->servername);
return clnt;
out_no_path:
@@ -450,6 +449,7 @@ out_err:
out_no_rpciod:
xprt_switch_put(xps);
xprt_put(xprt);
+ trace_rpc_clnt_new_err(program->name, args->servername, err);
return ERR_PTR(err);
}
@@ -484,6 +484,12 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
rpc_shutdown_client(clnt);
return ERR_PTR(err);
}
+ } else if (args->flags & RPC_CLNT_CREATE_CONNECTED) {
+ int err = rpc_ping_noreply(clnt);
+ if (err != 0) {
+ rpc_shutdown_client(clnt);
+ return ERR_PTR(err);
+ }
}
clnt->cl_softrtry = 1;
@@ -634,10 +640,8 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
args->nodename = clnt->cl_nodename;
new = rpc_new_client(args, xps, xprt, clnt);
- if (IS_ERR(new)) {
- err = PTR_ERR(new);
- goto out_err;
- }
+ if (IS_ERR(new))
+ return new;
/* Turn off autobind on clones */
new->cl_autobind = 0;
@@ -647,10 +651,11 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
new->cl_discrtry = clnt->cl_discrtry;
new->cl_chatty = clnt->cl_chatty;
new->cl_principal = clnt->cl_principal;
+ new->cl_max_connect = clnt->cl_max_connect;
return new;
out_err:
- dprintk("RPC: %s: returned error %d\n", __func__, err);
+ trace_rpc_clnt_clone_err(clnt, err);
return ERR_PTR(err);
}
@@ -723,11 +728,8 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt,
int err;
xprt = xprt_create_transport(args);
- if (IS_ERR(xprt)) {
- dprintk("RPC: failed to create new xprt for clnt %p\n",
- clnt);
+ if (IS_ERR(xprt))
return PTR_ERR(xprt);
- }
xps = xprt_switch_alloc(xprt, GFP_KERNEL);
if (xps == NULL) {
@@ -743,6 +745,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt,
rpc_unregister_client(clnt);
__rpc_clnt_remove_pipedir(clnt);
+ rpc_sysfs_client_destroy(clnt);
rpc_clnt_debugfs_unregister(clnt);
/*
@@ -767,7 +770,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt,
rpc_release_client(parent);
xprt_switch_put(oldxps);
xprt_put(old);
- dprintk("RPC: replaced xprt for clnt %p\n", clnt);
+ trace_rpc_clnt_replace_xprt(clnt);
return 0;
out_revert:
@@ -777,13 +780,14 @@ out_revert:
rpc_client_register(clnt, pseudoflavor, NULL);
xprt_switch_put(xps);
xprt_put(xprt);
- dprintk("RPC: failed to switch xprt for clnt %p\n", clnt);
+ trace_rpc_clnt_replace_xprt_err(clnt);
return err;
}
EXPORT_SYMBOL_GPL(rpc_switch_client_transport);
static
-int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
+int _rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi,
+ void func(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps))
{
struct rpc_xprt_switch *xps;
@@ -792,11 +796,24 @@ int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
rcu_read_unlock();
if (xps == NULL)
return -EAGAIN;
- xprt_iter_init_listall(xpi, xps);
+ func(xpi, xps);
xprt_switch_put(xps);
return 0;
}
+static
+int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
+{
+ return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listall);
+}
+
+static
+int rpc_clnt_xprt_iter_offline_init(struct rpc_clnt *clnt,
+ struct rpc_xprt_iter *xpi)
+{
+ return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listoffline);
+}
+
/**
* rpc_clnt_iterate_for_each_xprt - Apply a function to all transports
* @clnt: pointer to client
@@ -844,10 +861,11 @@ void rpc_killall_tasks(struct rpc_clnt *clnt)
if (list_empty(&clnt->cl_tasks))
return;
- dprintk("RPC: killing all tasks for client %p\n", clnt);
+
/*
* Spin lock all_tasks to prevent changes...
*/
+ trace_rpc_clnt_killall(clnt);
spin_lock(&clnt->cl_lock);
list_for_each_entry(rovr, &clnt->cl_tasks, tk_task)
rpc_signal_task(rovr);
@@ -855,6 +873,57 @@ void rpc_killall_tasks(struct rpc_clnt *clnt)
}
EXPORT_SYMBOL_GPL(rpc_killall_tasks);
+/**
+ * rpc_cancel_tasks - try to cancel a set of RPC tasks
+ * @clnt: Pointer to RPC client
+ * @error: RPC task error value to set
+ * @fnmatch: Pointer to selector function
+ * @data: User data
+ *
+ * Uses @fnmatch to define a set of RPC tasks that are to be cancelled.
+ * The argument @error must be a negative error value.
+ */
+unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error,
+ bool (*fnmatch)(const struct rpc_task *,
+ const void *),
+ const void *data)
+{
+ struct rpc_task *task;
+ unsigned long count = 0;
+
+ if (list_empty(&clnt->cl_tasks))
+ return 0;
+ /*
+ * Spin lock all_tasks to prevent changes...
+ */
+ spin_lock(&clnt->cl_lock);
+ list_for_each_entry(task, &clnt->cl_tasks, tk_task) {
+ if (!RPC_IS_ACTIVATED(task))
+ continue;
+ if (!fnmatch(task, data))
+ continue;
+ rpc_task_try_cancel(task, error);
+ count++;
+ }
+ spin_unlock(&clnt->cl_lock);
+ return count;
+}
+EXPORT_SYMBOL_GPL(rpc_cancel_tasks);
+
+static int rpc_clnt_disconnect_xprt(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt, void *dummy)
+{
+ if (xprt_connected(xprt))
+ xprt_force_disconnect(xprt);
+ return 0;
+}
+
+void rpc_clnt_disconnect(struct rpc_clnt *clnt)
+{
+ rpc_clnt_iterate_for_each_xprt(clnt, rpc_clnt_disconnect_xprt, NULL);
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_disconnect);
+
/*
* Properly shut down an RPC client, terminating all outstanding
* requests.
@@ -863,9 +932,7 @@ void rpc_shutdown_client(struct rpc_clnt *clnt)
{
might_sleep();
- dprintk_rcu("RPC: shutting down %s client for %s\n",
- clnt->cl_program->name,
- rcu_dereference(clnt->cl_xprt)->servername);
+ trace_rpc_clnt_shutdown(clnt);
while (!list_empty(&clnt->cl_tasks)) {
rpc_killall_tasks(clnt);
@@ -880,27 +947,41 @@ EXPORT_SYMBOL_GPL(rpc_shutdown_client);
/*
* Free an RPC client
*/
+static void rpc_free_client_work(struct work_struct *work)
+{
+ struct rpc_clnt *clnt = container_of(work, struct rpc_clnt, cl_work);
+
+ trace_rpc_clnt_free(clnt);
+
+ /* These might block on processes that might allocate memory,
+ * so they cannot be called in rpciod, so they are handled separately
+ * here.
+ */
+ rpc_sysfs_client_destroy(clnt);
+ rpc_clnt_debugfs_unregister(clnt);
+ rpc_free_clid(clnt);
+ rpc_clnt_remove_pipedir(clnt);
+ xprt_put(rcu_dereference_raw(clnt->cl_xprt));
+
+ kfree(clnt);
+ rpciod_down();
+}
static struct rpc_clnt *
rpc_free_client(struct rpc_clnt *clnt)
{
struct rpc_clnt *parent = NULL;
- dprintk_rcu("RPC: destroying %s client for %s\n",
- clnt->cl_program->name,
- rcu_dereference(clnt->cl_xprt)->servername);
+ trace_rpc_clnt_release(clnt);
if (clnt->cl_parent != clnt)
parent = clnt->cl_parent;
- rpc_clnt_debugfs_unregister(clnt);
- rpc_clnt_remove_pipedir(clnt);
rpc_unregister_client(clnt);
rpc_free_iostats(clnt->cl_metrics);
clnt->cl_metrics = NULL;
- xprt_put(rcu_dereference_raw(clnt->cl_xprt));
xprt_iter_destroy(&clnt->cl_xpi);
- rpciod_down();
put_cred(clnt->cl_cred);
- rpc_free_clid(clnt);
- kfree(clnt);
+
+ INIT_WORK(&clnt->cl_work, rpc_free_client_work);
+ schedule_work(&clnt->cl_work);
return parent;
}
@@ -910,18 +991,16 @@ rpc_free_client(struct rpc_clnt *clnt)
static struct rpc_clnt *
rpc_free_auth(struct rpc_clnt *clnt)
{
- if (clnt->cl_auth == NULL)
- return rpc_free_client(clnt);
-
/*
* Note: RPCSEC_GSS may need to send NULL RPC calls in order to
* release remaining GSS contexts. This mechanism ensures
* that it can do so safely.
*/
- atomic_inc(&clnt->cl_count);
- rpcauth_release(clnt->cl_auth);
- clnt->cl_auth = NULL;
- if (atomic_dec_and_test(&clnt->cl_count))
+ if (clnt->cl_auth != NULL) {
+ rpcauth_release(clnt->cl_auth);
+ clnt->cl_auth = NULL;
+ }
+ if (refcount_dec_and_test(&clnt->cl_count))
return rpc_free_client(clnt);
return NULL;
}
@@ -932,12 +1011,10 @@ rpc_free_auth(struct rpc_clnt *clnt)
void
rpc_release_client(struct rpc_clnt *clnt)
{
- dprintk("RPC: rpc_release_client(%p)\n", clnt);
-
do {
if (list_empty(&clnt->cl_tasks))
wake_up(&destroy_wait);
- if (!atomic_dec_and_test(&clnt->cl_count))
+ if (refcount_dec_not_one(&clnt->cl_count))
break;
clnt = rpc_free_auth(clnt);
} while (clnt != NULL);
@@ -1061,8 +1138,13 @@ rpc_task_get_next_xprt(struct rpc_clnt *clnt)
static
void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
{
- if (task->tk_xprt)
- return;
+ if (task->tk_xprt) {
+ if (!(test_bit(XPRT_OFFLINE, &task->tk_xprt->state) &&
+ (task->tk_flags & RPC_TASK_MOVEABLE)))
+ return;
+ xprt_release(task);
+ xprt_put(task->tk_xprt);
+ }
if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
task->tk_xprt = rpc_task_get_first_xprt(clnt);
else
@@ -1072,24 +1154,19 @@ void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
static
void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
{
-
- if (clnt != NULL) {
- rpc_task_set_transport(task, clnt);
- task->tk_client = clnt;
- atomic_inc(&clnt->cl_count);
- if (clnt->cl_softrtry)
- task->tk_flags |= RPC_TASK_SOFT;
- if (clnt->cl_softerr)
- task->tk_flags |= RPC_TASK_TIMEOUT;
- if (clnt->cl_noretranstimeo)
- task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
- if (atomic_read(&clnt->cl_swapper))
- task->tk_flags |= RPC_TASK_SWAPPER;
- /* Add to the client's list of all tasks */
- spin_lock(&clnt->cl_lock);
- list_add_tail(&task->tk_task, &clnt->cl_tasks);
- spin_unlock(&clnt->cl_lock);
- }
+ rpc_task_set_transport(task, clnt);
+ task->tk_client = clnt;
+ refcount_inc(&clnt->cl_count);
+ if (clnt->cl_softrtry)
+ task->tk_flags |= RPC_TASK_SOFT;
+ if (clnt->cl_softerr)
+ task->tk_flags |= RPC_TASK_TIMEOUT;
+ if (clnt->cl_noretranstimeo)
+ task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
+ /* Add to the client's list of all tasks */
+ spin_lock(&clnt->cl_lock);
+ list_add_tail(&task->tk_task, &clnt->cl_tasks);
+ spin_unlock(&clnt->cl_lock);
}
static void
@@ -1099,8 +1176,9 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
task->tk_msg.rpc_proc = msg->rpc_proc;
task->tk_msg.rpc_argp = msg->rpc_argp;
task->tk_msg.rpc_resp = msg->rpc_resp;
- if (msg->rpc_cred != NULL)
- task->tk_msg.rpc_cred = get_cred(msg->rpc_cred);
+ task->tk_msg.rpc_cred = msg->rpc_cred;
+ if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+ get_cred(task->tk_msg.rpc_cred);
}
}
@@ -1125,6 +1203,11 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
struct rpc_task *task;
task = rpc_new_task(task_setup_data);
+ if (IS_ERR(task))
+ return task;
+
+ if (!RPC_IS_ASYNC(task))
+ task->tk_flags |= RPC_TASK_CRED_NOREF;
rpc_task_set_client(task, task_setup_data->rpc_client);
rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
@@ -1222,6 +1305,11 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
* Create an rpc_task to send the data
*/
task = rpc_new_task(&task_setup_data);
+ if (IS_ERR(task)) {
+ xprt_free_bc_request(req);
+ return task;
+ }
+
xprt_init_bc_request(req, task);
task->tk_action = call_bc_encode;
@@ -1247,13 +1335,10 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages,
unsigned int base, unsigned int len,
unsigned int hdrsize)
{
- /* Subtract one to force an extra word of buffer space for the
- * payload's XDR pad to fall into the rcv_buf's tail iovec.
- */
- hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1;
+ hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign;
xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len);
- trace_rpc_reply_pages(req);
+ trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf);
}
EXPORT_SYMBOL_GPL(rpc_prepare_reply_pages);
@@ -1607,7 +1692,8 @@ const char
static void
__rpc_call_rpcerror(struct rpc_task *task, int tk_status, int rpc_status)
{
- task->tk_rpc_status = rpc_status;
+ trace_rpc_call_rpcerror(task, tk_status, rpc_status);
+ rpc_task_set_rpc_status(task, rpc_status);
rpc_exit(task, tk_status);
}
@@ -1630,10 +1716,6 @@ call_start(struct rpc_task *task)
int idx = task->tk_msg.rpc_proc->p_statidx;
trace_rpc_request(task);
- dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid,
- clnt->cl_program->name, clnt->cl_vers,
- rpc_proc_name(task),
- (RPC_IS_ASYNC(task) ? "async" : "sync"));
/* Increment call count (version might not be valid for ping) */
if (clnt->cl_program->version[clnt->cl_vers])
@@ -1649,8 +1731,6 @@ call_start(struct rpc_task *task)
static void
call_reserve(struct rpc_task *task)
{
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_reserveresult;
xprt_reserve(task);
@@ -1666,8 +1746,6 @@ call_reserveresult(struct rpc_task *task)
{
int status = task->tk_status;
- dprint_status(task);
-
/*
* After a call to xprt_reserve(), we must have either
* a request slot or else an error status.
@@ -1683,17 +1761,10 @@ call_reserveresult(struct rpc_task *task)
return;
}
- /*
- * Even though there was an error, we may have acquired
- * a request slot somehow. Make sure not to leak it.
- */
- if (task->tk_rqstp)
- xprt_release(task);
-
switch (status) {
case -ENOMEM:
rpc_delay(task, HZ >> 2);
- /* fall through */
+ fallthrough;
case -EAGAIN: /* woken up; retry */
task->tk_action = call_retry_reserve;
return;
@@ -1708,8 +1779,6 @@ call_reserveresult(struct rpc_task *task)
static void
call_retry_reserve(struct rpc_task *task)
{
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_reserveresult;
xprt_retry_reserve(task);
@@ -1721,8 +1790,6 @@ call_retry_reserve(struct rpc_task *task)
static void
call_refresh(struct rpc_task *task)
{
- dprint_status(task);
-
task->tk_action = call_refreshresult;
task->tk_status = 0;
task->tk_client->cl_stats->rpcauthrefresh++;
@@ -1737,8 +1804,6 @@ call_refreshresult(struct rpc_task *task)
{
int status = task->tk_status;
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_refresh;
switch (status) {
@@ -1750,23 +1815,24 @@ call_refreshresult(struct rpc_task *task)
/* Use rate-limiting and a max number of retries if refresh
* had status 0 but failed to update the cred.
*/
- /* fall through */
+ fallthrough;
case -ETIMEDOUT:
rpc_delay(task, 3*HZ);
- /* fall through */
+ fallthrough;
case -EAGAIN:
status = -EACCES;
- /* fall through */
+ fallthrough;
case -EKEYEXPIRED:
if (!task->tk_cred_retry)
break;
task->tk_cred_retry--;
- dprintk("RPC: %5u %s: retry refresh creds\n",
- task->tk_pid, __func__);
+ trace_rpc_retry_refresh_status(task);
+ return;
+ case -ENOMEM:
+ rpc_delay(task, HZ >> 4);
return;
}
- dprintk("RPC: %5u %s: refresh creds failed with error %d\n",
- task->tk_pid, __func__, status);
+ trace_rpc_refresh_status(task);
rpc_call_rpcerror(task, status);
}
@@ -1783,8 +1849,6 @@ call_allocate(struct rpc_task *task)
const struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
int status;
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_encode;
@@ -1814,7 +1878,7 @@ call_allocate(struct rpc_task *task)
req->rq_rcvsize <<= 2;
status = xprt->ops->buf_alloc(task);
- xprt_inject_disconnect(xprt);
+ trace_rpc_buf_alloc(task, status);
if (status == 0)
return;
if (status != -ENOMEM) {
@@ -1822,8 +1886,6 @@ call_allocate(struct rpc_task *task)
return;
}
- dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
-
if (RPC_IS_ASYNC(task) || !fatal_signal_pending(current)) {
task->tk_action = call_allocate;
rpc_delay(task, HZ>>4);
@@ -1859,7 +1921,6 @@ rpc_xdr_encode(struct rpc_task *task)
req->rq_snd_buf.head[0].iov_len = 0;
xdr_init_encode(&xdr, &req->rq_snd_buf,
req->rq_snd_buf.head[0].iov_base, req);
- xdr_free_bvec(&req->rq_snd_buf);
if (rpc_encode_header(task, &xdr))
return;
@@ -1874,11 +1935,14 @@ call_encode(struct rpc_task *task)
{
if (!rpc_task_need_encode(task))
goto out;
- dprint_status(task);
+
/* Dequeue task from the receive queue while we're encoding */
xprt_request_dequeue_xprt(task);
/* Encode here so that rpcsec_gss can use correct sequence number. */
rpc_xdr_encode(task);
+ /* Add task to reply queue before transmission to avoid races */
+ if (task->tk_status == 0 && rpc_reply_expected(task))
+ task->tk_status = xprt_request_enqueue_receive(task);
/* Did the encode result in an error condition? */
if (task->tk_status != 0) {
/* Was the error nonfatal? */
@@ -1889,12 +1953,11 @@ call_encode(struct rpc_task *task)
break;
case -EKEYEXPIRED:
if (!task->tk_cred_retry) {
- rpc_exit(task, task->tk_status);
+ rpc_call_rpcerror(task, task->tk_status);
} else {
task->tk_action = call_refresh;
task->tk_cred_retry--;
- dprintk("RPC: %5u %s: retry refresh creds\n",
- task->tk_pid, __func__);
+ trace_rpc_retry_refresh_status(task);
}
break;
default:
@@ -1903,9 +1966,6 @@ call_encode(struct rpc_task *task)
return;
}
- /* Add task to reply queue before transmission to avoid races */
- if (rpc_reply_expected(task))
- xprt_request_enqueue_receive(task);
xprt_request_enqueue_transmit(task);
out:
task->tk_action = call_transmit;
@@ -1951,8 +2011,6 @@ call_bind(struct rpc_task *task)
return;
}
- dprint_status(task);
-
task->tk_action = call_bind_status;
if (!xprt_prepare_transmit(task))
return;
@@ -1974,8 +2032,6 @@ call_bind_status(struct rpc_task *task)
return;
}
- dprint_status(task);
- trace_rpc_bind_status(task);
if (task->tk_status >= 0)
goto out_next;
if (xprt_bound(xprt)) {
@@ -1985,12 +2041,10 @@ call_bind_status(struct rpc_task *task)
switch (task->tk_status) {
case -ENOMEM:
- dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid);
rpc_delay(task, HZ >> 2);
goto retry_timeout;
case -EACCES:
- dprintk("RPC: %5u remote rpcbind: RPC program/version "
- "unavailable\n", task->tk_pid);
+ trace_rpcb_prog_unavail_err(task);
/* fail immediately if this is an RPC ping */
if (task->tk_msg.rpc_proc->p_proc == 0) {
status = -EOPNOTSUPP;
@@ -2007,17 +2061,14 @@ call_bind_status(struct rpc_task *task)
case -EAGAIN:
goto retry_timeout;
case -ETIMEDOUT:
- dprintk("RPC: %5u rpcbind request timed out\n",
- task->tk_pid);
+ trace_rpcb_timeout_err(task);
goto retry_timeout;
case -EPFNOSUPPORT:
/* server doesn't support any rpcbind version we know of */
- dprintk("RPC: %5u unrecognized remote rpcbind service\n",
- task->tk_pid);
+ trace_rpcb_bind_version_err(task);
break;
case -EPROTONOSUPPORT:
- dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n",
- task->tk_pid);
+ trace_rpcb_bind_version_err(task);
goto retry_timeout;
case -ECONNREFUSED: /* connection problems */
case -ECONNRESET:
@@ -2028,8 +2079,7 @@ call_bind_status(struct rpc_task *task)
case -EHOSTUNREACH:
case -ENETUNREACH:
case -EPIPE:
- dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
- task->tk_pid, task->tk_status);
+ trace_rpcb_unreachable_err(task);
if (!RPC_IS_SOFTCONN(task)) {
rpc_delay(task, 5*HZ);
goto retry_timeout;
@@ -2037,8 +2087,7 @@ call_bind_status(struct rpc_task *task)
status = task->tk_status;
break;
default:
- dprintk("RPC: %5u unrecognized rpcbind error (%d)\n",
- task->tk_pid, -task->tk_status);
+ trace_rpcb_unrecognized_err(task);
}
rpc_call_rpcerror(task, status);
@@ -2070,10 +2119,6 @@ call_connect(struct rpc_task *task)
return;
}
- dprintk("RPC: %5u call_connect xprt %p %s connected\n",
- task->tk_pid, xprt,
- (xprt_connected(xprt) ? "is" : "is not"));
-
task->tk_action = call_connect_status;
if (task->tk_status < 0)
return;
@@ -2101,7 +2146,6 @@ call_connect_status(struct rpc_task *task)
return;
}
- dprint_status(task);
trace_rpc_connect_status(task);
if (task->tk_status == 0) {
@@ -2123,7 +2167,7 @@ call_connect_status(struct rpc_task *task)
rpc_force_rebind(clnt);
goto out_retry;
}
- /* fall through */
+ fallthrough;
case -ECONNRESET:
case -ECONNABORTED:
case -ENETDOWN:
@@ -2137,11 +2181,36 @@ call_connect_status(struct rpc_task *task)
break;
/* retry with existing socket, after a delay */
rpc_delay(task, 3*HZ);
- /* fall through */
+ fallthrough;
case -EADDRINUSE:
case -ENOTCONN:
case -EAGAIN:
case -ETIMEDOUT:
+ if (!(task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) &&
+ (task->tk_flags & RPC_TASK_MOVEABLE) &&
+ test_bit(XPRT_REMOVE, &xprt->state)) {
+ struct rpc_xprt *saved = task->tk_xprt;
+ struct rpc_xprt_switch *xps;
+
+ rcu_read_lock();
+ xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+ rcu_read_unlock();
+ if (xps->xps_nxprts > 1) {
+ long value;
+
+ xprt_release(task);
+ value = atomic_long_dec_return(&xprt->queuelen);
+ if (value == 0)
+ rpc_xprt_switch_remove_xprt(xps, saved,
+ true);
+ xprt_put(saved);
+ task->tk_xprt = NULL;
+ task->tk_action = call_start;
+ }
+ xprt_switch_put(xps);
+ if (!task->tk_xprt)
+ return;
+ }
goto out_retry;
case -ENOBUFS:
rpc_delay(task, HZ >> 2);
@@ -2169,8 +2238,6 @@ call_transmit(struct rpc_task *task)
return;
}
- dprint_status(task);
-
task->tk_action = call_transmit_status;
if (!xprt_prepare_transmit(task))
return;
@@ -2205,7 +2272,6 @@ call_transmit_status(struct rpc_task *task)
switch (task->tk_status) {
default:
- dprint_status(task);
break;
case -EBADMSG:
task->tk_status = 0;
@@ -2217,9 +2283,10 @@ call_transmit_status(struct rpc_task *task)
* socket just returned a connection error,
* then hold onto the transport lock.
*/
+ case -ENOMEM:
case -ENOBUFS:
rpc_delay(task, HZ>>2);
- /* fall through */
+ fallthrough;
case -EBADSLT:
case -EAGAIN:
task->tk_action = call_transmit;
@@ -2238,7 +2305,7 @@ call_transmit_status(struct rpc_task *task)
rpc_call_rpcerror(task, task->tk_status);
return;
}
- /* fall through */
+ fallthrough;
case -ECONNRESET:
case -ECONNABORTED:
case -EADDRINUSE:
@@ -2287,8 +2354,6 @@ call_bc_transmit_status(struct rpc_task *task)
if (rpc_task_transmitted(task))
task->tk_status = 0;
- dprint_status(task);
-
switch (task->tk_status) {
case 0:
/* Success */
@@ -2302,9 +2367,10 @@ call_bc_transmit_status(struct rpc_task *task)
case -ENOTCONN:
case -EPIPE:
break;
+ case -ENOMEM:
case -ENOBUFS:
rpc_delay(task, HZ>>2);
- /* fall through */
+ fallthrough;
case -EBADSLT:
case -EAGAIN:
task->tk_status = 0;
@@ -2348,8 +2414,6 @@ call_status(struct rpc_task *task)
if (!task->tk_msg.rpc_proc->p_proc)
trace_xprt_ping(task->tk_xprt, task->tk_status);
- dprint_status(task);
-
status = task->tk_status;
if (status >= 0) {
task->tk_action = call_decode;
@@ -2371,7 +2435,7 @@ call_status(struct rpc_task *task)
* were a timeout.
*/
rpc_delay(task, 3*HZ);
- /* fall through */
+ fallthrough;
case -ETIMEDOUT:
break;
case -ECONNREFUSED:
@@ -2382,10 +2446,15 @@ call_status(struct rpc_task *task)
break;
case -EADDRINUSE:
rpc_delay(task, 3*HZ);
- /* fall through */
+ fallthrough;
case -EPIPE:
case -EAGAIN:
break;
+ case -ENFILE:
+ case -ENOBUFS:
+ case -ENOMEM:
+ rpc_delay(task, HZ>>2);
+ break;
case -EIO:
/* shutdown or soft timeout */
goto out_exit;
@@ -2396,7 +2465,8 @@ call_status(struct rpc_task *task)
goto out_exit;
}
task->tk_action = call_encode;
- rpc_check_timeout(task);
+ if (status != -ECONNRESET && status != -ECONNABORTED)
+ rpc_check_timeout(task);
return;
out_exit:
rpc_call_rpcerror(task, status);
@@ -2416,10 +2486,13 @@ rpc_check_timeout(struct rpc_task *task)
{
struct rpc_clnt *clnt = task->tk_client;
+ if (RPC_SIGNALLED(task))
+ return;
+
if (xprt_adjust_timeout(task->tk_rqstp) == 0)
return;
- dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid);
+ trace_rpc_timeout_status(task);
task->tk_timeouts++;
if (RPC_IS_SOFTCONN(task) && !rpc_check_connected(task->tk_rqstp)) {
@@ -2478,8 +2551,6 @@ call_decode(struct rpc_task *task)
struct xdr_stream xdr;
int err;
- dprint_status(task);
-
if (!task->tk_msg.rpc_proc->p_decode) {
task->tk_action = rpc_exit_task;
return;
@@ -2495,12 +2566,6 @@ call_decode(struct rpc_task *task)
}
/*
- * Ensure that we see all writes made by xprt_complete_rqst()
- * before it changed req->rq_reply_bytes_recvd.
- */
- smp_rmb();
-
- /*
* Did we ever call xprt_complete_rqst()? If not, we should assume
* the message is incomplete.
*/
@@ -2508,7 +2573,13 @@ call_decode(struct rpc_task *task)
if (!req->rq_reply_bytes_recvd)
goto out;
+ /* Ensure that we see all writes made by xprt_complete_rqst()
+ * before it changed req->rq_reply_bytes_recvd.
+ */
+ smp_rmb();
+
req->rq_rcv_buf.len = req->rq_private_buf.len;
+ trace_rpc_xdr_recvfrom(task, &req->rq_rcv_buf);
/* Check that the softirq receive buffer is valid */
WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf,
@@ -2522,8 +2593,6 @@ out:
case 0:
task->tk_action = rpc_exit_task;
task->tk_status = rpcauth_unwrap_resp(task, &xdr);
- dprintk("RPC: %5u %s result %d\n",
- task->tk_pid, __func__, task->tk_status);
return;
case -EAGAIN:
task->tk_status = 0;
@@ -2644,7 +2713,7 @@ out_unparsable:
out_verifier:
trace_rpc_bad_verifier(task);
- goto out_garbage;
+ goto out_err;
out_msg_denied:
error = -EACCES;
@@ -2711,17 +2780,22 @@ static const struct rpc_procinfo rpcproc_null = {
.p_decode = rpcproc_decode_null,
};
-static int rpc_ping(struct rpc_clnt *clnt)
+static const struct rpc_procinfo rpcproc_null_noreply = {
+ .p_encode = rpcproc_encode_null,
+};
+
+static void
+rpc_null_call_prepare(struct rpc_task *task, void *data)
{
- struct rpc_message msg = {
- .rpc_proc = &rpcproc_null,
- };
- int err;
- err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN |
- RPC_TASK_NULLCREDS);
- return err;
+ task->tk_flags &= ~RPC_TASK_NO_RETRANS_TIMEOUT;
+ rpc_call_start(task);
}
+static const struct rpc_call_ops rpc_null_ops = {
+ .rpc_call_prepare = rpc_null_call_prepare,
+ .rpc_call_done = rpc_default_callback,
+};
+
static
struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt,
struct rpc_xprt *xprt, struct rpc_cred *cred, int flags,
@@ -2735,9 +2809,10 @@ struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt,
.rpc_xprt = xprt,
.rpc_message = &msg,
.rpc_op_cred = cred,
- .callback_ops = (ops != NULL) ? ops : &rpc_default_ops,
+ .callback_ops = ops ?: &rpc_null_ops,
.callback_data = data,
- .flags = flags | RPC_TASK_NULLCREDS,
+ .flags = flags | RPC_TASK_SOFT | RPC_TASK_SOFTCONN |
+ RPC_TASK_NULLCREDS,
};
return rpc_run_task(&task_setup_data);
@@ -2749,6 +2824,41 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int
}
EXPORT_SYMBOL_GPL(rpc_call_null);
+static int rpc_ping(struct rpc_clnt *clnt)
+{
+ struct rpc_task *task;
+ int status;
+
+ task = rpc_call_null_helper(clnt, NULL, NULL, 0, NULL, NULL);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = task->tk_status;
+ rpc_put_task(task);
+ return status;
+}
+
+static int rpc_ping_noreply(struct rpc_clnt *clnt)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &rpcproc_null_noreply,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .rpc_message = &msg,
+ .callback_ops = &rpc_null_ops,
+ .flags = RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS,
+ };
+ struct rpc_task *task;
+ int status;
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = task->tk_status;
+ rpc_put_task(task);
+ return status;
+}
+
struct rpc_cb_add_xprt_calldata {
struct rpc_xprt_switch *xps;
struct rpc_xprt *xprt;
@@ -2772,6 +2882,7 @@ static void rpc_cb_add_xprt_release(void *calldata)
}
static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
+ .rpc_call_prepare = rpc_null_call_prepare,
.rpc_call_done = rpc_cb_add_xprt_done,
.rpc_release = rpc_cb_add_xprt_release,
};
@@ -2790,7 +2901,16 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
struct rpc_cb_add_xprt_calldata *data;
struct rpc_task *task;
- data = kmalloc(sizeof(*data), GFP_NOFS);
+ if (xps->xps_nunique_destaddr_xprts + 1 > clnt->cl_max_connect) {
+ rcu_read_lock();
+ pr_warn("SUNRPC: reached max allowed number (%d) did not add "
+ "transport to server: %s\n", clnt->cl_max_connect,
+ rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR));
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
data->xps = xprt_switch_get(xps);
@@ -2800,17 +2920,42 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
goto success;
}
- task = rpc_call_null_helper(clnt, xprt, NULL,
- RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS,
+ task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_ASYNC,
&rpc_cb_add_xprt_call_ops, data);
if (IS_ERR(task))
return PTR_ERR(task);
+
+ data->xps->xps_nunique_destaddr_xprts++;
rpc_put_task(task);
success:
return 1;
}
EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
+static int rpc_clnt_add_xprt_helper(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ struct rpc_add_xprt_test *data)
+{
+ struct rpc_task *task;
+ int status = -EADDRINUSE;
+
+ /* Test the connection */
+ task = rpc_call_null_helper(clnt, xprt, NULL, 0, NULL, NULL);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ status = task->tk_status;
+ rpc_put_task(task);
+
+ if (status < 0)
+ return status;
+
+ /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */
+ data->add_xprt_test(clnt, xprt, data->data);
+
+ return 0;
+}
+
/**
* rpc_clnt_setup_test_and_add_xprt()
*
@@ -2834,8 +2979,6 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt,
struct rpc_xprt *xprt,
void *data)
{
- struct rpc_task *task;
- struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data;
int status = -EADDRINUSE;
xprt = xprt_get(xprt);
@@ -2844,33 +2987,19 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt,
if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
goto out_err;
- /* Test the connection */
- task = rpc_call_null_helper(clnt, xprt, NULL,
- RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS,
- NULL, NULL);
- if (IS_ERR(task)) {
- status = PTR_ERR(task);
- goto out_err;
- }
- status = task->tk_status;
- rpc_put_task(task);
-
+ status = rpc_clnt_add_xprt_helper(clnt, xprt, data);
if (status < 0)
goto out_err;
- /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */
- xtest->add_xprt_test(clnt, xprt, xtest->data);
-
- xprt_put(xprt);
- xprt_switch_put(xps);
-
- /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */
- return 1;
+ status = 1;
out_err:
xprt_put(xprt);
xprt_switch_put(xps);
- pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not added\n",
- status, xprt->address_strings[RPC_DISPLAY_ADDR]);
+ if (status < 0)
+ pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not "
+ "added\n", status,
+ xprt->address_strings[RPC_DISPLAY_ADDR]);
+ /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */
return status;
}
EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt);
@@ -2901,7 +3030,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
unsigned long connect_timeout;
unsigned long reconnect_timeout;
unsigned char resvport, reuseport;
- int ret = 0;
+ int ret = 0, ident;
rcu_read_lock();
xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
@@ -2915,8 +3044,11 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
reuseport = xprt->reuseport;
connect_timeout = xprt->connect_timeout;
reconnect_timeout = xprt->max_reconnect_timeout;
+ ident = xprt->xprt_class->ident;
rcu_read_unlock();
+ if (!xprtargs->ident)
+ xprtargs->ident = ident;
xprt = xprt_create_transport(xprtargs);
if (IS_ERR(xprt)) {
ret = PTR_ERR(xprt);
@@ -2944,6 +3076,110 @@ out_put_switch:
}
EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
+static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ struct rpc_add_xprt_test *data)
+{
+ struct rpc_xprt_switch *xps;
+ struct rpc_xprt *main_xprt;
+ int status = 0;
+
+ xprt_get(xprt);
+
+ rcu_read_lock();
+ main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+ xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+ status = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr,
+ (struct sockaddr *)&main_xprt->addr);
+ rcu_read_unlock();
+ xprt_put(main_xprt);
+ if (status || !test_bit(XPRT_OFFLINE, &xprt->state))
+ goto out;
+
+ status = rpc_clnt_add_xprt_helper(clnt, xprt, data);
+out:
+ xprt_put(xprt);
+ xprt_switch_put(xps);
+ return status;
+}
+
+/* rpc_clnt_probe_trunked_xprt -- probe offlined transport for session trunking
+ * @clnt rpc_clnt structure
+ *
+ * For each offlined transport found in the rpc_clnt structure call
+ * the function rpc_xprt_probe_trunked() which will determine if this
+ * transport still belongs to the trunking group.
+ */
+void rpc_clnt_probe_trunked_xprts(struct rpc_clnt *clnt,
+ struct rpc_add_xprt_test *data)
+{
+ struct rpc_xprt_iter xpi;
+ int ret;
+
+ ret = rpc_clnt_xprt_iter_offline_init(clnt, &xpi);
+ if (ret)
+ return;
+ for (;;) {
+ struct rpc_xprt *xprt = xprt_iter_get_next(&xpi);
+
+ if (!xprt)
+ break;
+ ret = rpc_xprt_probe_trunked(clnt, xprt, data);
+ xprt_put(xprt);
+ if (ret < 0)
+ break;
+ xprt_iter_rewind(&xpi);
+ }
+ xprt_iter_destroy(&xpi);
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_probe_trunked_xprts);
+
+static int rpc_xprt_offline(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ void *data)
+{
+ struct rpc_xprt *main_xprt;
+ struct rpc_xprt_switch *xps;
+ int err = 0;
+
+ xprt_get(xprt);
+
+ rcu_read_lock();
+ main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+ xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+ err = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr,
+ (struct sockaddr *)&main_xprt->addr);
+ rcu_read_unlock();
+ xprt_put(main_xprt);
+ if (err)
+ goto out;
+
+ if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) {
+ err = -EINTR;
+ goto out;
+ }
+ xprt_set_offline_locked(xprt, xps);
+
+ xprt_release_write(xprt, NULL);
+out:
+ xprt_put(xprt);
+ xprt_switch_put(xps);
+ return err;
+}
+
+/* rpc_clnt_manage_trunked_xprts -- offline trunked transports
+ * @clnt rpc_clnt structure
+ *
+ * For each active transport found in the rpc_clnt structure call
+ * the function rpc_xprt_offline() which will identify trunked transports
+ * and will mark them offline.
+ */
+void rpc_clnt_manage_trunked_xprts(struct rpc_clnt *clnt)
+{
+ rpc_clnt_iterate_for_each_xprt(clnt, rpc_xprt_offline, NULL);
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_manage_trunked_xprts);
+
struct connect_timeout_data {
unsigned long connect_timeout;
unsigned long reconnect_timeout;
@@ -2986,8 +3222,22 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
}
EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put);
+void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+ struct rpc_xprt_switch *xps;
+
+ rcu_read_lock();
+ xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+ rcu_read_unlock();
+ xprt_set_online_locked(xprt, xps);
+}
+
void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
{
+ if (rpc_clnt_xprt_switch_has_addr(clnt,
+ (const struct sockaddr *)&xprt->addr)) {
+ return rpc_clnt_xprt_set_online(clnt, xprt);
+ }
rcu_read_lock();
rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
xprt);
@@ -2995,6 +3245,19 @@ void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
}
EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt);
+void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+ struct rpc_xprt_switch *xps;
+
+ rcu_read_lock();
+ xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+ rpc_xprt_switch_remove_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
+ xprt, 0);
+ xps->xps_nunique_destaddr_xprts--;
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_remove_xprt);
+
bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
const struct sockaddr *sap)
{
@@ -3066,6 +3329,8 @@ rpc_clnt_swap_activate_callback(struct rpc_clnt *clnt,
int
rpc_clnt_swap_activate(struct rpc_clnt *clnt)
{
+ while (clnt != clnt->cl_parent)
+ clnt = clnt->cl_parent;
if (atomic_inc_return(&clnt->cl_swapper) == 1)
return rpc_clnt_iterate_for_each_xprt(clnt,
rpc_clnt_swap_activate_callback, NULL);
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index fd9bca242724..a176d5a0b0ee 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -8,14 +8,14 @@
#include <linux/debugfs.h>
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/clnt.h>
+
#include "netns.h"
+#include "fail.h"
static struct dentry *topdir;
static struct dentry *rpc_clnt_dir;
static struct dentry *rpc_xprt_dir;
-unsigned int rpc_inject_disconnect;
-
static int
tasks_show(struct seq_file *f, void *v)
{
@@ -90,7 +90,7 @@ static int tasks_open(struct inode *inode, struct file *filp)
struct seq_file *seq = filp->private_data;
struct rpc_clnt *clnt = seq->private = inode->i_private;
- if (!atomic_inc_not_zero(&clnt->cl_count)) {
+ if (!refcount_inc_not_zero(&clnt->cl_count)) {
seq_release(inode, filp);
ret = -EINVAL;
}
@@ -128,13 +128,13 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n
return 0;
len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
xprt->debugfs->d_name.name);
- if (len > sizeof(name))
+ if (len >= sizeof(name))
return -1;
if (*nump == 0)
strcpy(link, "xprt");
else {
len = snprintf(link, sizeof(link), "xprt%d", *nump);
- if (len > sizeof(link))
+ if (len >= sizeof(link))
return -1;
}
debugfs_create_symlink(link, clnt->cl_debugfs, name);
@@ -235,8 +235,6 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
/* make tasks file */
debugfs_create_file("info", S_IFREG | 0400, xprt->debugfs, xprt,
&xprt_info_fops);
-
- atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
}
void
@@ -246,56 +244,33 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt)
xprt->debugfs = NULL;
}
-static int
-fault_open(struct inode *inode, struct file *filp)
-{
- filp->private_data = kmalloc(128, GFP_KERNEL);
- if (!filp->private_data)
- return -ENOMEM;
- return 0;
-}
+#if IS_ENABLED(CONFIG_FAIL_SUNRPC)
+struct fail_sunrpc_attr fail_sunrpc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+};
+EXPORT_SYMBOL_GPL(fail_sunrpc);
-static int
-fault_release(struct inode *inode, struct file *filp)
+static void fail_sunrpc_init(void)
{
- kfree(filp->private_data);
- return 0;
-}
+ struct dentry *dir;
-static ssize_t
-fault_disconnect_read(struct file *filp, char __user *user_buf,
- size_t len, loff_t *offset)
-{
- char *buffer = (char *)filp->private_data;
- size_t size;
+ dir = fault_create_debugfs_attr("fail_sunrpc", NULL,
+ &fail_sunrpc.attr);
- size = sprintf(buffer, "%u\n", rpc_inject_disconnect);
- return simple_read_from_buffer(user_buf, len, offset, buffer, size);
-}
+ debugfs_create_bool("ignore-client-disconnect", S_IFREG | 0600, dir,
+ &fail_sunrpc.ignore_client_disconnect);
-static ssize_t
-fault_disconnect_write(struct file *filp, const char __user *user_buf,
- size_t len, loff_t *offset)
+ debugfs_create_bool("ignore-server-disconnect", S_IFREG | 0600, dir,
+ &fail_sunrpc.ignore_server_disconnect);
+
+ debugfs_create_bool("ignore-cache-wait", S_IFREG | 0600, dir,
+ &fail_sunrpc.ignore_cache_wait);
+}
+#else
+static void fail_sunrpc_init(void)
{
- char buffer[16];
-
- if (len >= sizeof(buffer))
- len = sizeof(buffer) - 1;
- if (copy_from_user(buffer, user_buf, len))
- return -EFAULT;
- buffer[len] = '\0';
- if (kstrtouint(buffer, 10, &rpc_inject_disconnect))
- return -EINVAL;
- return len;
}
-
-static const struct file_operations fault_disconnect_fops = {
- .owner = THIS_MODULE,
- .open = fault_open,
- .read = fault_disconnect_read,
- .write = fault_disconnect_write,
- .release = fault_release,
-};
+#endif
void __exit
sunrpc_debugfs_exit(void)
@@ -309,16 +284,11 @@ sunrpc_debugfs_exit(void)
void __init
sunrpc_debugfs_init(void)
{
- struct dentry *rpc_fault_dir;
-
topdir = debugfs_create_dir("sunrpc", NULL);
rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
rpc_xprt_dir = debugfs_create_dir("rpc_xprt", topdir);
- rpc_fault_dir = debugfs_create_dir("inject_fault", topdir);
-
- debugfs_create_file("disconnect", S_IFREG | 0400, rpc_fault_dir, NULL,
- &fault_disconnect_fops);
+ fail_sunrpc_init();
}
diff --git a/net/sunrpc/fail.h b/net/sunrpc/fail.h
new file mode 100644
index 000000000000..4b4b500df428
--- /dev/null
+++ b/net/sunrpc/fail.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2021, Oracle. All rights reserved.
+ */
+
+#ifndef _NET_SUNRPC_FAIL_H_
+#define _NET_SUNRPC_FAIL_H_
+
+#include <linux/fault-inject.h>
+
+#if IS_ENABLED(CONFIG_FAULT_INJECTION)
+
+struct fail_sunrpc_attr {
+ struct fault_attr attr;
+
+ bool ignore_client_disconnect;
+ bool ignore_server_disconnect;
+ bool ignore_cache_wait;
+};
+
+extern struct fail_sunrpc_attr fail_sunrpc;
+
+#endif /* CONFIG_FAULT_INJECTION */
+
+#endif /* _NET_SUNRPC_FAIL_H_ */
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 39e14d5edaf1..0b6034fab9ab 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -197,7 +197,7 @@ static struct inode *
rpc_alloc_inode(struct super_block *sb)
{
struct rpc_inode *rpci;
- rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
+ rpci = alloc_inode_sb(sb, rpc_inode_cachep, GFP_KERNEL);
if (!rpci)
return NULL;
return &rpci->vfs_inode;
@@ -423,7 +423,7 @@ rpc_info_open(struct inode *inode, struct file *file)
spin_lock(&file->f_path.dentry->d_lock);
if (!d_unhashed(file->f_path.dentry))
clnt = RPC_I(inode)->private;
- if (clnt != NULL && atomic_inc_not_zero(&clnt->cl_count)) {
+ if (clnt != NULL && refcount_inc_not_zero(&clnt->cl_count)) {
spin_unlock(&file->f_path.dentry->d_lock);
m->private = clnt;
} else {
@@ -478,6 +478,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode)
inode->i_fop = &simple_dir_operations;
inode->i_op = &simple_dir_inode_operations;
inc_nlink(inode);
+ break;
default:
break;
}
@@ -599,9 +600,9 @@ static int __rpc_rmdir(struct inode *dir, struct dentry *dentry)
dget(dentry);
ret = simple_rmdir(dir, dentry);
+ d_drop(dentry);
if (!ret)
fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
dput(dentry);
return ret;
}
@@ -612,9 +613,9 @@ static int __rpc_unlink(struct inode *dir, struct dentry *dentry)
dget(dentry);
ret = simple_unlink(dir, dentry);
+ d_drop(dentry);
if (!ret)
fsnotify_unlink(dir, dentry);
- d_delete(dentry);
dput(dentry);
return ret;
}
@@ -781,7 +782,8 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
}
/**
- * rpc_mkpipe - make an rpc_pipefs file for kernel<->userspace communication
+ * rpc_mkpipe_dentry - make an rpc_pipefs file for kernel<->userspace
+ * communication
* @parent: dentry of directory to create new "pipe" in
* @name: name of pipe
* @private: private data to associate with the pipe, for the caller's use
@@ -1317,6 +1319,7 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
q.len = strlen(gssd_dummy_clnt_dir[0].name);
clnt_dentry = d_hash_and_lookup(gssd_dentry, &q);
if (!clnt_dentry) {
+ __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
pipe_dentry = ERR_PTR(-ENOENT);
goto out;
}
@@ -1509,6 +1512,6 @@ err_notifier:
void unregister_rpc_pipefs(void)
{
rpc_clients_notifier_unregister();
- kmem_cache_destroy(rpc_inode_cachep);
unregister_filesystem(&rpc_pipe_fs_type);
+ kmem_cache_destroy(rpc_inode_cachep);
}
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 4a020b688860..5a8e6d46809a 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -31,11 +31,9 @@
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/xprtsock.h>
-#include "netns.h"
+#include <trace/events/sunrpc.h>
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_BIND
-#endif
+#include "netns.h"
#define RPCBIND_SOCK_PATHNAME "/var/run/rpcbind.sock"
@@ -216,10 +214,6 @@ static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt,
sn->rpcb_is_af_local = is_af_local ? 1 : 0;
smp_wmb();
sn->rpcb_users = 1;
- dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: "
- "%p, rpcb_local_clnt4: %p) for net %x%s\n",
- sn->rpcb_local_clnt, sn->rpcb_local_clnt4,
- net->ns.inum, (net == &init_net) ? " (init_net)" : "");
}
/*
@@ -261,19 +255,13 @@ static int rpcb_create_local_unix(struct net *net)
*/
clnt = rpc_create(&args);
if (IS_ERR(clnt)) {
- dprintk("RPC: failed to create AF_LOCAL rpcbind "
- "client (errno %ld).\n", PTR_ERR(clnt));
result = PTR_ERR(clnt);
goto out;
}
clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
- if (IS_ERR(clnt4)) {
- dprintk("RPC: failed to bind second program to "
- "rpcbind v4 client (errno %ld).\n",
- PTR_ERR(clnt4));
+ if (IS_ERR(clnt4))
clnt4 = NULL;
- }
rpcb_set_local(net, clnt, clnt4, true);
@@ -309,8 +297,6 @@ static int rpcb_create_local_net(struct net *net)
clnt = rpc_create(&args);
if (IS_ERR(clnt)) {
- dprintk("RPC: failed to create local rpcbind "
- "client (errno %ld).\n", PTR_ERR(clnt));
result = PTR_ERR(clnt);
goto out;
}
@@ -321,12 +307,8 @@ static int rpcb_create_local_net(struct net *net)
* v4 upcalls.
*/
clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
- if (IS_ERR(clnt4)) {
- dprintk("RPC: failed to bind second program to "
- "rpcbind v4 client (errno %ld).\n",
- PTR_ERR(clnt4));
+ if (IS_ERR(clnt4))
clnt4 = NULL;
- }
rpcb_set_local(net, clnt, clnt4, false);
@@ -362,13 +344,15 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename,
const char *hostname,
struct sockaddr *srvaddr, size_t salen,
int proto, u32 version,
- const struct cred *cred)
+ const struct cred *cred,
+ const struct rpc_timeout *timeo)
{
struct rpc_create_args args = {
.net = net,
.protocol = proto,
.address = srvaddr,
.addrsize = salen,
+ .timeout = timeo,
.servername = hostname,
.nodename = nodename,
.program = &rpcb_program,
@@ -403,11 +387,8 @@ static int rpcb_register_call(struct sunrpc_net *sn, struct rpc_clnt *clnt, stru
msg->rpc_resp = &result;
error = rpc_call_sync(clnt, msg, flags);
- if (error < 0) {
- dprintk("RPC: failed to contact local rpcbind "
- "server (errno %d).\n", -error);
+ if (error < 0)
return error;
- }
if (!result)
return -EACCES;
@@ -461,9 +442,7 @@ int rpcb_register(struct net *net, u32 prog, u32 vers, int prot, unsigned short
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
bool is_set = false;
- dprintk("RPC: %sregistering (%u, %u, %d, %u) with local "
- "rpcbind\n", (port ? "" : "un"),
- prog, vers, prot, port);
+ trace_pmap_register(prog, vers, prot, port);
msg.rpc_proc = &rpcb_procedures2[RPCBPROC_UNSET];
if (port != 0) {
@@ -489,11 +468,6 @@ static int rpcb_register_inet4(struct sunrpc_net *sn,
map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
- dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
- "local rpcbind\n", (port ? "" : "un"),
- map->r_prog, map->r_vers,
- map->r_addr, map->r_netid);
-
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
if (port != 0) {
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
@@ -520,11 +494,6 @@ static int rpcb_register_inet6(struct sunrpc_net *sn,
map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
- dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
- "local rpcbind\n", (port ? "" : "un"),
- map->r_prog, map->r_vers,
- map->r_addr, map->r_netid);
-
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
if (port != 0) {
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
@@ -541,9 +510,7 @@ static int rpcb_unregister_all_protofamilies(struct sunrpc_net *sn,
{
struct rpcbind_args *map = msg->rpc_argp;
- dprintk("RPC: unregistering [%u, %u, '%s'] with "
- "local rpcbind\n",
- map->r_prog, map->r_vers, map->r_netid);
+ trace_rpcb_unregister(map->r_prog, map->r_vers, map->r_netid);
map->r_addr = "";
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
@@ -615,6 +582,8 @@ int rpcb_v4_register(struct net *net, const u32 program, const u32 version,
if (address == NULL)
return rpcb_unregister_all_protofamilies(sn, &msg);
+ trace_rpcb_register(map.r_prog, map.r_vers, map.r_addr, map.r_netid);
+
switch (address->sa_family) {
case AF_INET:
return rpcb_register_inet4(sn, address, &msg);
@@ -693,18 +662,12 @@ void rpcb_getport_async(struct rpc_task *task)
rcu_read_unlock();
xprt = xprt_get(task->tk_xprt);
- dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
- task->tk_pid, __func__,
- xprt->servername, clnt->cl_prog, clnt->cl_vers, xprt->prot);
-
/* Put self on the wait queue to ensure we get notified if
* some other task is already attempting to bind the port */
rpc_sleep_on_timeout(&xprt->binding, task,
NULL, jiffies + xprt->bind_timeout);
if (xprt_test_and_set_binding(xprt)) {
- dprintk("RPC: %5u %s: waiting for another binder\n",
- task->tk_pid, __func__);
xprt_put(xprt);
return;
}
@@ -712,8 +675,6 @@ void rpcb_getport_async(struct rpc_task *task)
/* Someone else may have bound if we slept */
if (xprt_bound(xprt)) {
status = 0;
- dprintk("RPC: %5u %s: already bound\n",
- task->tk_pid, __func__);
goto bailout_nofree;
}
@@ -732,38 +693,30 @@ void rpcb_getport_async(struct rpc_task *task)
break;
default:
status = -EAFNOSUPPORT;
- dprintk("RPC: %5u %s: bad address family\n",
- task->tk_pid, __func__);
goto bailout_nofree;
}
if (proc == NULL) {
xprt->bind_index = 0;
status = -EPFNOSUPPORT;
- dprintk("RPC: %5u %s: no more getport versions available\n",
- task->tk_pid, __func__);
goto bailout_nofree;
}
- dprintk("RPC: %5u %s: trying rpcbind version %u\n",
- task->tk_pid, __func__, bind_version);
+ trace_rpcb_getport(clnt, task, bind_version);
rpcb_clnt = rpcb_create(xprt->xprt_net,
clnt->cl_nodename,
xprt->servername, sap, salen,
xprt->prot, bind_version,
- clnt->cl_cred);
+ clnt->cl_cred,
+ task->tk_client->cl_timeout);
if (IS_ERR(rpcb_clnt)) {
status = PTR_ERR(rpcb_clnt);
- dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
- task->tk_pid, __func__, PTR_ERR(rpcb_clnt));
goto bailout_nofree;
}
- map = kzalloc(sizeof(struct rpcbind_args), GFP_NOFS);
+ map = kzalloc(sizeof(struct rpcbind_args), rpc_task_gfp_mask());
if (!map) {
status = -ENOMEM;
- dprintk("RPC: %5u %s: no memory available\n",
- task->tk_pid, __func__);
goto bailout_release_client;
}
map->r_prog = clnt->cl_prog;
@@ -777,11 +730,9 @@ void rpcb_getport_async(struct rpc_task *task)
case RPCBVERS_4:
case RPCBVERS_3:
map->r_netid = xprt->address_strings[RPC_DISPLAY_NETID];
- map->r_addr = rpc_sockaddr2uaddr(sap, GFP_NOFS);
+ map->r_addr = rpc_sockaddr2uaddr(sap, rpc_task_gfp_mask());
if (!map->r_addr) {
status = -ENOMEM;
- dprintk("RPC: %5u %s: no memory available\n",
- task->tk_pid, __func__);
goto bailout_free_args;
}
map->r_owner = "";
@@ -795,12 +746,6 @@ void rpcb_getport_async(struct rpc_task *task)
child = rpcb_call_async(rpcb_clnt, map, proc);
rpc_release_client(rpcb_clnt);
- if (IS_ERR(child)) {
- /* rpcb_map_release() has freed the arguments */
- dprintk("RPC: %5u %s: rpc_run_task failed\n",
- task->tk_pid, __func__);
- return;
- }
xprt->stat.bind_count++;
rpc_put_task(child);
@@ -824,34 +769,33 @@ static void rpcb_getport_done(struct rpc_task *child, void *data)
{
struct rpcbind_args *map = data;
struct rpc_xprt *xprt = map->r_xprt;
- int status = child->tk_status;
+
+ map->r_status = child->tk_status;
/* Garbage reply: retry with a lesser rpcbind version */
- if (status == -EIO)
- status = -EPROTONOSUPPORT;
+ if (map->r_status == -EIO)
+ map->r_status = -EPROTONOSUPPORT;
/* rpcbind server doesn't support this rpcbind protocol version */
- if (status == -EPROTONOSUPPORT)
+ if (map->r_status == -EPROTONOSUPPORT)
xprt->bind_index++;
- if (status < 0) {
+ if (map->r_status < 0) {
/* rpcbind server not available on remote host? */
- xprt->ops->set_port(xprt, 0);
+ map->r_port = 0;
+
} else if (map->r_port == 0) {
/* Requested RPC service wasn't registered on remote host */
- xprt->ops->set_port(xprt, 0);
- status = -EACCES;
+ map->r_status = -EACCES;
} else {
/* Succeeded */
- xprt->ops->set_port(xprt, map->r_port);
- xprt_set_bound(xprt);
- status = 0;
+ map->r_status = 0;
}
- dprintk("RPC: %5u rpcb_getport_done(status %d, port %u)\n",
- child->tk_pid, status, map->r_port);
-
- map->r_status = status;
+ trace_rpcb_setport(child, map->r_status, map->r_port);
+ xprt->ops->set_port(xprt, map->r_port);
+ if (map->r_port)
+ xprt_set_bound(xprt);
}
/*
@@ -864,11 +808,6 @@ static void rpcb_enc_mapping(struct rpc_rqst *req, struct xdr_stream *xdr,
const struct rpcbind_args *rpcb = data;
__be32 *p;
- dprintk("RPC: %5u encoding PMAP_%s call (%u, %u, %d, %u)\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name,
- rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port);
-
p = xdr_reserve_space(xdr, RPCB_mappingargs_sz << 2);
*p++ = cpu_to_be32(rpcb->r_prog);
*p++ = cpu_to_be32(rpcb->r_vers);
@@ -890,8 +829,6 @@ static int rpcb_dec_getport(struct rpc_rqst *req, struct xdr_stream *xdr,
return -EIO;
port = be32_to_cpup(p);
- dprintk("RPC: %5u PMAP_%s result: %lu\n", req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name, port);
if (unlikely(port > USHRT_MAX))
return -EIO;
@@ -912,11 +849,6 @@ static int rpcb_dec_set(struct rpc_rqst *req, struct xdr_stream *xdr,
*boolp = 0;
if (*p != xdr_zero)
*boolp = 1;
-
- dprintk("RPC: %5u RPCB_%s call %s\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name,
- (*boolp ? "succeeded" : "failed"));
return 0;
}
@@ -941,12 +873,6 @@ static void rpcb_enc_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
const struct rpcbind_args *rpcb = data;
__be32 *p;
- dprintk("RPC: %5u encoding RPCB_%s call (%u, %u, '%s', '%s')\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name,
- rpcb->r_prog, rpcb->r_vers,
- rpcb->r_netid, rpcb->r_addr);
-
p = xdr_reserve_space(xdr, (RPCB_program_sz + RPCB_version_sz) << 2);
*p++ = cpu_to_be32(rpcb->r_prog);
*p = cpu_to_be32(rpcb->r_vers);
@@ -976,11 +902,8 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
* If the returned universal address is a null string,
* the requested RPC service was not registered.
*/
- if (len == 0) {
- dprintk("RPC: %5u RPCB reply: program not registered\n",
- req->rq_task->tk_pid);
+ if (len == 0)
return 0;
- }
if (unlikely(len > RPCBIND_MAXUADDRLEN))
goto out_fail;
@@ -988,8 +911,6 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, len);
if (unlikely(p == NULL))
goto out_fail;
- dprintk("RPC: %5u RPCB_%s reply: %s\n", req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name, (char *)p);
if (rpc_uaddr2sockaddr(req->rq_xprt->xprt_net, (char *)p, len,
sap, sizeof(address)) == 0)
@@ -999,9 +920,6 @@ static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
return 0;
out_fail:
- dprintk("RPC: %5u malformed RPCB_%s reply\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name);
return -EIO;
}
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 55e900255b0c..be587a308e05 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -27,10 +27,6 @@
#include "sunrpc.h"
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-#define RPCDBG_FACILITY RPCDBG_SCHED
-#endif
-
#define CREATE_TRACE_POINTS
#include <trace/events/sunrpc.h>
@@ -61,6 +57,21 @@ struct workqueue_struct *rpciod_workqueue __read_mostly;
struct workqueue_struct *xprtiod_workqueue __read_mostly;
EXPORT_SYMBOL_GPL(xprtiod_workqueue);
+gfp_t rpc_task_gfp_mask(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ return GFP_KERNEL;
+}
+EXPORT_SYMBOL_GPL(rpc_task_gfp_mask);
+
+bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status)
+{
+ if (cmpxchg(&task->tk_rpc_status, 0, rpc_status) == 0)
+ return true;
+ return false;
+}
+
unsigned long
rpc_task_timeout(const struct rpc_task *task)
{
@@ -85,7 +96,6 @@ __rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
{
if (list_empty(&task->u.tk_wait.timer_list))
return;
- dprintk("RPC: %5u disabling timer\n", task->tk_pid);
task->tk_timeout = 0;
list_del(&task->u.tk_wait.timer_list);
if (list_empty(&queue->timer_list.list))
@@ -111,9 +121,6 @@ static void
__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task,
unsigned long timeout)
{
- dprintk("RPC: %5u setting alarm for %u ms\n",
- task->tk_pid, jiffies_to_msecs(timeout - jiffies));
-
task->tk_timeout = timeout;
if (list_empty(&queue->timer_list.list) || time_before(timeout, queue->timer_list.expires))
rpc_set_queue_timer(queue, timeout);
@@ -194,25 +201,14 @@ static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue,
/*
* Add new request to wait queue.
- *
- * Swapper tasks always get inserted at the head of the queue.
- * This should avoid many nasty memory deadlocks and hopefully
- * improve overall performance.
- * Everyone else gets appended to the queue to ensure proper FIFO behavior.
*/
static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
struct rpc_task *task,
unsigned char queue_priority)
{
- WARN_ON_ONCE(RPC_IS_QUEUED(task));
- if (RPC_IS_QUEUED(task))
- return;
-
INIT_LIST_HEAD(&task->u.tk_wait.timer_list);
if (RPC_IS_PRIORITY(queue))
__rpc_add_wait_queue_priority(queue, task, queue_priority);
- else if (RPC_IS_SWAPPER(task))
- list_add(&task->u.tk_wait.list, &queue->tasks[0]);
else
list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
task->tk_waitqueue = queue;
@@ -220,9 +216,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
/* barrier matches the read in rpc_wake_up_task_queue_locked() */
smp_wmb();
rpc_set_queued(task);
-
- dprintk("RPC: %5u added to queue %p \"%s\"\n",
- task->tk_pid, queue, rpc_qname(queue));
}
/*
@@ -245,8 +238,6 @@ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_tas
else
list_del(&task->u.tk_wait.list);
queue->qlen--;
- dprintk("RPC: %5u removed from queue %p \"%s\"\n",
- task->tk_pid, queue, rpc_qname(queue));
}
static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, unsigned char nr_queues)
@@ -285,7 +276,7 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- freezable_schedule_unsafe();
+ schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
return 0;
@@ -294,9 +285,17 @@ static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS)
static void rpc_task_set_debuginfo(struct rpc_task *task)
{
- static atomic_t rpc_pid;
+ struct rpc_clnt *clnt = task->tk_client;
+
+ /* Might be a task carrying a reverse-direction operation */
+ if (!clnt) {
+ static atomic_t rpc_pid;
- task->tk_pid = atomic_inc_return(&rpc_pid);
+ task->tk_pid = atomic_inc_return(&rpc_pid);
+ return;
+ }
+
+ task->tk_pid = atomic_inc_return(&clnt->cl_pid);
}
#else
static inline void rpc_task_set_debuginfo(struct rpc_task *task)
@@ -341,14 +340,12 @@ static int rpc_complete_task(struct rpc_task *task)
* to enforce taking of the wq->lock and hence avoid races with
* rpc_complete_task().
*/
-int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action)
+int rpc_wait_for_completion_task(struct rpc_task *task)
{
- if (action == NULL)
- action = rpc_wait_bit_killable;
return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
- action, TASK_KILLABLE);
+ rpc_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
}
-EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
+EXPORT_SYMBOL_GPL(rpc_wait_for_completion_task);
/*
* Make an RPC task runnable.
@@ -382,25 +379,32 @@ static void rpc_make_runnable(struct workqueue_struct *wq,
* NB: An RPC task will only receive interrupt-driven events as long
* as it's on a wait queue.
*/
-static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
+static void __rpc_do_sleep_on_priority(struct rpc_wait_queue *q,
struct rpc_task *task,
unsigned char queue_priority)
{
- dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
- task->tk_pid, rpc_qname(q), jiffies);
-
trace_rpc_task_sleep(task, q);
__rpc_add_wait_queue(q, task, queue_priority);
+}
+static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
+ struct rpc_task *task,
+ unsigned char queue_priority)
+{
+ if (WARN_ON_ONCE(RPC_IS_QUEUED(task)))
+ return;
+ __rpc_do_sleep_on_priority(q, task, queue_priority);
}
static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q,
struct rpc_task *task, unsigned long timeout,
unsigned char queue_priority)
{
+ if (WARN_ON_ONCE(RPC_IS_QUEUED(task)))
+ return;
if (time_is_after_jiffies(timeout)) {
- __rpc_sleep_on_priority(q, task, queue_priority);
+ __rpc_do_sleep_on_priority(q, task, queue_priority);
__rpc_add_timer(q, task, timeout);
} else
task->tk_status = -ETIMEDOUT;
@@ -503,9 +507,6 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
struct rpc_wait_queue *queue,
struct rpc_task *task)
{
- dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
- task->tk_pid, jiffies);
-
/* Has the task been executed yet? If not, we cannot wake it up! */
if (!RPC_IS_ACTIVATED(task)) {
printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task);
@@ -517,8 +518,6 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
__rpc_remove_wait_queue(queue, task);
rpc_make_runnable(wq, task);
-
- dprintk("RPC: __rpc_wake_up_task done\n");
}
/*
@@ -607,10 +606,20 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q
struct rpc_task *task;
/*
+ * Service the privileged queue.
+ */
+ q = &queue->tasks[RPC_NR_PRIORITY - 1];
+ if (queue->maxpriority > RPC_PRIORITY_PRIVILEGED && !list_empty(q)) {
+ task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
+ goto out;
+ }
+
+ /*
* Service a batch of tasks from a single owner.
*/
q = &queue->tasks[queue->priority];
- if (!list_empty(q) && --queue->nr) {
+ if (!list_empty(q) && queue->nr) {
+ queue->nr--;
task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
goto out;
}
@@ -656,8 +665,6 @@ struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
{
struct rpc_task *task = NULL;
- dprintk("RPC: wake_up_first(%p \"%s\")\n",
- queue, rpc_qname(queue));
spin_lock(&queue->lock);
task = __rpc_find_next_queued(queue);
if (task != NULL)
@@ -693,6 +700,23 @@ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue)
EXPORT_SYMBOL_GPL(rpc_wake_up_next);
/**
+ * rpc_wake_up_locked - wake up all rpc_tasks
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ *
+ */
+static void rpc_wake_up_locked(struct rpc_wait_queue *queue)
+{
+ struct rpc_task *task;
+
+ for (;;) {
+ task = __rpc_find_next_queued(queue);
+ if (task == NULL)
+ break;
+ rpc_wake_up_task_queue_locked(queue, task);
+ }
+}
+
+/**
* rpc_wake_up - wake up all rpc_tasks
* @queue: rpc_wait_queue on which the tasks are sleeping
*
@@ -700,25 +724,28 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_next);
*/
void rpc_wake_up(struct rpc_wait_queue *queue)
{
- struct list_head *head;
-
spin_lock(&queue->lock);
- head = &queue->tasks[queue->maxpriority];
+ rpc_wake_up_locked(queue);
+ spin_unlock(&queue->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up);
+
+/**
+ * rpc_wake_up_status_locked - wake up all rpc_tasks and set their status value.
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ * @status: status value to set
+ */
+static void rpc_wake_up_status_locked(struct rpc_wait_queue *queue, int status)
+{
+ struct rpc_task *task;
+
for (;;) {
- while (!list_empty(head)) {
- struct rpc_task *task;
- task = list_first_entry(head,
- struct rpc_task,
- u.tk_wait.list);
- rpc_wake_up_task_queue_locked(queue, task);
- }
- if (head == &queue->tasks[0])
+ task = __rpc_find_next_queued(queue);
+ if (task == NULL)
break;
- head--;
+ rpc_wake_up_task_queue_set_status_locked(queue, task, status);
}
- spin_unlock(&queue->lock);
}
-EXPORT_SYMBOL_GPL(rpc_wake_up);
/**
* rpc_wake_up_status - wake up all rpc_tasks and set their status value.
@@ -729,23 +756,8 @@ EXPORT_SYMBOL_GPL(rpc_wake_up);
*/
void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
{
- struct list_head *head;
-
spin_lock(&queue->lock);
- head = &queue->tasks[queue->maxpriority];
- for (;;) {
- while (!list_empty(head)) {
- struct rpc_task *task;
- task = list_first_entry(head,
- struct rpc_task,
- u.tk_wait.list);
- task->tk_status = status;
- rpc_wake_up_task_queue_locked(queue, task);
- }
- if (head == &queue->tasks[0])
- break;
- head--;
- }
+ rpc_wake_up_status_locked(queue, status);
spin_unlock(&queue->lock);
}
EXPORT_SYMBOL_GPL(rpc_wake_up_status);
@@ -763,7 +775,7 @@ static void __rpc_queue_timer_fn(struct work_struct *work)
list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
timeo = task->tk_timeout;
if (time_after_eq(now, timeo)) {
- dprintk("RPC: %5u timeout\n", task->tk_pid);
+ trace_rpc_task_timeout(task, task->tk_action);
task->tk_status = -ETIMEDOUT;
rpc_wake_up_task_queue_locked(queue, task);
continue;
@@ -831,6 +843,7 @@ void rpc_exit_task(struct rpc_task *task)
else if (task->tk_client)
rpc_count_iostats(task, task->tk_client->cl_metrics);
if (task->tk_ops->rpc_call_done != NULL) {
+ trace_rpc_task_call_done(task, task->tk_ops->rpc_call_done);
task->tk_ops->rpc_call_done(task, task->tk_calldata);
if (task->tk_action != NULL) {
/* Always release the RPC slot and buffer memory */
@@ -847,12 +860,25 @@ void rpc_signal_task(struct rpc_task *task)
if (!RPC_IS_ACTIVATED(task))
return;
+ if (!rpc_task_set_rpc_status(task, -ERESTARTSYS))
+ return;
trace_rpc_task_signalled(task, task->tk_action);
set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
smp_mb__after_atomic();
queue = READ_ONCE(task->tk_waitqueue);
if (queue)
- rpc_wake_up_queued_task_set_status(queue, task, -ERESTARTSYS);
+ rpc_wake_up_queued_task(queue, task);
+}
+
+void rpc_task_try_cancel(struct rpc_task *task, int error)
+{
+ struct rpc_wait_queue *queue;
+
+ if (!rpc_task_set_rpc_status(task, error))
+ return;
+ queue = READ_ONCE(task->tk_waitqueue);
+ if (queue)
+ rpc_wake_up_queued_task(queue, task);
}
void rpc_exit(struct rpc_task *task, int status)
@@ -869,6 +895,15 @@ void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata)
ops->rpc_release(calldata);
}
+static bool xprt_needs_memalloc(struct rpc_xprt *xprt, struct rpc_task *tk)
+{
+ if (!xprt)
+ return false;
+ if (!atomic_read(&xprt->swapper))
+ return false;
+ return test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == tk;
+}
+
/*
* This is the RPC `scheduler' (or rather, the finite state machine).
*/
@@ -877,9 +912,7 @@ static void __rpc_execute(struct rpc_task *task)
struct rpc_wait_queue *queue;
int task_is_async = RPC_IS_ASYNC(task);
int status = 0;
-
- dprintk("RPC: %5u __rpc_execute flags=0x%x\n",
- task->tk_pid, task->tk_flags);
+ unsigned long pflags = current->flags;
WARN_ON_ONCE(RPC_IS_QUEUED(task));
if (RPC_IS_QUEUED(task))
@@ -892,31 +925,35 @@ static void __rpc_execute(struct rpc_task *task)
* Perform the next FSM step or a pending callback.
*
* tk_action may be NULL if the task has been killed.
- * In particular, note that rpc_killall_tasks may
- * do this at any time, so beware when dereferencing.
*/
do_action = task->tk_action;
+ /* Tasks with an RPC error status should exit */
+ if (do_action != rpc_exit_task &&
+ (status = READ_ONCE(task->tk_rpc_status)) != 0) {
+ task->tk_status = status;
+ if (do_action != NULL)
+ do_action = rpc_exit_task;
+ }
+ /* Callbacks override all actions */
if (task->tk_callback) {
do_action = task->tk_callback;
task->tk_callback = NULL;
}
if (!do_action)
break;
+ if (RPC_IS_SWAPPER(task) ||
+ xprt_needs_memalloc(task->tk_xprt, task))
+ current->flags |= PF_MEMALLOC;
+
trace_rpc_task_run_action(task, do_action);
do_action(task);
/*
* Lockless check for whether task is sleeping or not.
*/
- if (!RPC_IS_QUEUED(task))
+ if (!RPC_IS_QUEUED(task)) {
+ cond_resched();
continue;
-
- /*
- * Signalled tasks should exit rather than sleep.
- */
- if (RPC_SIGNALLED(task)) {
- task->tk_rpc_status = -ERESTARTSYS;
- rpc_exit(task, -ERESTARTSYS);
}
/*
@@ -934,16 +971,22 @@ static void __rpc_execute(struct rpc_task *task)
spin_unlock(&queue->lock);
continue;
}
+ /* Wake up any task that has an exit status */
+ if (READ_ONCE(task->tk_rpc_status) != 0) {
+ rpc_wake_up_task_queue_locked(queue, task);
+ spin_unlock(&queue->lock);
+ continue;
+ }
rpc_clear_running(task);
spin_unlock(&queue->lock);
if (task_is_async)
- return;
+ goto out;
/* sync task: sleep here */
- dprintk("RPC: %5u sync task going to sleep\n", task->tk_pid);
+ trace_rpc_task_sync_sleep(task, task->tk_action);
status = out_of_line_wait_on_bit(&task->tk_runstate,
RPC_TASK_QUEUED, rpc_wait_bit_killable,
- TASK_KILLABLE);
+ TASK_KILLABLE|TASK_FREEZABLE);
if (status < 0) {
/*
* When a sync task receives a signal, it exits with
@@ -951,18 +994,15 @@ static void __rpc_execute(struct rpc_task *task)
* clean up after sleeping on some queue, we don't
* break the loop here, but go around once more.
*/
- trace_rpc_task_signalled(task, task->tk_action);
- set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
- task->tk_rpc_status = -ERESTARTSYS;
- rpc_exit(task, -ERESTARTSYS);
+ rpc_signal_task(task);
}
- dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
+ trace_rpc_task_sync_wake(task, task->tk_action);
}
- dprintk("RPC: %5u return %d, status %d\n", task->tk_pid, status,
- task->tk_status);
/* Release all resources associated with the task */
rpc_release_task(task);
+out:
+ current_restore_flags(pflags, PF_MEMALLOC);
}
/*
@@ -980,8 +1020,11 @@ void rpc_execute(struct rpc_task *task)
rpc_set_active(task);
rpc_make_runnable(rpciod_workqueue, task);
- if (!is_async)
+ if (!is_async) {
+ unsigned int pflags = memalloc_nofs_save();
__rpc_execute(task);
+ memalloc_nofs_restore(pflags);
+ }
}
static void rpc_async_schedule(struct work_struct *work)
@@ -1014,23 +1057,21 @@ int rpc_malloc(struct rpc_task *task)
struct rpc_rqst *rqst = task->tk_rqstp;
size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
struct rpc_buffer *buf;
- gfp_t gfp = GFP_NOFS;
-
- if (RPC_IS_SWAPPER(task))
- gfp = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
+ gfp_t gfp = rpc_task_gfp_mask();
size += sizeof(struct rpc_buffer);
- if (size <= RPC_BUFFER_MAXSIZE)
- buf = mempool_alloc(rpc_buffer_mempool, gfp);
- else
+ if (size <= RPC_BUFFER_MAXSIZE) {
+ buf = kmem_cache_alloc(rpc_buffer_slabp, gfp);
+ /* Reach for the mempool if dynamic allocation fails */
+ if (!buf && RPC_IS_ASYNC(task))
+ buf = mempool_alloc(rpc_buffer_mempool, GFP_NOWAIT);
+ } else
buf = kmalloc(size, gfp);
if (!buf)
return -ENOMEM;
buf->len = size;
- dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
- task->tk_pid, size, buf);
rqst->rq_buffer = buf->data;
rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
return 0;
@@ -1051,9 +1092,6 @@ void rpc_free(struct rpc_task *task)
buf = container_of(buffer, struct rpc_buffer, data);
size = buf->len;
- dprintk("RPC: freeing buffer of size %zu at %p\n",
- size, buf);
-
if (size <= RPC_BUFFER_MAXSIZE)
mempool_free(buf, rpc_buffer_mempool);
else
@@ -1088,15 +1126,16 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
task->tk_action = rpc_prepare_task;
rpc_init_task_statistics(task);
-
- dprintk("RPC: new task initialized, procpid %u\n",
- task_pid_nr(current));
}
-static struct rpc_task *
-rpc_alloc_task(void)
+static struct rpc_task *rpc_alloc_task(void)
{
- return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
+ struct rpc_task *task;
+
+ task = kmem_cache_alloc(rpc_task_slabp, rpc_task_gfp_mask());
+ if (task)
+ return task;
+ return mempool_alloc(rpc_task_mempool, GFP_NOWAIT);
}
/*
@@ -1109,12 +1148,16 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data)
if (task == NULL) {
task = rpc_alloc_task();
+ if (task == NULL) {
+ rpc_release_calldata(setup_data->callback_ops,
+ setup_data->callback_data);
+ return ERR_PTR(-ENOMEM);
+ }
flags = RPC_TASK_DYNAMIC;
}
rpc_init_task(task, setup_data);
task->tk_flags |= flags;
- dprintk("RPC: allocated task %p\n", task);
return task;
}
@@ -1144,10 +1187,8 @@ static void rpc_free_task(struct rpc_task *task)
put_rpccred(task->tk_op_cred);
rpc_release_calldata(task->tk_ops, task->tk_calldata);
- if (tk_flags & RPC_TASK_DYNAMIC) {
- dprintk("RPC: %5u freeing task\n", task->tk_pid);
+ if (tk_flags & RPC_TASK_DYNAMIC)
mempool_free(task, rpc_task_mempool);
- }
}
static void rpc_async_release(struct work_struct *work)
@@ -1162,7 +1203,8 @@ static void rpc_release_resources_task(struct rpc_task *task)
{
xprt_release(task);
if (task->tk_msg.rpc_cred) {
- put_cred(task->tk_msg.rpc_cred);
+ if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+ put_cred(task->tk_msg.rpc_cred);
task->tk_msg.rpc_cred = NULL;
}
rpc_task_release_client(task);
@@ -1200,8 +1242,6 @@ EXPORT_SYMBOL_GPL(rpc_put_task_async);
static void rpc_release_task(struct rpc_task *task)
{
- dprintk("RPC: %5u release task\n", task->tk_pid);
-
WARN_ON_ONCE(RPC_IS_QUEUED(task));
rpc_release_resources_task(task);
@@ -1242,13 +1282,11 @@ static int rpciod_start(void)
/*
* Create the rpciod thread and wait for it to start.
*/
- dprintk("RPC: creating workqueue rpciod\n");
wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
if (!wq)
goto out_failed;
rpciod_workqueue = wq;
- /* Note: highpri because network receive is latency sensitive */
- wq = alloc_workqueue("xprtiod", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_HIGHPRI, 0);
+ wq = alloc_workqueue("xprtiod", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
if (!wq)
goto free_rpciod;
xprtiod_workqueue = wq;
@@ -1267,7 +1305,6 @@ static void rpciod_stop(void)
if (rpciod_workqueue == NULL)
return;
- dprintk("RPC: destroying workqueue rpciod\n");
wq = rpciod_workqueue;
rpciod_workqueue = NULL;
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 1a864f1ed119..71ba4cf513bc 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -14,9 +14,25 @@
#include <linux/types.h>
#include <linux/pagemap.h>
#include <linux/udp.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/xdr.h>
#include <linux/export.h>
+#include "socklib.h"
+
+/*
+ * Helper structure for copying from an sk_buff.
+ */
+struct xdr_skb_reader {
+ struct sk_buff *skb;
+ unsigned int offset;
+ size_t count;
+ __wsum csum;
+};
+
+typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to,
+ size_t len);
/**
* xdr_skb_read_bits - copy some data bits from skb to internal buffer
@@ -55,7 +71,7 @@ static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to,
if (len > desc->count)
len = desc->count;
pos = desc->offset;
- csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0);
+ csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len);
desc->csum = csum_block_add(desc->csum, csum2, pos);
desc->count -= len;
desc->offset += len;
@@ -186,3 +202,123 @@ no_checksum:
return 0;
}
EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr);
+
+static inline int xprt_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t seek)
+{
+ if (seek)
+ iov_iter_advance(&msg->msg_iter, seek);
+ return sock_sendmsg(sock, msg);
+}
+
+static int xprt_send_kvec(struct socket *sock, struct msghdr *msg,
+ struct kvec *vec, size_t seek)
+{
+ iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len);
+ return xprt_sendmsg(sock, msg, seek);
+}
+
+static int xprt_send_pagedata(struct socket *sock, struct msghdr *msg,
+ struct xdr_buf *xdr, size_t base)
+{
+ iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec, xdr_buf_pagecount(xdr),
+ xdr->page_len + xdr->page_base);
+ return xprt_sendmsg(sock, msg, base + xdr->page_base);
+}
+
+/* Common case:
+ * - stream transport
+ * - sending from byte 0 of the message
+ * - the message is wholly contained in @xdr's head iovec
+ */
+static int xprt_send_rm_and_kvec(struct socket *sock, struct msghdr *msg,
+ rpc_fraghdr marker, struct kvec *vec,
+ size_t base)
+{
+ struct kvec iov[2] = {
+ [0] = {
+ .iov_base = &marker,
+ .iov_len = sizeof(marker)
+ },
+ [1] = *vec,
+ };
+ size_t len = iov[0].iov_len + iov[1].iov_len;
+
+ iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, len);
+ return xprt_sendmsg(sock, msg, base);
+}
+
+/**
+ * xprt_sock_sendmsg - write an xdr_buf directly to a socket
+ * @sock: open socket to send on
+ * @msg: socket message metadata
+ * @xdr: xdr_buf containing this request
+ * @base: starting position in the buffer
+ * @marker: stream record marker field
+ * @sent_p: return the total number of bytes successfully queued for sending
+ *
+ * Return values:
+ * On success, returns zero and fills in @sent_p.
+ * %-ENOTSOCK if @sock is not a struct socket.
+ */
+int xprt_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ struct xdr_buf *xdr, unsigned int base,
+ rpc_fraghdr marker, unsigned int *sent_p)
+{
+ unsigned int rmsize = marker ? sizeof(marker) : 0;
+ unsigned int remainder = rmsize + xdr->len - base;
+ unsigned int want;
+ int err = 0;
+
+ *sent_p = 0;
+
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ msg->msg_flags |= MSG_MORE;
+ want = xdr->head[0].iov_len + rmsize;
+ if (base < want) {
+ unsigned int len = want - base;
+
+ remainder -= len;
+ if (remainder == 0)
+ msg->msg_flags &= ~MSG_MORE;
+ if (rmsize)
+ err = xprt_send_rm_and_kvec(sock, msg, marker,
+ &xdr->head[0], base);
+ else
+ err = xprt_send_kvec(sock, msg, &xdr->head[0], base);
+ if (remainder == 0 || err != len)
+ goto out;
+ *sent_p += err;
+ base = 0;
+ } else {
+ base -= want;
+ }
+
+ if (base < xdr->page_len) {
+ unsigned int len = xdr->page_len - base;
+
+ remainder -= len;
+ if (remainder == 0)
+ msg->msg_flags &= ~MSG_MORE;
+ err = xprt_send_pagedata(sock, msg, xdr, base);
+ if (remainder == 0 || err != len)
+ goto out;
+ *sent_p += err;
+ base = 0;
+ } else {
+ base -= xdr->page_len;
+ }
+
+ if (base >= xdr->tail[0].iov_len)
+ return 0;
+ msg->msg_flags &= ~MSG_MORE;
+ err = xprt_send_kvec(sock, msg, &xdr->tail[0], base);
+out:
+ if (err > 0) {
+ *sent_p += err;
+ err = 0;
+ }
+ return err;
+}
diff --git a/net/sunrpc/socklib.h b/net/sunrpc/socklib.h
new file mode 100644
index 000000000000..c48114ad6f00
--- /dev/null
+++ b/net/sunrpc/socklib.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ * Copyright (C) 2020, Oracle.
+ */
+
+#ifndef _NET_SUNRPC_SOCKLIB_H_
+#define _NET_SUNRPC_SOCKLIB_H_
+
+int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb);
+int xprt_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ struct xdr_buf *xdr, unsigned int base,
+ rpc_fraghdr marker, unsigned int *sent_p);
+
+#endif /* _NET_SUNRPC_SOCKLIB_H_ */
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index c964b48eaaba..52908f9e6eab 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -66,7 +66,7 @@ static int rpc_proc_show(struct seq_file *seq, void *v) {
static int rpc_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, rpc_proc_show, PDE_DATA(inode));
+ return single_open(file, rpc_proc_show, pde_data(inode));
}
static const struct proc_ops rpc_proc_ops = {
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index c9bacb3c930f..d4a362c9e4b3 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/******************************************************************************
(c) 2008 NetApp. All Rights Reserved.
-NetApp provides this source code under the GPL v2 License.
-The GPL v2 license is available at
-http://opensource.org/licenses/gpl-license.php.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
@@ -50,10 +36,7 @@ static inline int sock_is_loopback(struct sock *sk)
return loopback;
}
-int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
- struct page *headpage, unsigned long headoffset,
- struct page *tailpage, unsigned long tailoffset);
-
int rpc_clients_notifier_register(void);
void rpc_clients_notifier_unregister(void);
+void auth_domain_cleanup(void);
#endif /* _NET_SUNRPC_SUNRPC_H */
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index f9edaa9174a4..691c0000e9ea 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -23,6 +23,8 @@
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/sunrpc/xprtsock.h>
+#include "sunrpc.h"
+#include "sysfs.h"
#include "netns.h"
unsigned int sunrpc_net_id;
@@ -102,6 +104,10 @@ init_sunrpc(void)
if (err)
goto out4;
+ err = rpc_sysfs_init();
+ if (err)
+ goto out5;
+
sunrpc_debugfs_init();
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
rpc_register_sysctl();
@@ -110,6 +116,8 @@ init_sunrpc(void)
init_socket_xprt(); /* clnt sock transport */
return 0;
+out5:
+ unregister_rpc_pipefs();
out4:
unregister_pernet_subsys(&sunrpc_net_ops);
out3:
@@ -123,7 +131,10 @@ out:
static void __exit
cleanup_sunrpc(void)
{
+ rpc_sysfs_exit();
rpc_cleanup_clids();
+ xprt_cleanup_ids();
+ xprt_multipath_cleanup_ids();
rpcauth_remove_module();
cleanup_socket_xprt();
svc_cleanup_xprt_sock();
@@ -131,6 +142,7 @@ cleanup_sunrpc(void)
unregister_rpc_pipefs();
rpc_destroy_mempool();
unregister_pernet_subsys(&sunrpc_net_ops);
+ auth_domain_cleanup();
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
rpc_unregister_sysctl();
#endif
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 187dd4e73d64..149171774bc6 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -31,22 +31,43 @@
#include <trace/events/sunrpc.h>
+#include "fail.h"
+
#define RPCDBG_FACILITY RPCDBG_SVCDSP
static void svc_unregister(const struct svc_serv *serv, struct net *net);
-#define svc_serv_is_pooled(serv) ((serv)->sv_ops->svo_function)
-
#define SVC_POOL_DEFAULT SVC_POOL_GLOBAL
/*
+ * Mode for mapping cpus to pools.
+ */
+enum {
+ SVC_POOL_AUTO = -1, /* choose one of the others */
+ SVC_POOL_GLOBAL, /* no mapping, just a single global pool
+ * (legacy & UP mode) */
+ SVC_POOL_PERCPU, /* one pool per cpu */
+ SVC_POOL_PERNODE /* one pool per numa node */
+};
+
+/*
* Structure for mapping cpus to pools and vice versa.
* Setup once during sunrpc initialisation.
*/
-struct svc_pool_map svc_pool_map = {
+
+struct svc_pool_map {
+ int count; /* How many svc_servs use us */
+ int mode; /* Note: int not enum to avoid
+ * warnings about "enumeration value
+ * not handled in switch" */
+ unsigned int npools;
+ unsigned int *pool_to; /* maps pool id to cpu or node */
+ unsigned int *to_pool; /* maps cpu or node to pool id */
+};
+
+static struct svc_pool_map svc_pool_map = {
.mode = SVC_POOL_DEFAULT
};
-EXPORT_SYMBOL_GPL(svc_pool_map);
static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
@@ -88,15 +109,15 @@ param_get_pool_mode(char *buf, const struct kernel_param *kp)
switch (*ip)
{
case SVC_POOL_AUTO:
- return strlcpy(buf, "auto", 20);
+ return strlcpy(buf, "auto\n", 20);
case SVC_POOL_GLOBAL:
- return strlcpy(buf, "global", 20);
+ return strlcpy(buf, "global\n", 20);
case SVC_POOL_PERCPU:
- return strlcpy(buf, "percpu", 20);
+ return strlcpy(buf, "percpu\n", 20);
case SVC_POOL_PERNODE:
- return strlcpy(buf, "pernode", 20);
+ return strlcpy(buf, "pernode\n", 20);
default:
- return sprintf(buf, "%d", *ip);
+ return sprintf(buf, "%d\n", *ip);
}
}
@@ -217,10 +238,12 @@ svc_pool_map_init_pernode(struct svc_pool_map *m)
/*
* Add a reference to the global map of cpus to pools (and
- * vice versa). Initialise the map if we're the first user.
- * Returns the number of pools.
+ * vice versa) if pools are in use.
+ * Initialise the map if we're the first user.
+ * Returns the number of pools. If this is '1', no reference
+ * was taken.
*/
-unsigned int
+static unsigned int
svc_pool_map_get(void)
{
struct svc_pool_map *m = &svc_pool_map;
@@ -230,6 +253,7 @@ svc_pool_map_get(void)
if (m->count++) {
mutex_unlock(&svc_pool_map_mutex);
+ WARN_ON_ONCE(m->npools <= 1);
return m->npools;
}
@@ -245,30 +269,36 @@ svc_pool_map_get(void)
break;
}
- if (npools < 0) {
+ if (npools <= 0) {
/* default, or memory allocation failure */
npools = 1;
m->mode = SVC_POOL_GLOBAL;
}
m->npools = npools;
+ if (npools == 1)
+ /* service is unpooled, so doesn't hold a reference */
+ m->count--;
+
mutex_unlock(&svc_pool_map_mutex);
- return m->npools;
+ return npools;
}
-EXPORT_SYMBOL_GPL(svc_pool_map_get);
/*
- * Drop a reference to the global map of cpus to pools.
+ * Drop a reference to the global map of cpus to pools, if
+ * pools were in use, i.e. if npools > 1.
* When the last reference is dropped, the map data is
* freed; this allows the sysadmin to change the pool
* mode using the pool_mode module option without
* rebooting or re-loading sunrpc.ko.
*/
-void
-svc_pool_map_put(void)
+static void
+svc_pool_map_put(int npools)
{
struct svc_pool_map *m = &svc_pool_map;
+ if (npools <= 1)
+ return;
mutex_lock(&svc_pool_map_mutex);
if (!--m->count) {
@@ -281,7 +311,6 @@ svc_pool_map_put(void)
mutex_unlock(&svc_pool_map_mutex);
}
-EXPORT_SYMBOL_GPL(svc_pool_map_put);
static int svc_pool_map_get_node(unsigned int pidx)
{
@@ -327,32 +356,35 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
}
}
-/*
- * Use the mapping mode to choose a pool for a given CPU.
- * Used when enqueueing an incoming RPC. Always returns
- * a non-NULL pool pointer.
+/**
+ * svc_pool_for_cpu - Select pool to run a thread on this cpu
+ * @serv: An RPC service
+ *
+ * Use the active CPU and the svc_pool_map's mode setting to
+ * select the svc thread pool to use. Once initialized, the
+ * svc_pool_map does not change.
+ *
+ * Return value:
+ * A pointer to an svc_pool
*/
-struct svc_pool *
-svc_pool_for_cpu(struct svc_serv *serv, int cpu)
+struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv)
{
struct svc_pool_map *m = &svc_pool_map;
+ int cpu = raw_smp_processor_id();
unsigned int pidx = 0;
- /*
- * An uninitialised map happens in a pure client when
- * lockd is brought up, so silently treat it the
- * same as SVC_POOL_GLOBAL.
- */
- if (svc_serv_is_pooled(serv)) {
- switch (m->mode) {
- case SVC_POOL_PERCPU:
- pidx = m->to_pool[cpu];
- break;
- case SVC_POOL_PERNODE:
- pidx = m->to_pool[cpu_to_node(cpu)];
- break;
- }
+ if (serv->sv_nrpools <= 1)
+ return serv->sv_pools;
+
+ switch (m->mode) {
+ case SVC_POOL_PERCPU:
+ pidx = m->to_pool[cpu];
+ break;
+ case SVC_POOL_PERNODE:
+ pidx = m->to_pool[cpu_to_node(cpu)];
+ break;
}
+
return &serv->sv_pools[pidx % serv->sv_nrpools];
}
@@ -422,7 +454,7 @@ __svc_init_bc(struct svc_serv *serv)
*/
static struct svc_serv *
__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
- const struct svc_serv_ops *ops)
+ int (*threadfn)(void *data))
{
struct svc_serv *serv;
unsigned int vers;
@@ -433,13 +465,13 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
return NULL;
serv->sv_name = prog->pg_name;
serv->sv_program = prog;
- serv->sv_nrthreads = 1;
+ kref_init(&serv->sv_refcnt);
serv->sv_stats = prog->pg_stats;
if (bufsize > RPCSVC_MAXPAYLOAD)
bufsize = RPCSVC_MAXPAYLOAD;
serv->sv_max_payload = bufsize? bufsize : 4096;
serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE);
- serv->sv_ops = ops;
+ serv->sv_threadfn = threadfn;
xdrsize = 0;
while (prog) {
prog->pg_lovers = prog->pg_nvers-1;
@@ -485,59 +517,56 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
return serv;
}
-struct svc_serv *
-svc_create(struct svc_program *prog, unsigned int bufsize,
- const struct svc_serv_ops *ops)
+/**
+ * svc_create - Create an RPC service
+ * @prog: the RPC program the new service will handle
+ * @bufsize: maximum message size for @prog
+ * @threadfn: a function to service RPC requests for @prog
+ *
+ * Returns an instantiated struct svc_serv object or NULL.
+ */
+struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize,
+ int (*threadfn)(void *data))
{
- return __svc_create(prog, bufsize, /*npools*/1, ops);
+ return __svc_create(prog, bufsize, 1, threadfn);
}
EXPORT_SYMBOL_GPL(svc_create);
-struct svc_serv *
-svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
- const struct svc_serv_ops *ops)
+/**
+ * svc_create_pooled - Create an RPC service with pooled threads
+ * @prog: the RPC program the new service will handle
+ * @bufsize: maximum message size for @prog
+ * @threadfn: a function to service RPC requests for @prog
+ *
+ * Returns an instantiated struct svc_serv object or NULL.
+ */
+struct svc_serv *svc_create_pooled(struct svc_program *prog,
+ unsigned int bufsize,
+ int (*threadfn)(void *data))
{
struct svc_serv *serv;
unsigned int npools = svc_pool_map_get();
- serv = __svc_create(prog, bufsize, npools, ops);
+ serv = __svc_create(prog, bufsize, npools, threadfn);
if (!serv)
goto out_err;
return serv;
out_err:
- svc_pool_map_put();
+ svc_pool_map_put(npools);
return NULL;
}
EXPORT_SYMBOL_GPL(svc_create_pooled);
-void svc_shutdown_net(struct svc_serv *serv, struct net *net)
-{
- svc_close_net(serv, net);
-
- if (serv->sv_ops->svo_shutdown)
- serv->sv_ops->svo_shutdown(serv, net);
-}
-EXPORT_SYMBOL_GPL(svc_shutdown_net);
-
/*
* Destroy an RPC service. Should be called with appropriate locking to
- * protect the sv_nrthreads, sv_permsocks and sv_tempsocks.
+ * protect sv_permsocks and sv_tempsocks.
*/
void
-svc_destroy(struct svc_serv *serv)
+svc_destroy(struct kref *ref)
{
- dprintk("svc: svc_destroy(%s, %d)\n",
- serv->sv_program->pg_name,
- serv->sv_nrthreads);
-
- if (serv->sv_nrthreads) {
- if (--(serv->sv_nrthreads) != 0) {
- svc_sock_update_bufs(serv);
- return;
- }
- } else
- printk("svc_destroy: no threads for serv=%p!\n", serv);
+ struct svc_serv *serv = container_of(ref, struct svc_serv, sv_refcnt);
+ dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name);
del_timer_sync(&serv->sv_temptimer);
/*
@@ -549,8 +578,7 @@ svc_destroy(struct svc_serv *serv)
cache_clean_deferred(serv);
- if (svc_serv_is_pooled(serv))
- svc_pool_map_put();
+ svc_pool_map_put(serv->sv_nrpools);
kfree(serv->sv_pools);
kfree(serv);
@@ -559,7 +587,7 @@ EXPORT_SYMBOL_GPL(svc_destroy);
/*
* Allocate an RPC server's buffer space.
- * We allocate pages and place them in rq_argpages.
+ * We allocate pages and place them in rq_pages.
*/
static int
svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node)
@@ -614,6 +642,10 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
rqstp->rq_server = serv;
rqstp->rq_pool = pool;
+ rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0);
+ if (!rqstp->rq_scratch_page)
+ goto out_enomem;
+
rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
if (!rqstp->rq_argp)
goto out_enomem;
@@ -632,7 +664,7 @@ out_enomem:
}
EXPORT_SYMBOL_GPL(svc_rqst_alloc);
-struct svc_rqst *
+static struct svc_rqst *
svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
{
struct svc_rqst *rqstp;
@@ -641,14 +673,17 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
if (!rqstp)
return ERR_PTR(-ENOMEM);
- serv->sv_nrthreads++;
+ svc_get(serv);
+ spin_lock_bh(&serv->sv_lock);
+ serv->sv_nrthreads += 1;
+ spin_unlock_bh(&serv->sv_lock);
+
spin_lock_bh(&pool->sp_lock);
pool->sp_nrthreads++;
list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
spin_unlock_bh(&pool->sp_lock);
return rqstp;
}
-EXPORT_SYMBOL_GPL(svc_prepare_thread);
/*
* Choose a pool in which to create a new thread, for svc_set_num_threads
@@ -722,11 +757,9 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
if (IS_ERR(rqstp))
return PTR_ERR(rqstp);
- __module_get(serv->sv_ops->svo_module);
- task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
+ task = kthread_create_on_node(serv->sv_threadfn, rqstp,
node, "%s", serv->sv_name);
if (IS_ERR(task)) {
- module_put(serv->sv_ops->svo_module);
svc_exit_thread(rqstp);
return PTR_ERR(task);
}
@@ -742,59 +775,13 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
return 0;
}
-
-/* destroy old threads */
-static int
-svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
-{
- struct task_struct *task;
- unsigned int state = serv->sv_nrthreads-1;
-
- /* destroy old threads */
- do {
- task = choose_victim(serv, pool, &state);
- if (task == NULL)
- break;
- send_sig(SIGINT, task, 1);
- nrservs++;
- } while (nrservs < 0);
-
- return 0;
-}
-
/*
* Create or destroy enough new threads to make the number
* of threads the given number. If `pool' is non-NULL, applies
* only to threads in that pool, otherwise round-robins between
* all pools. Caller must ensure that mutual exclusion between this and
* server startup or shutdown.
- *
- * Destroying threads relies on the service threads filling in
- * rqstp->rq_task, which only the nfs ones do. Assumes the serv
- * has been created using svc_create_pooled().
- *
- * Based on code that used to be in nfsd_svc() but tweaked
- * to be pool-aware.
*/
-int
-svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
-{
- if (pool == NULL) {
- /* The -1 assumes caller has done a svc_get() */
- nrservs -= (serv->sv_nrthreads-1);
- } else {
- spin_lock_bh(&pool->sp_lock);
- nrservs -= pool->sp_nrthreads;
- spin_unlock_bh(&pool->sp_lock);
- }
-
- if (nrservs > 0)
- return svc_start_kthreads(serv, pool, nrservs);
- if (nrservs < 0)
- return svc_signal_kthreads(serv, pool, nrservs);
- return 0;
-}
-EXPORT_SYMBOL_GPL(svc_set_num_threads);
/* destroy old threads */
static int
@@ -815,11 +802,10 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
}
int
-svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
if (pool == NULL) {
- /* The -1 assumes caller has done a svc_get() */
- nrservs -= (serv->sv_nrthreads-1);
+ nrservs -= serv->sv_nrthreads;
} else {
spin_lock_bh(&pool->sp_lock);
nrservs -= pool->sp_nrthreads;
@@ -832,7 +818,28 @@ svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrser
return svc_stop_kthreads(serv, pool, nrservs);
return 0;
}
-EXPORT_SYMBOL_GPL(svc_set_num_threads_sync);
+EXPORT_SYMBOL_GPL(svc_set_num_threads);
+
+/**
+ * svc_rqst_replace_page - Replace one page in rq_pages[]
+ * @rqstp: svc_rqst with pages to replace
+ * @page: replacement page
+ *
+ * When replacing a page in rq_pages, batch the release of the
+ * replaced pages to avoid hammering the page allocator.
+ */
+void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page)
+{
+ if (*rqstp->rq_next_page) {
+ if (!pagevec_space(&rqstp->rq_pvec))
+ __pagevec_release(&rqstp->rq_pvec);
+ pagevec_add(&rqstp->rq_pvec, *rqstp->rq_next_page);
+ }
+
+ get_page(page);
+ *(rqstp->rq_next_page++) = page;
+}
+EXPORT_SYMBOL_GPL(svc_rqst_replace_page);
/*
* Called from a server thread as it's exiting. Caller must hold the "service
@@ -842,6 +849,8 @@ void
svc_rqst_free(struct svc_rqst *rqstp)
{
svc_release_buffer(rqstp);
+ if (rqstp->rq_scratch_page)
+ put_page(rqstp->rq_scratch_page);
kfree(rqstp->rq_resp);
kfree(rqstp->rq_argp);
kfree(rqstp->rq_auth_data);
@@ -861,11 +870,14 @@ svc_exit_thread(struct svc_rqst *rqstp)
list_del_rcu(&rqstp->rq_all);
spin_unlock_bh(&pool->sp_lock);
+ spin_lock_bh(&serv->sv_lock);
+ serv->sv_nrthreads -= 1;
+ spin_unlock_bh(&serv->sv_lock);
+ svc_sock_update_bufs(serv);
+
svc_rqst_free(rqstp);
- /* Release the server */
- if (serv)
- svc_destroy(serv);
+ svc_put(serv);
}
EXPORT_SYMBOL_GPL(svc_exit_thread);
@@ -991,6 +1003,7 @@ static int __svc_register(struct net *net, const char *progname,
#endif
}
+ trace_svc_register(progname, version, protocol, port, family, error);
return error;
}
@@ -1000,11 +1013,6 @@ int svc_rpcbind_set_version(struct net *net,
unsigned short proto,
unsigned short port)
{
- dprintk("svc: svc_register(%sv%d, %s, %u, %u)\n",
- progp->pg_name, version,
- proto == IPPROTO_UDP? "udp" : "tcp",
- port, family);
-
return __svc_register(net, progp->pg_name, progp->pg_prog,
version, family, proto, port);
@@ -1024,11 +1032,8 @@ int svc_generic_rpcbind_set(struct net *net,
return 0;
if (vers->vs_hidden) {
- dprintk("svc: svc_register(%sv%d, %s, %u, %u)"
- " (but not telling portmap)\n",
- progp->pg_name, version,
- proto == IPPROTO_UDP? "udp" : "tcp",
- port, family);
+ trace_svc_noregister(progp->pg_name, version, proto,
+ port, family, 0);
return 0;
}
@@ -1106,8 +1111,7 @@ static void __svc_unregister(struct net *net, const u32 program, const u32 versi
if (error == -EPROTONOSUPPORT)
error = rpcb_register(net, program, version, 0, 0);
- dprintk("svc: %s(%sv%u), error %d\n",
- __func__, progname, version, error);
+ trace_svc_unregister(progname, version, error);
}
/*
@@ -1132,9 +1136,6 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net)
continue;
if (progp->pg_vers[i]->vs_hidden)
continue;
-
- dprintk("svc: attempting to unregister %sv%u\n",
- progp->pg_name, i);
__svc_unregister(net, progp->pg_prog, i, progp->pg_name);
}
}
@@ -1169,61 +1170,6 @@ static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ..
#endif
__be32
-svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err)
-{
- set_bit(RQ_AUTHERR, &rqstp->rq_flags);
- return auth_err;
-}
-EXPORT_SYMBOL_GPL(svc_return_autherr);
-
-static __be32
-svc_get_autherr(struct svc_rqst *rqstp, __be32 *statp)
-{
- if (test_and_clear_bit(RQ_AUTHERR, &rqstp->rq_flags))
- return *statp;
- return rpc_auth_ok;
-}
-
-static int
-svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp)
-{
- struct kvec *argv = &rqstp->rq_arg.head[0];
- struct kvec *resv = &rqstp->rq_res.head[0];
- const struct svc_procedure *procp = rqstp->rq_procinfo;
-
- /*
- * Decode arguments
- * XXX: why do we ignore the return value?
- */
- if (procp->pc_decode &&
- !procp->pc_decode(rqstp, argv->iov_base)) {
- *statp = rpc_garbage_args;
- return 1;
- }
-
- *statp = procp->pc_func(rqstp);
-
- if (*statp == rpc_drop_reply ||
- test_bit(RQ_DROPME, &rqstp->rq_flags))
- return 0;
-
- if (test_bit(RQ_AUTHERR, &rqstp->rq_flags))
- return 1;
-
- if (*statp != rpc_success)
- return 1;
-
- /* Encode reply */
- if (procp->pc_encode &&
- !procp->pc_encode(rqstp, resv->iov_base + resv->iov_len)) {
- dprintk("svc: failed to encode reply\n");
- /* serv->sv_stats->rpcsystemerr++; */
- *statp = rpc_system_err;
- }
- return 1;
-}
-
-__be32
svc_generic_init_request(struct svc_rqst *rqstp,
const struct svc_program *progp,
struct svc_process_info *ret)
@@ -1259,7 +1205,7 @@ svc_generic_init_request(struct svc_rqst *rqstp,
goto err_bad_proc;
/* Initialize storage for argp and resp */
- memset(rqstp->rq_argp, 0, procp->pc_argsize);
+ memset(rqstp->rq_argp, 0, procp->pc_argzero);
memset(rqstp->rq_resp, 0, procp->pc_ressize);
/* Bump per-procedure stats counter */
@@ -1288,8 +1234,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
struct svc_process_info process;
__be32 *statp;
u32 prog, vers;
- __be32 auth_stat, rpc_stat;
- int auth_res;
+ __be32 rpc_stat;
+ int auth_res, rc;
__be32 *reply_statp;
rpc_stat = rpc_success;
@@ -1298,10 +1244,10 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
goto err_short_len;
/* Will be turned off by GSS integrity and privacy services */
- set_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+ __set_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* Will be turned off only when NFSv4 Sessions are used */
- set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
- clear_bit(RQ_DROPME, &rqstp->rq_flags);
+ __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ __clear_bit(RQ_DROPME, &rqstp->rq_flags);
svc_putu32(resv, rqstp->rq_xid);
@@ -1331,14 +1277,12 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
* We do this before anything else in order to get a decent
* auth verifier.
*/
- auth_res = svc_authenticate(rqstp, &auth_stat);
+ auth_res = svc_authenticate(rqstp);
/* Also give the program a chance to reject this call: */
- if (auth_res == SVC_OK && progp) {
- auth_stat = rpc_autherr_badcred;
+ if (auth_res == SVC_OK && progp)
auth_res = progp->pg_authenticate(rqstp);
- }
if (auth_res != SVC_OK)
- trace_svc_authenticate(rqstp, auth_res, auth_stat);
+ trace_svc_authenticate(rqstp, auth_res);
switch (auth_res) {
case SVC_OK:
break;
@@ -1392,54 +1336,43 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
svc_reserve_auth(rqstp, procp->pc_xdrressize<<2);
/* Call the function that processes the request. */
- if (!process.dispatch) {
- if (!svc_generic_dispatch(rqstp, statp))
- goto release_dropit;
- if (*statp == rpc_garbage_args)
- goto err_garbage;
- auth_stat = svc_get_autherr(rqstp, statp);
- if (auth_stat != rpc_auth_ok)
- goto err_release_bad_auth;
- } else {
- dprintk("svc: calling dispatcher\n");
- if (!process.dispatch(rqstp, statp))
- goto release_dropit; /* Release reply info */
- }
+ rc = process.dispatch(rqstp, statp);
+ if (procp->pc_release)
+ procp->pc_release(rqstp);
+ if (!rc)
+ goto dropit;
+ if (rqstp->rq_auth_stat != rpc_auth_ok)
+ goto err_bad_auth;
/* Check RPC status result */
if (*statp != rpc_success)
resv->iov_len = ((void*)statp) - resv->iov_base + 4;
- /* Release reply info */
- if (procp->pc_release)
- procp->pc_release(rqstp);
-
if (procp->pc_encode == NULL)
goto dropit;
sendit:
if (svc_authorise(rqstp))
- goto close;
+ goto close_xprt;
return 1; /* Caller can now send it */
-release_dropit:
- if (procp->pc_release)
- procp->pc_release(rqstp);
dropit:
svc_authorise(rqstp); /* doesn't hurt to call this twice */
dprintk("svc: svc_process dropit\n");
return 0;
close:
+ svc_authorise(rqstp);
+close_xprt:
if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
- svc_close_xprt(rqstp->rq_xprt);
+ svc_xprt_close(rqstp->rq_xprt);
dprintk("svc: svc_process close\n");
return 0;
err_short_len:
svc_printk(rqstp, "short len %zd, dropping request\n",
argv->iov_len);
- goto close;
+ goto close_xprt;
err_bad_rpc:
serv->sv_stats->rpcbadfmt++;
@@ -1449,17 +1382,15 @@ err_bad_rpc:
svc_putnl(resv, 2);
goto sendit;
-err_release_bad_auth:
- if (procp->pc_release)
- procp->pc_release(rqstp);
err_bad_auth:
- dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat));
+ dprintk("svc: authentication failed (%d)\n",
+ be32_to_cpu(rqstp->rq_auth_stat));
serv->sv_stats->rpcbadauth++;
/* Restore write pointer to location of accept status: */
xdr_ressize_check(rqstp, reply_statp);
svc_putnl(resv, 1); /* REJECT */
svc_putnl(resv, 1); /* AUTH_ERROR */
- svc_putnl(resv, ntohl(auth_stat)); /* status */
+ svc_putu32(resv, rqstp->rq_auth_stat); /* status */
goto sendit;
err_bad_prog:
@@ -1503,8 +1434,13 @@ svc_process(struct svc_rqst *rqstp)
{
struct kvec *argv = &rqstp->rq_arg.head[0];
struct kvec *resv = &rqstp->rq_res.head[0];
- struct svc_serv *serv = rqstp->rq_server;
- u32 dir;
+ __be32 dir;
+
+#if IS_ENABLED(CONFIG_FAIL_SUNRPC)
+ if (!fail_sunrpc.ignore_server_disconnect &&
+ should_fail(&fail_sunrpc.attr, 1))
+ svc_xprt_deferred_close(rqstp->rq_xprt);
+#endif
/*
* Setup response xdr_buf.
@@ -1513,7 +1449,7 @@ svc_process(struct svc_rqst *rqstp)
rqstp->rq_next_page = &rqstp->rq_respages[1];
resv->iov_base = page_address(rqstp->rq_respages[0]);
resv->iov_len = 0;
- rqstp->rq_res.pages = rqstp->rq_respages + 1;
+ rqstp->rq_res.pages = rqstp->rq_next_page;
rqstp->rq_res.len = 0;
rqstp->rq_res.page_base = 0;
rqstp->rq_res.page_len = 0;
@@ -1521,22 +1457,17 @@ svc_process(struct svc_rqst *rqstp)
rqstp->rq_res.tail[0].iov_base = NULL;
rqstp->rq_res.tail[0].iov_len = 0;
- dir = svc_getnl(argv);
- if (dir != 0) {
- /* direction != CALL */
- svc_printk(rqstp, "bad direction %d, dropping request\n", dir);
- serv->sv_stats->rpcbadfmt++;
+ dir = svc_getu32(argv);
+ if (dir != rpc_call)
+ goto out_baddir;
+ if (!svc_process_common(rqstp, argv, resv))
goto out_drop;
- }
-
- /* Reserve space for the record marker */
- if (rqstp->rq_prot == IPPROTO_TCP)
- svc_putnl(resv, 0);
-
- /* Returns 1 for send, 0 for drop */
- if (likely(svc_process_common(rqstp, argv, resv)))
- return svc_send(rqstp);
+ return svc_send(rqstp);
+out_baddir:
+ svc_printk(rqstp, "bad direction 0x%08x, dropping request\n",
+ be32_to_cpu(dir));
+ rqstp->rq_server->sv_stats->rpcbadfmt++;
out_drop:
svc_drop(rqstp);
return 0;
@@ -1623,8 +1554,12 @@ out:
EXPORT_SYMBOL_GPL(bc_svc_process);
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-/*
- * Return (transport-specific) limit on the rpc payload.
+/**
+ * svc_max_payload - Return transport-specific limit on the RPC payload
+ * @rqstp: RPC transaction context
+ *
+ * Returns the maximum number of payload bytes the current transport
+ * allows.
*/
u32 svc_max_payload(const struct svc_rqst *rqstp)
{
@@ -1637,18 +1572,51 @@ u32 svc_max_payload(const struct svc_rqst *rqstp)
EXPORT_SYMBOL_GPL(svc_max_payload);
/**
+ * svc_proc_name - Return RPC procedure name in string form
+ * @rqstp: svc_rqst to operate on
+ *
+ * Return value:
+ * Pointer to a NUL-terminated string
+ */
+const char *svc_proc_name(const struct svc_rqst *rqstp)
+{
+ if (rqstp && rqstp->rq_procinfo)
+ return rqstp->rq_procinfo->pc_name;
+ return "unknown";
+}
+
+
+/**
+ * svc_encode_result_payload - mark a range of bytes as a result payload
+ * @rqstp: svc_rqst to operate on
+ * @offset: payload's byte offset in rqstp->rq_res
+ * @length: size of payload, in bytes
+ *
+ * Returns zero on success, or a negative errno if a permanent
+ * error occurred.
+ */
+int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
+{
+ return rqstp->rq_xprt->xpt_ops->xpo_result_payload(rqstp, offset,
+ length);
+}
+EXPORT_SYMBOL_GPL(svc_encode_result_payload);
+
+/**
* svc_fill_write_vector - Construct data argument for VFS write call
* @rqstp: svc_rqst to operate on
- * @pages: list of pages containing data payload
- * @first: buffer containing first section of write payload
- * @total: total number of bytes of write payload
+ * @payload: xdr_buf containing only the write data payload
*
* Fills in rqstp::rq_vec, and returns the number of elements.
*/
-unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages,
- struct kvec *first, size_t total)
+unsigned int svc_fill_write_vector(struct svc_rqst *rqstp,
+ struct xdr_buf *payload)
{
+ struct page **pages = payload->pages;
+ struct kvec *first = payload->head;
struct kvec *vec = rqstp->rq_vec;
+ size_t total = payload->len;
unsigned int i;
/* Some types of transport can present the write payload
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index de3c077733a7..2106003645a7 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -6,6 +6,7 @@
*/
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/errno.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
@@ -104,8 +105,17 @@ void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
}
EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
-/*
- * Format the transport list for printing
+/**
+ * svc_print_xprts - Format the transport list for printing
+ * @buf: target buffer for formatted address
+ * @maxlen: length of target buffer
+ *
+ * Fills in @buf with a string containing a list of transport names, each name
+ * terminated with '\n'. If the buffer is too small, some entries may be
+ * missing, but it is guaranteed that all lines in the output buffer are
+ * complete.
+ *
+ * Returns positive length of the filled-in string.
*/
int svc_print_xprts(char *buf, int maxlen)
{
@@ -118,9 +128,9 @@ int svc_print_xprts(char *buf, int maxlen)
list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
int slen;
- sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload);
- slen = strlen(tmpstr);
- if (len + slen > maxlen)
+ slen = snprintf(tmpstr, sizeof(tmpstr), "%s %d\n",
+ xcl->xcl_name, xcl->xcl_max_payload);
+ if (slen >= sizeof(tmpstr) || len + slen >= maxlen)
break;
len += slen;
strcat(buf, tmpstr);
@@ -130,6 +140,20 @@ int svc_print_xprts(char *buf, int maxlen)
return len;
}
+/**
+ * svc_xprt_deferred_close - Close a transport
+ * @xprt: transport instance
+ *
+ * Used in contexts that need to defer the work of shutting down
+ * the transport to an nfsd thread.
+ */
+void svc_xprt_deferred_close(struct svc_xprt *xprt)
+{
+ if (!test_and_set_bit(XPT_CLOSE, &xprt->xpt_flags))
+ svc_xprt_enqueue(xprt);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_deferred_close);
+
static void svc_xprt_free(struct kref *kref)
{
struct svc_xprt *xprt =
@@ -138,12 +162,13 @@ static void svc_xprt_free(struct kref *kref)
if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags))
svcauth_unix_info_release(xprt);
put_cred(xprt->xpt_cred);
- put_net(xprt->xpt_net);
+ put_net_track(xprt->xpt_net, &xprt->ns_tracker);
/* See comment on corresponding get in xs_setup_bc_tcp(): */
if (xprt->xpt_bc_xprt)
xprt_put(xprt->xpt_bc_xprt);
if (xprt->xpt_bc_xps)
xprt_switch_put(xprt->xpt_bc_xps);
+ trace_svc_xprt_free(xprt);
xprt->xpt_ops->xpo_free(xprt);
module_put(owner);
}
@@ -173,7 +198,7 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
mutex_init(&xprt->xpt_mutex);
spin_lock_init(&xprt->xpt_lock);
set_bit(XPT_BUSY, &xprt->xpt_flags);
- xprt->xpt_net = get_net(net);
+ xprt->xpt_net = get_net_track(net, &xprt->ns_tracker, GFP_ATOMIC);
strcpy(xprt->xpt_remotebuf, "uninitialized");
}
EXPORT_SYMBOL_GPL(svc_xprt_init);
@@ -197,6 +222,7 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
.sin6_port = htons(port),
};
#endif
+ struct svc_xprt *xprt;
struct sockaddr *sap;
size_t len;
@@ -215,18 +241,24 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
return ERR_PTR(-EAFNOSUPPORT);
}
- return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
+ xprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
+ if (IS_ERR(xprt))
+ trace_svc_xprt_create_err(serv->sv_program->pg_name,
+ xcl->xcl_name, sap, len, xprt);
+ return xprt;
}
-/*
- * svc_xprt_received conditionally queues the transport for processing
- * by another thread. The caller must hold the XPT_BUSY bit and must
+/**
+ * svc_xprt_received - start next receiver thread
+ * @xprt: controlling transport
+ *
+ * The caller must hold the XPT_BUSY bit and must
* not thereafter touch transport data.
*
* Note: XPT_DATA only gets cleared when a read-attempt finds no (or
* insufficient) data.
*/
-static void svc_xprt_received(struct svc_xprt *xprt)
+void svc_xprt_received(struct svc_xprt *xprt)
{
if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) {
WARN_ONCE(1, "xprt=0x%p already busy!", xprt);
@@ -234,14 +266,15 @@ static void svc_xprt_received(struct svc_xprt *xprt)
}
/* As soon as we clear busy, the xprt could be closed and
- * 'put', so we need a reference to call svc_enqueue_xprt with:
+ * 'put', so we need a reference to call svc_xprt_enqueue with:
*/
svc_xprt_get(xprt);
smp_mb__before_atomic();
clear_bit(XPT_BUSY, &xprt->xpt_flags);
- xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt);
+ svc_xprt_enqueue(xprt);
svc_xprt_put(xprt);
}
+EXPORT_SYMBOL_GPL(svc_xprt_received);
void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
{
@@ -252,7 +285,7 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
svc_xprt_received(new);
}
-static int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
struct net *net, const int family,
const unsigned short port, int flags,
const struct cred *cred)
@@ -288,25 +321,35 @@ static int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
return -EPROTONOSUPPORT;
}
-int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+/**
+ * svc_xprt_create - Add a new listener to @serv
+ * @serv: target RPC service
+ * @xprt_name: transport class name
+ * @net: network namespace
+ * @family: network address family
+ * @port: listener port
+ * @flags: SVC_SOCK flags
+ * @cred: credential to bind to this transport
+ *
+ * Return values:
+ * %0: New listener added successfully
+ * %-EPROTONOSUPPORT: Requested transport type not supported
+ */
+int svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
struct net *net, const int family,
const unsigned short port, int flags,
const struct cred *cred)
{
int err;
- dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
- err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred);
+ err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred);
if (err == -EPROTONOSUPPORT) {
request_module("svc%s", xprt_name);
- err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred);
+ err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred);
}
- if (err < 0)
- dprintk("svc: transport %s not found, err %d\n",
- xprt_name, -err);
return err;
}
-EXPORT_SYMBOL_GPL(svc_create_xprt);
+EXPORT_SYMBOL_GPL(svc_xprt_create);
/*
* Copy the local and remote xprt addresses to the rqstp structure
@@ -382,6 +425,8 @@ static bool svc_xprt_ready(struct svc_xprt *xprt)
smp_rmb();
xpt_flags = READ_ONCE(xprt->xpt_flags);
+ if (xpt_flags & BIT(XPT_BUSY))
+ return false;
if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE)))
return true;
if (xpt_flags & (BIT(XPT_DATA) | BIT(XPT_DEFERRED))) {
@@ -394,11 +439,15 @@ static bool svc_xprt_ready(struct svc_xprt *xprt)
return false;
}
-void svc_xprt_do_enqueue(struct svc_xprt *xprt)
+/**
+ * svc_xprt_enqueue - Queue a transport on an idle nfsd thread
+ * @xprt: transport with data pending
+ *
+ */
+void svc_xprt_enqueue(struct svc_xprt *xprt)
{
struct svc_pool *pool;
struct svc_rqst *rqstp = NULL;
- int cpu;
if (!svc_xprt_ready(xprt))
return;
@@ -411,8 +460,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
return;
- cpu = get_cpu();
- pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
+ pool = svc_pool_for_cpu(xprt->xpt_server);
atomic_long_inc(&pool->sp_stats.packets);
@@ -435,21 +483,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
rqstp = NULL;
out_unlock:
rcu_read_unlock();
- put_cpu();
- trace_svc_xprt_do_enqueue(xprt, rqstp);
-}
-EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue);
-
-/*
- * Queue up a transport with data pending. If there are idle nfsd
- * processes, wake 'em up.
- *
- */
-void svc_xprt_enqueue(struct svc_xprt *xprt)
-{
- if (test_bit(XPT_BUSY, &xprt->xpt_flags))
- return;
- xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt);
+ trace_svc_xprt_enqueue(xprt, rqstp);
}
EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
@@ -509,6 +543,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp)
kfree(rqstp->rq_deferred);
rqstp->rq_deferred = NULL;
+ pagevec_release(&rqstp->rq_pvec);
svc_free_res_pages(rqstp);
rqstp->rq_res.page_len = 0;
rqstp->rq_res.page_base = 0;
@@ -631,36 +666,38 @@ static void svc_check_conn_limits(struct svc_serv *serv)
static int svc_alloc_arg(struct svc_rqst *rqstp)
{
struct svc_serv *serv = rqstp->rq_server;
- struct xdr_buf *arg;
- int pages;
- int i;
+ struct xdr_buf *arg = &rqstp->rq_arg;
+ unsigned long pages, filled, ret;
+
+ pagevec_init(&rqstp->rq_pvec);
- /* now allocate needed pages. If we get a failure, sleep briefly */
pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT;
if (pages > RPCSVC_MAXPAGES) {
- pr_warn_once("svc: warning: pages=%u > RPCSVC_MAXPAGES=%lu\n",
+ pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n",
pages, RPCSVC_MAXPAGES);
/* use as many pages as possible */
pages = RPCSVC_MAXPAGES;
}
- for (i = 0; i < pages ; i++)
- while (rqstp->rq_pages[i] == NULL) {
- struct page *p = alloc_page(GFP_KERNEL);
- if (!p) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (signalled() || kthread_should_stop()) {
- set_current_state(TASK_RUNNING);
- return -EINTR;
- }
- schedule_timeout(msecs_to_jiffies(500));
- }
- rqstp->rq_pages[i] = p;
+
+ for (filled = 0; filled < pages; filled = ret) {
+ ret = alloc_pages_bulk_array(GFP_KERNEL, pages,
+ rqstp->rq_pages);
+ if (ret > filled)
+ /* Made progress, don't sleep yet */
+ continue;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signalled() || kthread_should_stop()) {
+ set_current_state(TASK_RUNNING);
+ return -EINTR;
}
- rqstp->rq_page_end = &rqstp->rq_pages[i];
- rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
+ trace_svc_alloc_arg_err(pages, ret);
+ memalloc_retry_wait(GFP_KERNEL);
+ }
+ rqstp->rq_page_end = &rqstp->rq_pages[pages];
+ rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */
/* Make arg->head point to first page and arg->pages point to rest */
- arg = &rqstp->rq_arg;
arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
arg->head[0].iov_len = PAGE_SIZE;
arg->pages = rqstp->rq_pages + 1;
@@ -771,7 +808,6 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
int len = 0;
if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
- dprintk("svc_recv: found XPT_CLOSE\n");
if (test_and_clear_bit(XPT_KILL_TEMP, &xprt->xpt_flags))
xprt->xpt_ops->xpo_kill_temp_xprt(xprt);
svc_delete_xprt(xprt);
@@ -790,8 +826,11 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
if (newxpt) {
newxpt->xpt_cred = get_cred(xprt->xpt_cred);
svc_add_new_temp_xprt(serv, newxpt);
- } else
+ trace_svc_xprt_accept(newxpt, serv->sv_name);
+ } else {
module_put(xprt->xpt_class->xcl_owner);
+ }
+ svc_xprt_received(xprt);
} else if (svc_xprt_reserve_slot(rqstp, xprt)) {
/* XPT_DATA|XPT_DEFERRED case: */
dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
@@ -805,11 +844,10 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
rqstp->rq_stime = ktime_get();
rqstp->rq_reserved = serv->sv_max_mesg;
atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
- }
- /* clear XPT_BUSY: */
- svc_xprt_received(xprt);
+ } else
+ svc_xprt_received(xprt);
+
out:
- trace_svc_handle_xprt(xprt, len);
return len;
}
@@ -824,14 +862,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
struct svc_serv *serv = rqstp->rq_server;
int len, err;
- dprintk("svc: server %p waiting for data (to = %ld)\n",
- rqstp, timeout);
-
- if (rqstp->rq_xprt)
- printk(KERN_ERR
- "svc_recv: service %p, transport not NULL!\n",
- rqstp);
-
err = svc_alloc_arg(rqstp);
if (err)
goto out;
@@ -854,6 +884,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
err = -EAGAIN;
if (len <= 0)
goto out_release;
+ trace_svc_xdr_recvfrom(&rqstp->rq_arg);
clear_bit(XPT_OLD, &xprt->xpt_flags);
@@ -863,7 +894,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
if (serv->sv_stats)
serv->sv_stats->netcnt++;
- trace_svc_recv(rqstp, len);
return len;
out_release:
rqstp->rq_res.len = 0;
@@ -879,7 +909,6 @@ EXPORT_SYMBOL_GPL(svc_recv);
void svc_drop(struct svc_rqst *rqstp)
{
trace_svc_drop(rqstp);
- dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
svc_xprt_release(rqstp);
}
EXPORT_SYMBOL_GPL(svc_drop);
@@ -897,24 +926,16 @@ int svc_send(struct svc_rqst *rqstp)
if (!xprt)
goto out;
- /* release the receive skb before sending the reply */
- xprt->xpt_ops->xpo_release_rqst(rqstp);
-
/* calculate over-all length */
xb = &rqstp->rq_res;
xb->len = xb->head[0].iov_len +
xb->page_len +
xb->tail[0].iov_len;
-
- /* Grab mutex to serialize outgoing data. */
- mutex_lock(&xprt->xpt_mutex);
+ trace_svc_xdr_sendto(rqstp->rq_xid, xb);
trace_svc_stats_latency(rqstp);
- if (test_bit(XPT_DEAD, &xprt->xpt_flags)
- || test_bit(XPT_CLOSE, &xprt->xpt_flags))
- len = -ENOTCONN;
- else
- len = xprt->xpt_ops->xpo_sendto(rqstp);
- mutex_unlock(&xprt->xpt_mutex);
+
+ len = xprt->xpt_ops->xpo_sendto(rqstp);
+
trace_svc_send(rqstp, len);
svc_xprt_release(rqstp);
@@ -1022,12 +1043,13 @@ static void svc_delete_xprt(struct svc_xprt *xprt)
struct svc_serv *serv = xprt->xpt_server;
struct svc_deferred_req *dr;
- /* Only do this once */
if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags))
- BUG();
+ return;
- dprintk("svc: svc_delete_xprt(%p)\n", xprt);
+ trace_svc_xprt_detach(xprt);
xprt->xpt_ops->xpo_detach(xprt);
+ if (xprt->xpt_bc_xprt)
+ xprt->xpt_bc_xprt->ops->close(xprt->xpt_bc_xprt);
spin_lock_bh(&serv->sv_lock);
list_del_init(&xprt->xpt_list);
@@ -1043,8 +1065,14 @@ static void svc_delete_xprt(struct svc_xprt *xprt)
svc_xprt_put(xprt);
}
-void svc_close_xprt(struct svc_xprt *xprt)
+/**
+ * svc_xprt_close - Close a client connection
+ * @xprt: transport to disconnect
+ *
+ */
+void svc_xprt_close(struct svc_xprt *xprt)
{
+ trace_svc_xprt_close(xprt);
set_bit(XPT_CLOSE, &xprt->xpt_flags);
if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
/* someone else will have to effect the close */
@@ -1057,14 +1085,14 @@ void svc_close_xprt(struct svc_xprt *xprt)
*/
svc_delete_xprt(xprt);
}
-EXPORT_SYMBOL_GPL(svc_close_xprt);
+EXPORT_SYMBOL_GPL(svc_xprt_close);
static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net)
{
struct svc_xprt *xprt;
int ret = 0;
- spin_lock(&serv->sv_lock);
+ spin_lock_bh(&serv->sv_lock);
list_for_each_entry(xprt, xprt_list, xpt_list) {
if (xprt->xpt_net != net)
continue;
@@ -1072,7 +1100,7 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st
set_bit(XPT_CLOSE, &xprt->xpt_flags);
svc_xprt_enqueue(xprt);
}
- spin_unlock(&serv->sv_lock);
+ spin_unlock_bh(&serv->sv_lock);
return ret;
}
@@ -1109,7 +1137,11 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
}
}
-/*
+/**
+ * svc_xprt_destroy_all - Destroy transports associated with @serv
+ * @serv: RPC service to be shut down
+ * @net: target network namespace
+ *
* Server threads may still be running (especially in the case where the
* service is still running in other network namespaces).
*
@@ -1121,7 +1153,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
* threads, we may need to wait a little while and then check again to
* see if they're done.
*/
-void svc_close_net(struct svc_serv *serv, struct net *net)
+void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net)
{
int delay = 0;
@@ -1132,6 +1164,7 @@ void svc_close_net(struct svc_serv *serv, struct net *net)
msleep(delay++);
}
}
+EXPORT_SYMBOL_GPL(svc_xprt_destroy_all);
/*
* Handle defer and revisit of requests
@@ -1147,16 +1180,15 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
set_bit(XPT_DEFERRED, &xprt->xpt_flags);
if (too_many || test_bit(XPT_DEAD, &xprt->xpt_flags)) {
spin_unlock(&xprt->xpt_lock);
- dprintk("revisit canceled\n");
+ trace_svc_defer_drop(dr);
svc_xprt_put(xprt);
- trace_svc_drop_deferred(dr);
kfree(dr);
return;
}
- dprintk("revisit queued\n");
dr->xprt = NULL;
list_add(&dr->handle.recent, &xprt->xpt_deferred);
spin_unlock(&xprt->xpt_lock);
+ trace_svc_defer_queue(dr);
svc_xprt_enqueue(xprt);
svc_xprt_put(xprt);
}
@@ -1195,44 +1227,48 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req)
dr->addrlen = rqstp->rq_addrlen;
dr->daddr = rqstp->rq_daddr;
dr->argslen = rqstp->rq_arg.len >> 2;
- dr->xprt_hlen = rqstp->rq_xprt_hlen;
+ dr->xprt_ctxt = rqstp->rq_xprt_ctxt;
+ rqstp->rq_xprt_ctxt = NULL;
/* back up head to the start of the buffer and copy */
skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip,
dr->argslen << 2);
}
+ trace_svc_defer(rqstp);
svc_xprt_get(rqstp->rq_xprt);
dr->xprt = rqstp->rq_xprt;
- set_bit(RQ_DROPME, &rqstp->rq_flags);
+ __set_bit(RQ_DROPME, &rqstp->rq_flags);
dr->handle.revisit = svc_revisit;
- trace_svc_defer(rqstp);
return &dr->handle;
}
/*
* recv data from a deferred request into an active one
*/
-static int svc_deferred_recv(struct svc_rqst *rqstp)
+static noinline int svc_deferred_recv(struct svc_rqst *rqstp)
{
struct svc_deferred_req *dr = rqstp->rq_deferred;
+ trace_svc_defer_recv(dr);
+
/* setup iov_base past transport header */
- rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2);
+ rqstp->rq_arg.head[0].iov_base = dr->args;
/* The iov_len does not include the transport header bytes */
- rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen;
+ rqstp->rq_arg.head[0].iov_len = dr->argslen << 2;
rqstp->rq_arg.page_len = 0;
/* The rq_arg.len includes the transport header bytes */
- rqstp->rq_arg.len = dr->argslen<<2;
+ rqstp->rq_arg.len = dr->argslen << 2;
rqstp->rq_prot = dr->prot;
memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
rqstp->rq_addrlen = dr->addrlen;
/* Save off transport header len in case we get deferred again */
- rqstp->rq_xprt_hlen = dr->xprt_hlen;
rqstp->rq_daddr = dr->daddr;
rqstp->rq_respages = rqstp->rq_pages;
- return (dr->argslen<<2) - dr->xprt_hlen;
+ rqstp->rq_xprt_ctxt = dr->xprt_ctxt;
+ svc_xprt_received(rqstp->rq_xprt);
+ return dr->argslen << 2;
}
@@ -1248,7 +1284,6 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
struct svc_deferred_req,
handle.recent);
list_del_init(&dr->handle.recent);
- trace_svc_revisit_deferred(dr);
} else
clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
spin_unlock(&xprt->xpt_lock);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index 552617e3467b..e72ba2f13f6c 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -21,6 +21,8 @@
#include <trace/events/sunrpc.h>
+#include "sunrpc.h"
+
#define RPCDBG_FACILITY RPCDBG_AUTH
@@ -29,10 +31,12 @@
*/
extern struct auth_ops svcauth_null;
extern struct auth_ops svcauth_unix;
+extern struct auth_ops svcauth_tls;
static struct auth_ops __rcu *authtab[RPC_AUTH_MAXFLAVOR] = {
[RPC_AUTH_NULL] = (struct auth_ops __force __rcu *)&svcauth_null,
[RPC_AUTH_UNIX] = (struct auth_ops __force __rcu *)&svcauth_unix,
+ [RPC_AUTH_TLS] = (struct auth_ops __force __rcu *)&svcauth_tls,
};
static struct auth_ops *
@@ -57,12 +61,12 @@ svc_put_auth_ops(struct auth_ops *aops)
}
int
-svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
+svc_authenticate(struct svc_rqst *rqstp)
{
rpc_authflavor_t flavor;
struct auth_ops *aops;
- *authp = rpc_auth_ok;
+ rqstp->rq_auth_stat = rpc_auth_ok;
flavor = svc_getnl(&rqstp->rq_arg.head[0]);
@@ -70,7 +74,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
aops = svc_get_auth_ops(flavor);
if (aops == NULL) {
- *authp = rpc_autherr_badcred;
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
return SVC_DENIED;
}
@@ -78,7 +82,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
init_svc_cred(&rqstp->rq_cred);
rqstp->rq_authop = aops;
- return aops->accept(rqstp, authp);
+ return aops->accept(rqstp);
}
EXPORT_SYMBOL_GPL(svc_authenticate);
@@ -205,3 +209,26 @@ struct auth_domain *auth_domain_find(char *name)
return NULL;
}
EXPORT_SYMBOL_GPL(auth_domain_find);
+
+/**
+ * auth_domain_cleanup - check that the auth_domain table is empty
+ *
+ * On module unload the auth_domain_table must be empty. To make it
+ * easier to catch bugs which don't clean up domains properly, we
+ * warn if anything remains in the table at cleanup time.
+ *
+ * Note that we cannot proactively remove the domains at this stage.
+ * The ->release() function might be in a module that has already been
+ * unloaded.
+ */
+
+void auth_domain_cleanup(void)
+{
+ int h;
+ struct auth_domain *hp;
+
+ for (h = 0; h < DN_HASHMAX; h++)
+ hlist_for_each_entry(hp, &auth_domain_table[h], hash)
+ pr_warn("svc: domain %s still present at module unload.\n",
+ hp->name);
+}
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 04aa80a2d752..b1efc34db6ed 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -37,6 +37,7 @@ struct unix_domain {
extern struct auth_ops svcauth_null;
extern struct auth_ops svcauth_unix;
+extern struct auth_ops svcauth_tls;
static void svcauth_unix_domain_release_rcu(struct rcu_head *head)
{
@@ -148,6 +149,11 @@ static struct cache_head *ip_map_alloc(void)
return NULL;
}
+static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall(cd, h);
+}
+
static void ip_map_request(struct cache_detail *cd,
struct cache_head *h,
char **bpp, int *blen)
@@ -298,15 +304,6 @@ static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class,
return NULL;
}
-static inline struct ip_map *ip_map_lookup(struct net *net, char *class,
- struct in6_addr *addr)
-{
- struct sunrpc_net *sn;
-
- sn = net_generic(net, sunrpc_net_id);
- return __ip_map_lookup(sn->ip_map_cache, class, addr);
-}
-
static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
struct unix_domain *udom, time64_t expiry)
{
@@ -327,15 +324,6 @@ static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
return 0;
}
-static inline int ip_map_update(struct net *net, struct ip_map *ipm,
- struct unix_domain *udom, time64_t expiry)
-{
- struct sunrpc_net *sn;
-
- sn = net_generic(net, sunrpc_net_id);
- return __ip_map_update(sn->ip_map_cache, ipm, udom, expiry);
-}
-
void svcauth_unix_purge(struct net *net)
{
struct sunrpc_net *sn;
@@ -467,6 +455,11 @@ static struct cache_head *unix_gid_alloc(void)
return NULL;
}
+static int unix_gid_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall_timeout(cd, h);
+}
+
static void unix_gid_request(struct cache_detail *cd,
struct cache_head *h,
char **bpp, int *blen)
@@ -584,6 +577,7 @@ static const struct cache_detail unix_gid_cache_template = {
.hash_size = GID_HASHMAX,
.name = "auth.unix.gid",
.cache_put = unix_gid_put,
+ .cache_upcall = unix_gid_upcall,
.cache_request = unix_gid_request,
.cache_parse = unix_gid_parse,
.cache_show = unix_gid_show,
@@ -688,8 +682,9 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
rqstp->rq_client = NULL;
if (rqstp->rq_proc == 0)
- return SVC_OK;
+ goto out;
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
ipm = ip_map_cached_get(xprt);
if (ipm == NULL)
ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class,
@@ -726,13 +721,16 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
put_group_info(cred->cr_group_info);
cred->cr_group_info = gi;
}
+
+out:
+ rqstp->rq_auth_stat = rpc_auth_ok;
return SVC_OK;
}
EXPORT_SYMBOL_GPL(svcauth_unix_set_client);
static int
-svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
+svcauth_null_accept(struct svc_rqst *rqstp)
{
struct kvec *argv = &rqstp->rq_arg.head[0];
struct kvec *resv = &rqstp->rq_res.head[0];
@@ -743,12 +741,12 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
if (svc_getu32(argv) != 0) {
dprintk("svc: bad null cred\n");
- *authp = rpc_autherr_badcred;
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
return SVC_DENIED;
}
if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
dprintk("svc: bad null verf\n");
- *authp = rpc_autherr_badverf;
+ rqstp->rq_auth_stat = rpc_autherr_badverf;
return SVC_DENIED;
}
@@ -792,7 +790,66 @@ struct auth_ops svcauth_null = {
static int
-svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
+svcauth_tls_accept(struct svc_rqst *rqstp)
+{
+ struct svc_cred *cred = &rqstp->rq_cred;
+ struct kvec *argv = rqstp->rq_arg.head;
+ struct kvec *resv = rqstp->rq_res.head;
+
+ if (argv->iov_len < XDR_UNIT * 3)
+ return SVC_GARBAGE;
+
+ /* Call's cred length */
+ if (svc_getu32(argv) != xdr_zero) {
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ return SVC_DENIED;
+ }
+
+ /* Call's verifier flavor and its length */
+ if (svc_getu32(argv) != rpc_auth_null ||
+ svc_getu32(argv) != xdr_zero) {
+ rqstp->rq_auth_stat = rpc_autherr_badverf;
+ return SVC_DENIED;
+ }
+
+ /* AUTH_TLS is not valid on non-NULL procedures */
+ if (rqstp->rq_proc != 0) {
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ return SVC_DENIED;
+ }
+
+ /* Mapping to nobody uid/gid is required */
+ cred->cr_uid = INVALID_UID;
+ cred->cr_gid = INVALID_GID;
+ cred->cr_group_info = groups_alloc(0);
+ if (cred->cr_group_info == NULL)
+ return SVC_CLOSE; /* kmalloc failure - client must retry */
+
+ /* Reply's verifier */
+ svc_putnl(resv, RPC_AUTH_NULL);
+ if (rqstp->rq_xprt->xpt_ops->xpo_start_tls) {
+ svc_putnl(resv, 8);
+ memcpy(resv->iov_base + resv->iov_len, "STARTTLS", 8);
+ resv->iov_len += 8;
+ } else
+ svc_putnl(resv, 0);
+
+ rqstp->rq_cred.cr_flavor = RPC_AUTH_TLS;
+ return SVC_OK;
+}
+
+struct auth_ops svcauth_tls = {
+ .name = "tls",
+ .owner = THIS_MODULE,
+ .flavour = RPC_AUTH_TLS,
+ .accept = svcauth_tls_accept,
+ .release = svcauth_null_release,
+ .set_client = svcauth_unix_set_client,
+};
+
+
+static int
+svcauth_unix_accept(struct svc_rqst *rqstp)
{
struct kvec *argv = &rqstp->rq_arg.head[0];
struct kvec *resv = &rqstp->rq_res.head[0];
@@ -834,7 +891,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
}
groups_sort(cred->cr_group_info);
if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
- *authp = rpc_autherr_badverf;
+ rqstp->rq_auth_stat = rpc_autherr_badverf;
return SVC_DENIED;
}
@@ -846,7 +903,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
return SVC_OK;
badcred:
- *authp = rpc_autherr_badcred;
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
return SVC_DENIED;
}
@@ -881,6 +938,7 @@ static const struct cache_detail ip_map_cache_template = {
.hash_size = IP_HASHMAX,
.name = "auth.unix.ip",
.cache_put = ip_map_put,
+ .cache_upcall = ip_map_upcall,
.cache_request = ip_map_request,
.cache_parse = ip_map_parse,
.cache_show = ip_map_show,
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 2934dd711715..2fc98fea59b4 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -44,8 +44,8 @@
#include <net/tcp.h>
#include <net/tcp_states.h>
#include <linux/uaccess.h>
+#include <linux/highmem.h>
#include <asm/ioctls.h>
-#include <trace/events/skb.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/clnt.h>
@@ -55,6 +55,9 @@
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/xprt.h>
+#include <trace/events/sunrpc.h>
+
+#include "socklib.h"
#include "sunrpc.h"
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
@@ -107,31 +110,26 @@ static void svc_reclassify_socket(struct socket *sock)
}
#endif
-/*
- * Release an skbuff after use
+/**
+ * svc_tcp_release_rqst - Release transport-related resources
+ * @rqstp: request structure with resources to be released
+ *
*/
-static void svc_release_skb(struct svc_rqst *rqstp)
+static void svc_tcp_release_rqst(struct svc_rqst *rqstp)
{
- struct sk_buff *skb = rqstp->rq_xprt_ctxt;
-
- if (skb) {
- struct svc_sock *svsk =
- container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
- rqstp->rq_xprt_ctxt = NULL;
-
- dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
- skb_free_datagram_locked(svsk->sk_sk, skb);
- }
}
-static void svc_release_udp_skb(struct svc_rqst *rqstp)
+/**
+ * svc_udp_release_rqst - Release transport-related resources
+ * @rqstp: request structure with resources to be released
+ *
+ */
+static void svc_udp_release_rqst(struct svc_rqst *rqstp)
{
struct sk_buff *skb = rqstp->rq_xprt_ctxt;
if (skb) {
rqstp->rq_xprt_ctxt = NULL;
-
- dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
consume_skb(skb);
}
}
@@ -174,109 +172,10 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
}
}
-/*
- * send routine intended to be shared by the fore- and back-channel
- */
-int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
- struct page *headpage, unsigned long headoffset,
- struct page *tailpage, unsigned long tailoffset)
+static int svc_sock_result_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
{
- int result;
- int size;
- struct page **ppage = xdr->pages;
- size_t base = xdr->page_base;
- unsigned int pglen = xdr->page_len;
- unsigned int flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
- int slen;
- int len = 0;
-
- slen = xdr->len;
-
- /* send head */
- if (slen == xdr->head[0].iov_len)
- flags = 0;
- len = kernel_sendpage(sock, headpage, headoffset,
- xdr->head[0].iov_len, flags);
- if (len != xdr->head[0].iov_len)
- goto out;
- slen -= xdr->head[0].iov_len;
- if (slen == 0)
- goto out;
-
- /* send page data */
- size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
- while (pglen > 0) {
- if (slen == size)
- flags = 0;
- result = kernel_sendpage(sock, *ppage, base, size, flags);
- if (result > 0)
- len += result;
- if (result != size)
- goto out;
- slen -= size;
- pglen -= size;
- size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
- base = 0;
- ppage++;
- }
-
- /* send tail */
- if (xdr->tail[0].iov_len) {
- result = kernel_sendpage(sock, tailpage, tailoffset,
- xdr->tail[0].iov_len, 0);
- if (result > 0)
- len += result;
- }
-
-out:
- return len;
-}
-
-
-/*
- * Generic sendto routine
- */
-static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
-{
- struct svc_sock *svsk =
- container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
- struct socket *sock = svsk->sk_sock;
- union {
- struct cmsghdr hdr;
- long all[SVC_PKTINFO_SPACE / sizeof(long)];
- } buffer;
- struct cmsghdr *cmh = &buffer.hdr;
- int len = 0;
- unsigned long tailoff;
- unsigned long headoff;
- RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
-
- if (rqstp->rq_prot == IPPROTO_UDP) {
- struct msghdr msg = {
- .msg_name = &rqstp->rq_addr,
- .msg_namelen = rqstp->rq_addrlen,
- .msg_control = cmh,
- .msg_controllen = sizeof(buffer),
- .msg_flags = MSG_MORE,
- };
-
- svc_set_cmsg_data(rqstp, cmh);
-
- if (sock_sendmsg(sock, &msg) < 0)
- goto out;
- }
-
- tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1);
- headoff = 0;
- len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff,
- rqstp->rq_respages[0], tailoff);
-
-out:
- dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n",
- svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
- xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
-
- return len;
+ return 0;
}
/*
@@ -316,34 +215,66 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
return len;
}
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+static void svc_flush_bvec(const struct bio_vec *bvec, size_t size, size_t seek)
+{
+ struct bvec_iter bi = {
+ .bi_size = size + seek,
+ };
+ struct bio_vec bv;
+
+ bvec_iter_advance(bvec, &bi, seek & PAGE_MASK);
+ for_each_bvec(bv, bvec, bi, bi)
+ flush_dcache_page(bv.bv_page);
+}
+#else
+static inline void svc_flush_bvec(const struct bio_vec *bvec, size_t size,
+ size_t seek)
+{
+}
+#endif
+
/*
- * Generic recvfrom routine.
+ * Read from @rqstp's transport socket. The incoming message fills whole
+ * pages in @rqstp's rq_pages array until the last page of the message
+ * has been received into a partial page.
*/
-static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov,
- unsigned int nr, size_t buflen, unsigned int base)
+static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen,
+ size_t seek)
{
struct svc_sock *svsk =
container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ struct bio_vec *bvec = rqstp->rq_bvec;
struct msghdr msg = { NULL };
+ unsigned int i;
ssize_t len;
-
- rqstp->rq_xprt_hlen = 0;
+ size_t t;
clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- iov_iter_kvec(&msg.msg_iter, READ, iov, nr, buflen);
- if (base != 0) {
- iov_iter_advance(&msg.msg_iter, base);
- buflen -= base;
+
+ for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) {
+ bvec[i].bv_page = rqstp->rq_pages[i];
+ bvec[i].bv_len = PAGE_SIZE;
+ bvec[i].bv_offset = 0;
+ }
+ rqstp->rq_respages = &rqstp->rq_pages[i];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ iov_iter_bvec(&msg.msg_iter, READ, bvec, i, buflen);
+ if (seek) {
+ iov_iter_advance(&msg.msg_iter, seek);
+ buflen -= seek;
}
len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT);
+ if (len > 0)
+ svc_flush_bvec(bvec, len, seek);
+
/* If we read a full record, then assume there may be more
* data to read (stream based sockets only!)
*/
if (len == buflen)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- dprintk("svc: socket %p recvfrom(%p, %zu) = %zd\n",
- svsk, iov[0].iov_base, iov[0].iov_len, len);
return len;
}
@@ -367,9 +298,9 @@ static void svc_sock_setbufsize(struct svc_sock *svsk, unsigned int nreqs)
static void svc_sock_secure_port(struct svc_rqst *rqstp)
{
if (svc_port_is_privileged(svc_addr(rqstp)))
- set_bit(RQ_SECURE, &rqstp->rq_flags);
+ __set_bit(RQ_SECURE, &rqstp->rq_flags);
else
- clear_bit(RQ_SECURE, &rqstp->rq_flags);
+ __clear_bit(RQ_SECURE, &rqstp->rq_flags);
}
/*
@@ -380,13 +311,10 @@ static void svc_data_ready(struct sock *sk)
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
if (svsk) {
- dprintk("svc: socket %p(inet %p), busy=%d\n",
- svsk, sk,
- test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
-
/* Refer to svc_setup_socket() for details. */
rmb();
svsk->sk_odata(sk);
+ trace_svcsock_data_ready(&svsk->sk_xprt, 0);
if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags))
svc_xprt_enqueue(&svsk->sk_xprt);
}
@@ -400,11 +328,9 @@ static void svc_write_space(struct sock *sk)
struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
if (svsk) {
- dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
- svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
-
/* Refer to svc_setup_socket() for details. */
rmb();
+ trace_svcsock_write_space(&svsk->sk_xprt, 0);
svsk->sk_owspace(sk);
svc_xprt_enqueue(&svsk->sk_xprt);
}
@@ -421,17 +347,9 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt)
static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt)
{
- struct svc_sock *svsk;
- struct socket *sock;
- struct linger no_linger = {
- .l_onoff = 1,
- .l_linger = 0,
- };
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
- svsk = container_of(xprt, struct svc_sock, sk_xprt);
- sock = svsk->sk_sock;
- kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
- (char *)&no_linger, sizeof(no_linger));
+ sock_no_linger(svsk->sk_sock->sk);
}
/*
@@ -489,8 +407,15 @@ static int svc_udp_get_dest_address(struct svc_rqst *rqstp,
return 0;
}
-/*
- * Receive a datagram from a UDP socket.
+/**
+ * svc_udp_recvfrom - Receive a datagram from a UDP socket.
+ * @rqstp: request structure into which to receive an RPC Call
+ *
+ * Called in a loop when XPT_DATA has been set.
+ *
+ * Returns:
+ * On success, the number of bytes in a received RPC Call, or
+ * %0 if a complete RPC Call message was not ready to return
*/
static int svc_udp_recvfrom(struct svc_rqst *rqstp)
{
@@ -524,20 +449,14 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
svc_sock_setbufsize(svsk, serv->sv_nrthreads + 3);
clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- skb = NULL;
err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
0, 0, MSG_PEEK | MSG_DONTWAIT);
- if (err >= 0)
- skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err);
-
- if (skb == NULL) {
- if (err != -EAGAIN) {
- /* possibly an icmp error */
- dprintk("svc: recvfrom returned error %d\n", -err);
- set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- }
- return 0;
- }
+ if (err < 0)
+ goto out_recv_err;
+ skb = skb_recv_udp(svsk->sk_sk, MSG_DONTWAIT, &err);
+ if (!skb)
+ goto out_recv_err;
+
len = svc_addr_len(svc_addr(rqstp));
rqstp->rq_addrlen = len;
if (skb->tstamp == 0) {
@@ -548,26 +467,21 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
sock_write_timestamp(svsk->sk_sk, skb->tstamp);
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
- len = skb->len;
+ len = skb->len;
rqstp->rq_arg.len = len;
+ trace_svcsock_udp_recv(&svsk->sk_xprt, len);
rqstp->rq_prot = IPPROTO_UDP;
- if (!svc_udp_get_dest_address(rqstp, cmh)) {
- net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n",
- cmh->cmsg_level, cmh->cmsg_type);
- goto out_free;
- }
+ if (!svc_udp_get_dest_address(rqstp, cmh))
+ goto out_cmsg_err;
rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp));
if (skb_is_nonlinear(skb)) {
/* we have to copy */
local_bh_disable();
- if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
- local_bh_enable();
- /* checksum error */
- goto out_free;
- }
+ if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb))
+ goto out_bh_enable;
local_bh_enable();
consume_skb(skb);
} else {
@@ -594,23 +508,86 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
if (serv->sv_stats)
serv->sv_stats->netudpcnt++;
+ svc_xprt_received(rqstp->rq_xprt);
return len;
+
+out_recv_err:
+ if (err != -EAGAIN) {
+ /* possibly an icmp error */
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ }
+ trace_svcsock_udp_recv_err(&svsk->sk_xprt, err);
+ goto out_clear_busy;
+out_cmsg_err:
+ net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n",
+ cmh->cmsg_level, cmh->cmsg_type);
+ goto out_free;
+out_bh_enable:
+ local_bh_enable();
out_free:
kfree_skb(skb);
+out_clear_busy:
+ svc_xprt_received(rqstp->rq_xprt);
return 0;
}
-static int
-svc_udp_sendto(struct svc_rqst *rqstp)
+/**
+ * svc_udp_sendto - Send out a reply on a UDP socket
+ * @rqstp: completed svc_rqst
+ *
+ * xpt_mutex ensures @rqstp's whole message is written to the socket
+ * without interruption.
+ *
+ * Returns the number of bytes sent, or a negative errno.
+ */
+static int svc_udp_sendto(struct svc_rqst *rqstp)
{
- int error;
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct xdr_buf *xdr = &rqstp->rq_res;
+ union {
+ struct cmsghdr hdr;
+ long all[SVC_PKTINFO_SPACE / sizeof(long)];
+ } buffer;
+ struct cmsghdr *cmh = &buffer.hdr;
+ struct msghdr msg = {
+ .msg_name = &rqstp->rq_addr,
+ .msg_namelen = rqstp->rq_addrlen,
+ .msg_control = cmh,
+ .msg_controllen = sizeof(buffer),
+ };
+ unsigned int sent;
+ int err;
+
+ svc_udp_release_rqst(rqstp);
- error = svc_sendto(rqstp, &rqstp->rq_res);
- if (error == -ECONNREFUSED)
+ svc_set_cmsg_data(rqstp, cmh);
+
+ mutex_lock(&xprt->xpt_mutex);
+
+ if (svc_xprt_is_dead(xprt))
+ goto out_notconn;
+
+ err = xdr_alloc_bvec(xdr, GFP_KERNEL);
+ if (err < 0)
+ goto out_unlock;
+
+ err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent);
+ if (err == -ECONNREFUSED) {
/* ICMP error on earlier request. */
- error = svc_sendto(rqstp, &rqstp->rq_res);
+ err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent);
+ }
+ xdr_free_bvec(xdr);
+ trace_svcsock_udp_send(xprt, err);
+out_unlock:
+ mutex_unlock(&xprt->xpt_mutex);
+ if (err < 0)
+ return err;
+ return sent;
- return error;
+out_notconn:
+ mutex_unlock(&xprt->xpt_mutex);
+ return -ENOTCONN;
}
static int svc_udp_has_wspace(struct svc_xprt *xprt)
@@ -653,7 +630,8 @@ static const struct svc_xprt_ops svc_udp_ops = {
.xpo_create = svc_udp_create,
.xpo_recvfrom = svc_udp_recvfrom,
.xpo_sendto = svc_udp_sendto,
- .xpo_release_rqst = svc_release_udp_skb,
+ .xpo_result_payload = svc_sock_result_payload,
+ .xpo_release_rqst = svc_udp_release_rqst,
.xpo_detach = svc_sock_detach,
.xpo_free = svc_sock_free,
.xpo_has_wspace = svc_udp_has_wspace,
@@ -672,8 +650,6 @@ static struct svc_xprt_class svc_udp_class = {
static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
{
- int err, level, optname, one = 1;
-
svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class,
&svsk->sk_xprt, serv);
clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
@@ -693,19 +669,14 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
/* make sure we get destination address info */
switch (svsk->sk_sk->sk_family) {
case AF_INET:
- level = SOL_IP;
- optname = IP_PKTINFO;
+ ip_sock_set_pktinfo(svsk->sk_sock->sk);
break;
case AF_INET6:
- level = SOL_IPV6;
- optname = IPV6_RECVPKTINFO;
+ ip6_sock_set_recvpktinfo(svsk->sk_sock->sk);
break;
default:
BUG();
}
- err = kernel_setsockopt(svsk->sk_sock, level, optname,
- (char *)&one, sizeof(one));
- dprintk("svc: kernel_setsockopt returned %d\n", err);
}
/*
@@ -716,9 +687,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- dprintk("svc: socket %p TCP (listen) state change %d\n",
- sk, sk->sk_state);
-
if (svsk) {
/* Refer to svc_setup_socket() for details. */
rmb();
@@ -739,8 +707,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
if (svsk) {
set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
- } else
- printk("svc: socket %p: no user data\n", sk);
+ }
}
}
@@ -751,19 +718,13 @@ static void svc_tcp_state_change(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
- sk, sk->sk_state, sk->sk_user_data);
-
- if (!svsk)
- printk("svc: socket %p: no user data\n", sk);
- else {
+ if (svsk) {
/* Refer to svc_setup_socket() for details. */
rmb();
svsk->sk_ostate(sk);
- if (sk->sk_state != TCP_ESTABLISHED) {
- set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
- svc_xprt_enqueue(&svsk->sk_xprt);
- }
+ trace_svcsock_tcp_state(&svsk->sk_xprt, svsk->sk_sock);
+ if (sk->sk_state != TCP_ESTABLISHED)
+ svc_xprt_deferred_close(&svsk->sk_xprt);
}
}
@@ -780,9 +741,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
struct socket *newsock;
struct svc_sock *newsvsk;
int err, slen;
- RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
- dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
if (!sock)
return NULL;
@@ -795,30 +754,18 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
else if (err != -EAGAIN)
net_warn_ratelimited("%s: accept failed (err %d)!\n",
serv->sv_name, -err);
+ trace_svcsock_accept_err(xprt, serv->sv_name, err);
return NULL;
}
set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
err = kernel_getpeername(newsock, sin);
if (err < 0) {
- net_warn_ratelimited("%s: peername failed (err %d)!\n",
- serv->sv_name, -err);
+ trace_svcsock_getpeername_err(xprt, serv->sv_name, err);
goto failed; /* aborted connection or whatever */
}
slen = err;
- /* Ideally, we would want to reject connections from unauthorized
- * hosts here, but when we get encryption, the IP of the host won't
- * tell us anything. For now just warn about unpriv connections.
- */
- if (!svc_port_is_privileged(sin)) {
- dprintk("%s: connect from unprivileged port: %s\n",
- serv->sv_name,
- __svc_print_addr(sin, buf, sizeof(buf)));
- }
- dprintk("%s: connect from %s\n", serv->sv_name,
- __svc_print_addr(sin, buf, sizeof(buf)));
-
/* Reset the inherited callbacks before calling svc_setup_socket */
newsock->sk->sk_state_change = svsk->sk_ostate;
newsock->sk->sk_data_ready = svsk->sk_odata;
@@ -836,10 +783,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
err = kernel_getsockname(newsock, sin);
slen = err;
- if (unlikely(err < 0)) {
- dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
+ if (unlikely(err < 0))
slen = offsetof(struct sockaddr, sa_data);
- }
svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
if (sock_is_loopback(newsock->sk))
@@ -856,13 +801,14 @@ failed:
return NULL;
}
-static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
+static size_t svc_tcp_restore_pages(struct svc_sock *svsk,
+ struct svc_rqst *rqstp)
{
- unsigned int i, len, npages;
+ size_t len = svsk->sk_datalen;
+ unsigned int i, npages;
- if (svsk->sk_datalen == 0)
+ if (!len)
return 0;
- len = svsk->sk_datalen;
npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
for (i = 0; i < npages; i++) {
if (rqstp->rq_pages[i] != NULL)
@@ -911,47 +857,45 @@ out:
}
/*
- * Receive fragment record header.
- * If we haven't gotten the record length yet, get the next four bytes.
+ * Receive fragment record header into sk_marker.
*/
-static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
+static ssize_t svc_tcp_read_marker(struct svc_sock *svsk,
+ struct svc_rqst *rqstp)
{
- struct svc_serv *serv = svsk->sk_xprt.xpt_server;
- unsigned int want;
- int len;
+ ssize_t want, len;
+ /* If we haven't gotten the record length yet,
+ * get the next four bytes.
+ */
if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
+ struct msghdr msg = { NULL };
struct kvec iov;
want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
- iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
+ iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen;
iov.iov_len = want;
- len = svc_recvfrom(rqstp, &iov, 1, want, 0);
+ iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, want);
+ len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT);
if (len < 0)
- goto error;
+ return len;
svsk->sk_tcplen += len;
-
if (len < want) {
- dprintk("svc: short recvfrom while reading record "
- "length (%d of %d)\n", len, want);
- return -EAGAIN;
+ /* call again to read the remaining bytes */
+ goto err_short;
}
-
- dprintk("svc: TCP record, %d bytes\n", svc_sock_reclen(svsk));
+ trace_svcsock_marker(&svsk->sk_xprt, svsk->sk_marker);
if (svc_sock_reclen(svsk) + svsk->sk_datalen >
- serv->sv_max_mesg) {
- net_notice_ratelimited("RPC: fragment too large: %d\n",
- svc_sock_reclen(svsk));
- goto err_delete;
- }
+ svsk->sk_xprt.xpt_server->sv_max_mesg)
+ goto err_too_large;
}
-
return svc_sock_reclen(svsk);
-error:
- dprintk("RPC: TCP recv_record got %d\n", len);
- return len;
-err_delete:
- set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+
+err_too_large:
+ net_notice_ratelimited("svc: %s %s RPC fragment too large: %d\n",
+ __func__, svsk->sk_xprt.xpt_server->sv_name,
+ svc_sock_reclen(svsk));
+ svc_xprt_deferred_close(&svsk->sk_xprt);
+err_short:
return -EAGAIN;
}
@@ -1000,87 +944,58 @@ unlock_eagain:
return -EAGAIN;
}
-static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len)
-{
- int i = 0;
- int t = 0;
-
- while (t < len) {
- vec[i].iov_base = page_address(pages[i]);
- vec[i].iov_len = PAGE_SIZE;
- i++;
- t += PAGE_SIZE;
- }
- return i;
-}
-
static void svc_tcp_fragment_received(struct svc_sock *svsk)
{
/* If we have more data, signal svc_xprt_enqueue() to try again */
- dprintk("svc: TCP %s record (%d bytes)\n",
- svc_sock_final_rec(svsk) ? "final" : "nonfinal",
- svc_sock_reclen(svsk));
svsk->sk_tcplen = 0;
- svsk->sk_reclen = 0;
+ svsk->sk_marker = xdr_zero;
}
-/*
- * Receive data from a TCP socket.
+/**
+ * svc_tcp_recvfrom - Receive data from a TCP socket
+ * @rqstp: request structure into which to receive an RPC Call
+ *
+ * Called in a loop when XPT_DATA has been set.
+ *
+ * Read the 4-byte stream record marker, then use the record length
+ * in that marker to set up exactly the resources needed to receive
+ * the next RPC message into @rqstp.
+ *
+ * Returns:
+ * On success, the number of bytes in a received RPC Call, or
+ * %0 if a complete RPC Call message was not ready to return
+ *
+ * The zero return case handles partial receives and callback Replies.
+ * The state of a partial receive is preserved in the svc_sock for
+ * the next call to svc_tcp_recvfrom.
*/
static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
{
struct svc_sock *svsk =
container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
struct svc_serv *serv = svsk->sk_xprt.xpt_server;
- int len;
- struct kvec *vec;
- unsigned int want, base;
+ size_t want, base;
+ ssize_t len;
__be32 *p;
__be32 calldir;
- int pnum;
-
- dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
- svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
- test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
- test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
- len = svc_tcp_recv_record(svsk, rqstp);
+ clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ len = svc_tcp_read_marker(svsk, rqstp);
if (len < 0)
goto error;
base = svc_tcp_restore_pages(svsk, rqstp);
- want = svc_sock_reclen(svsk) - (svsk->sk_tcplen - sizeof(rpc_fraghdr));
-
- vec = rqstp->rq_vec;
-
- pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], base + want);
-
- rqstp->rq_respages = &rqstp->rq_pages[pnum];
- rqstp->rq_next_page = rqstp->rq_respages + 1;
-
- /* Now receive data */
- len = svc_recvfrom(rqstp, vec, pnum, base + want, base);
+ want = len - (svsk->sk_tcplen - sizeof(rpc_fraghdr));
+ len = svc_tcp_read_msg(rqstp, base + want, base);
if (len >= 0) {
+ trace_svcsock_tcp_recv(&svsk->sk_xprt, len);
svsk->sk_tcplen += len;
svsk->sk_datalen += len;
}
- if (len != want || !svc_sock_final_rec(svsk)) {
- svc_tcp_save_pages(svsk, rqstp);
- if (len < 0 && len != -EAGAIN)
- goto err_delete;
- if (len == want)
- svc_tcp_fragment_received(svsk);
- else
- dprintk("svc: incomplete TCP record (%d of %d)\n",
- (int)(svsk->sk_tcplen - sizeof(rpc_fraghdr)),
- svc_sock_reclen(svsk));
- goto err_noclose;
- }
-
- if (svsk->sk_datalen < 8) {
- svsk->sk_datalen = 0;
- goto err_delete; /* client is nuts. */
- }
+ if (len != want || !svc_sock_final_rec(svsk))
+ goto err_incomplete;
+ if (svsk->sk_datalen < 8)
+ goto err_nuts;
rqstp->rq_arg.len = svsk->sk_datalen;
rqstp->rq_arg.page_base = 0;
@@ -1093,9 +1008,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
rqstp->rq_xprt_ctxt = NULL;
rqstp->rq_prot = IPPROTO_TCP;
if (test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags))
- set_bit(RQ_LOCAL, &rqstp->rq_flags);
+ __set_bit(RQ_LOCAL, &rqstp->rq_flags);
else
- clear_bit(RQ_LOCAL, &rqstp->rq_flags);
+ __clear_bit(RQ_LOCAL, &rqstp->rq_flags);
p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
calldir = p[1];
@@ -1113,50 +1028,167 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
if (serv->sv_stats)
serv->sv_stats->nettcpcnt++;
+ svc_xprt_received(rqstp->rq_xprt);
return rqstp->rq_arg.len;
+err_incomplete:
+ svc_tcp_save_pages(svsk, rqstp);
+ if (len < 0 && len != -EAGAIN)
+ goto err_delete;
+ if (len == want)
+ svc_tcp_fragment_received(svsk);
+ else
+ trace_svcsock_tcp_recv_short(&svsk->sk_xprt,
+ svc_sock_reclen(svsk),
+ svsk->sk_tcplen - sizeof(rpc_fraghdr));
+ goto err_noclose;
error:
if (len != -EAGAIN)
goto err_delete;
- dprintk("RPC: TCP recvfrom got EAGAIN\n");
- return 0;
+ trace_svcsock_tcp_recv_eagain(&svsk->sk_xprt, 0);
+ goto err_noclose;
+err_nuts:
+ svsk->sk_datalen = 0;
err_delete:
- printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
- svsk->sk_xprt.xpt_server->sv_name, -len);
- set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ trace_svcsock_tcp_recv_err(&svsk->sk_xprt, len);
+ svc_xprt_deferred_close(&svsk->sk_xprt);
err_noclose:
+ svc_xprt_received(rqstp->rq_xprt);
return 0; /* record not complete */
}
+static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
+ int flags)
+{
+ return kernel_sendpage(sock, virt_to_page(vec->iov_base),
+ offset_in_page(vec->iov_base),
+ vec->iov_len, flags);
+}
+
/*
- * Send out data on TCP socket.
+ * kernel_sendpage() is used exclusively to reduce the number of
+ * copy operations in this path. Therefore the caller must ensure
+ * that the pages backing @xdr are unchanging.
+ *
+ * In addition, the logic assumes that * .bv_len is never larger
+ * than PAGE_SIZE.
*/
-static int svc_tcp_sendto(struct svc_rqst *rqstp)
+static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
+ rpc_fraghdr marker, unsigned int *sentp)
{
- struct xdr_buf *xbufp = &rqstp->rq_res;
- int sent;
- __be32 reclen;
+ const struct kvec *head = xdr->head;
+ const struct kvec *tail = xdr->tail;
+ struct kvec rm = {
+ .iov_base = &marker,
+ .iov_len = sizeof(marker),
+ };
+ struct msghdr msg = {
+ .msg_flags = 0,
+ };
+ int ret;
+
+ *sentp = 0;
+ ret = xdr_alloc_bvec(xdr, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+
+ ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ if (ret != rm.iov_len)
+ return -EAGAIN;
- /* Set up the first element of the reply kvec.
- * Any other kvecs that may be in use have been taken
- * care of by the server implementation itself.
- */
- reclen = htonl(0x80000000|((xbufp->len ) - 4));
- memcpy(xbufp->head[0].iov_base, &reclen, 4);
-
- sent = svc_sendto(rqstp, &rqstp->rq_res);
- if (sent != xbufp->len) {
- printk(KERN_NOTICE
- "rpc-srv/tcp: %s: %s %d when sending %d bytes "
- "- shutting down socket\n",
- rqstp->rq_xprt->xpt_server->sv_name,
- (sent<0)?"got error":"sent only",
- sent, xbufp->len);
- set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
- svc_xprt_enqueue(rqstp->rq_xprt);
- sent = -EAGAIN;
+ ret = svc_tcp_send_kvec(sock, head, 0);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ if (ret != head->iov_len)
+ goto out;
+
+ if (xdr->page_len) {
+ unsigned int offset, len, remaining;
+ struct bio_vec *bvec;
+
+ bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT);
+ offset = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining > 0) {
+ len = min(remaining, bvec->bv_len - offset);
+ ret = kernel_sendpage(sock, bvec->bv_page,
+ bvec->bv_offset + offset,
+ len, 0);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ if (ret != len)
+ goto out;
+ remaining -= len;
+ offset = 0;
+ bvec++;
+ }
+ }
+
+ if (tail->iov_len) {
+ ret = svc_tcp_send_kvec(sock, tail, 0);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
}
+
+out:
+ return 0;
+}
+
+/**
+ * svc_tcp_sendto - Send out a reply on a TCP socket
+ * @rqstp: completed svc_rqst
+ *
+ * xpt_mutex ensures @rqstp's whole message is written to the socket
+ * without interruption.
+ *
+ * Returns the number of bytes sent, or a negative errno.
+ */
+static int svc_tcp_sendto(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct xdr_buf *xdr = &rqstp->rq_res;
+ rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
+ (u32)xdr->len);
+ unsigned int sent;
+ int err;
+
+ svc_tcp_release_rqst(rqstp);
+
+ atomic_inc(&svsk->sk_sendqlen);
+ mutex_lock(&xprt->xpt_mutex);
+ if (svc_xprt_is_dead(xprt))
+ goto out_notconn;
+ tcp_sock_set_cork(svsk->sk_sk, true);
+ err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent);
+ xdr_free_bvec(xdr);
+ trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent);
+ if (err < 0 || sent != (xdr->len + sizeof(marker)))
+ goto out_close;
+ if (atomic_dec_and_test(&svsk->sk_sendqlen))
+ tcp_sock_set_cork(svsk->sk_sk, false);
+ mutex_unlock(&xprt->xpt_mutex);
return sent;
+
+out_notconn:
+ atomic_dec(&svsk->sk_sendqlen);
+ mutex_unlock(&xprt->xpt_mutex);
+ return -ENOTCONN;
+out_close:
+ pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n",
+ xprt->xpt_server->sv_name,
+ (err < 0) ? "got error" : "sent",
+ (err < 0) ? err : sent, xdr->len);
+ svc_xprt_deferred_close(xprt);
+ atomic_dec(&svsk->sk_sendqlen);
+ mutex_unlock(&xprt->xpt_mutex);
+ return -EAGAIN;
}
static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
@@ -1171,7 +1203,8 @@ static const struct svc_xprt_ops svc_tcp_ops = {
.xpo_create = svc_tcp_create,
.xpo_recvfrom = svc_tcp_recvfrom,
.xpo_sendto = svc_tcp_sendto,
- .xpo_release_rqst = svc_release_skb,
+ .xpo_result_payload = svc_sock_result_payload,
+ .xpo_release_rqst = svc_tcp_release_rqst,
.xpo_detach = svc_tcp_sock_detach,
.xpo_free = svc_sock_free,
.xpo_has_wspace = svc_tcp_has_wspace,
@@ -1209,23 +1242,21 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
if (sk->sk_state == TCP_LISTEN) {
- dprintk("setting up TCP socket for listening\n");
strcpy(svsk->sk_xprt.xpt_remotebuf, "listener");
set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
sk->sk_data_ready = svc_tcp_listen_data_ready;
set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
} else {
- dprintk("setting up TCP socket for reading\n");
sk->sk_state_change = svc_tcp_state_change;
sk->sk_data_ready = svc_data_ready;
sk->sk_write_space = svc_write_space;
- svsk->sk_reclen = 0;
+ svsk->sk_marker = xdr_zero;
svsk->sk_tcplen = 0;
svsk->sk_datalen = 0;
memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
- tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
+ tcp_sock_set_nodelay(sk);
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
switch (sk->sk_state) {
@@ -1233,7 +1264,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
case TCP_ESTABLISHED:
break;
default:
- set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_deferred_close(&svsk->sk_xprt);
}
}
}
@@ -1265,7 +1296,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
int err = 0;
- dprintk("svc: svc_setup_socket %p\n", sock);
svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
if (!svsk)
return ERR_PTR(-ENOMEM);
@@ -1302,12 +1332,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
else
svc_tcp_init(svsk, serv);
- dprintk("svc: svc_setup_socket created %p (inet %p), "
- "listen %d close %d\n",
- svsk, svsk->sk_sk,
- test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags),
- test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
-
+ trace_svcsock_new_socket(sock);
return svsk;
}
@@ -1399,12 +1424,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
struct sockaddr *newsin = (struct sockaddr *)&addr;
int newlen;
int family;
- int val;
- RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
-
- dprintk("svc: svc_create_socket(%s, %d, %s)\n",
- serv->sv_program->pg_name, protocol,
- __svc_print_addr(sin, buf, sizeof(buf)));
if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
printk(KERN_WARNING "svc: only UDP and TCP "
@@ -1435,11 +1454,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
* getting requests from IPv4 remotes. Those should
* be shunted to a PF_INET listener via rpcbind.
*/
- val = 1;
if (family == PF_INET6)
- kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
- (char *)&val, sizeof(val));
-
+ ip6_sock_set_v6only(sock->sk);
if (type == SOCK_STREAM)
sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */
error = kernel_bind(sock, sin, len);
@@ -1464,7 +1480,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
return (struct svc_xprt *)svsk;
bummer:
- dprintk("svc: svc_create_socket error = %d\n", -error);
sock_release(sock);
return ERR_PTR(error);
}
@@ -1478,8 +1493,6 @@ static void svc_sock_detach(struct svc_xprt *xprt)
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
struct sock *sk = svsk->sk_sk;
- dprintk("svc: svc_sock_detach(%p)\n", svsk);
-
/* put back the old socket callbacks */
lock_sock(sk);
sk->sk_state_change = svsk->sk_ostate;
@@ -1496,8 +1509,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
{
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
- dprintk("svc: svc_tcp_sock_detach(%p)\n", svsk);
-
svc_sock_detach(xprt);
if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
@@ -1512,7 +1523,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
static void svc_sock_free(struct svc_xprt *xprt)
{
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
- dprintk("svc: svc_sock_free(%p)\n", svsk);
if (svsk->sk_sock->file)
sockfd_put(svsk->sk_sock);
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index d75f17b56f0e..3aad6ef18504 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -60,25 +60,32 @@ rpc_unregister_sysctl(void)
}
static int proc_do_xprt(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char tmpbuf[256];
- size_t len;
+ ssize_t len;
- if ((*ppos && !write) || !*lenp) {
+ if (write || *ppos) {
*lenp = 0;
return 0;
}
len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
- return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
+ len = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
+
+ if (len < 0) {
+ *lenp = 0;
+ return -EINVAL;
+ }
+ *lenp = len;
+ return 0;
}
static int
-proc_dodebug(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp,
+ loff_t *ppos)
{
- char tmpbuf[20], c, *s = NULL;
- char __user *p;
+ char tmpbuf[20], *s = NULL;
+ char *p;
unsigned int value;
size_t left, len;
@@ -90,18 +97,17 @@ proc_dodebug(struct ctl_table *table, int write,
left = *lenp;
if (write) {
- if (!access_ok(buffer, left))
- return -EFAULT;
p = buffer;
- while (left && __get_user(c, p) >= 0 && isspace(c))
- left--, p++;
+ while (left && isspace(*p)) {
+ left--;
+ p++;
+ }
if (!left)
goto done;
if (left > sizeof(tmpbuf) - 1)
return -EINVAL;
- if (copy_from_user(tmpbuf, p, left))
- return -EFAULT;
+ memcpy(tmpbuf, p, left);
tmpbuf[left] = '\0';
value = simple_strtol(tmpbuf, &s, 0);
@@ -109,8 +115,10 @@ proc_dodebug(struct ctl_table *table, int write,
left -= (s - tmpbuf);
if (left && !isspace(*s))
return -EINVAL;
- while (left && isspace(*s))
- left--, s++;
+ while (left && isspace(*s)) {
+ left--;
+ s++;
+ }
} else
left = 0;
*(unsigned int *) table->data = value;
@@ -121,11 +129,9 @@ proc_dodebug(struct ctl_table *table, int write,
len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data);
if (len > left)
len = left;
- if (copy_to_user(buffer, tmpbuf, len))
- return -EFAULT;
+ memcpy(buffer, tmpbuf, len);
if ((left -= len) > 0) {
- if (put_user('\n', (char __user *)buffer + len))
- return -EFAULT;
+ *((char *)buffer + len) = '\n';
left--;
}
}
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
new file mode 100644
index 000000000000..c1f559892ae8
--- /dev/null
+++ b/net/sunrpc/sysfs.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#include <linux/sunrpc/clnt.h>
+#include <linux/kobject.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/xprtsock.h>
+
+#include "sysfs.h"
+
+struct xprt_addr {
+ const char *addr;
+ struct rcu_head rcu;
+};
+
+static void free_xprt_addr(struct rcu_head *head)
+{
+ struct xprt_addr *addr = container_of(head, struct xprt_addr, rcu);
+
+ kfree(addr->addr);
+ kfree(addr);
+}
+
+static struct kset *rpc_sunrpc_kset;
+static struct kobject *rpc_sunrpc_client_kobj, *rpc_sunrpc_xprt_switch_kobj;
+
+static void rpc_sysfs_object_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
+static const struct kobj_ns_type_operations *
+rpc_sysfs_object_child_ns_type(struct kobject *kobj)
+{
+ return &net_ns_type_operations;
+}
+
+static struct kobj_type rpc_sysfs_object_type = {
+ .release = rpc_sysfs_object_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .child_ns_type = rpc_sysfs_object_child_ns_type,
+};
+
+static struct kobject *rpc_sysfs_object_alloc(const char *name,
+ struct kset *kset,
+ struct kobject *parent)
+{
+ struct kobject *kobj;
+
+ kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+ if (kobj) {
+ kobj->kset = kset;
+ if (kobject_init_and_add(kobj, &rpc_sysfs_object_type,
+ parent, "%s", name) == 0)
+ return kobj;
+ kobject_put(kobj);
+ }
+ return NULL;
+}
+
+static inline struct rpc_xprt *
+rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj)
+{
+ struct rpc_sysfs_xprt *x = container_of(kobj,
+ struct rpc_sysfs_xprt, kobject);
+
+ return xprt_get(x->xprt);
+}
+
+static inline struct rpc_xprt_switch *
+rpc_sysfs_xprt_kobj_get_xprt_switch(struct kobject *kobj)
+{
+ struct rpc_sysfs_xprt *x = container_of(kobj,
+ struct rpc_sysfs_xprt, kobject);
+
+ return xprt_switch_get(x->xprt_switch);
+}
+
+static inline struct rpc_xprt_switch *
+rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj)
+{
+ struct rpc_sysfs_xprt_switch *x = container_of(kobj,
+ struct rpc_sysfs_xprt_switch, kobject);
+
+ return xprt_switch_get(x->xprt_switch);
+}
+
+static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+ ssize_t ret;
+
+ if (!xprt) {
+ ret = sprintf(buf, "<closed>\n");
+ goto out;
+ }
+ ret = sprintf(buf, "%s\n", xprt->address_strings[RPC_DISPLAY_ADDR]);
+ xprt_put(xprt);
+out:
+ return ret;
+}
+
+static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+ size_t buflen = PAGE_SIZE;
+ ssize_t ret;
+
+ if (!xprt || !xprt_connected(xprt)) {
+ ret = sprintf(buf, "<closed>\n");
+ } else if (xprt->ops->get_srcaddr) {
+ ret = xprt->ops->get_srcaddr(xprt, buf, buflen);
+ if (ret > 0) {
+ if (ret < buflen - 1) {
+ buf[ret] = '\n';
+ ret++;
+ buf[ret] = '\0';
+ }
+ } else
+ ret = sprintf(buf, "<closed>\n");
+ } else
+ ret = sprintf(buf, "<not a socket>\n");
+ xprt_put(xprt);
+ return ret;
+}
+
+static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+ unsigned short srcport = 0;
+ size_t buflen = PAGE_SIZE;
+ ssize_t ret;
+
+ if (!xprt || !xprt_connected(xprt)) {
+ ret = sprintf(buf, "<closed>\n");
+ goto out;
+ }
+
+ if (xprt->ops->get_srcport)
+ srcport = xprt->ops->get_srcport(xprt);
+
+ ret = snprintf(buf, buflen,
+ "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n"
+ "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n"
+ "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n"
+ "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n"
+ "tasks_queuelen=%ld\ndst_port=%s\n",
+ xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs,
+ xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen,
+ xprt->sending.qlen, xprt->pending.qlen,
+ xprt->backlog.qlen, xprt->main, srcport,
+ atomic_long_read(&xprt->queuelen),
+ xprt->address_strings[RPC_DISPLAY_PORT]);
+out:
+ xprt_put(xprt);
+ return ret;
+}
+
+static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+ ssize_t ret;
+ int locked, connected, connecting, close_wait, bound, binding,
+ closing, congested, cwnd_wait, write_space, offline, remove;
+
+ if (!(xprt && xprt->state)) {
+ ret = sprintf(buf, "state=CLOSED\n");
+ } else {
+ locked = test_bit(XPRT_LOCKED, &xprt->state);
+ connected = test_bit(XPRT_CONNECTED, &xprt->state);
+ connecting = test_bit(XPRT_CONNECTING, &xprt->state);
+ close_wait = test_bit(XPRT_CLOSE_WAIT, &xprt->state);
+ bound = test_bit(XPRT_BOUND, &xprt->state);
+ binding = test_bit(XPRT_BINDING, &xprt->state);
+ closing = test_bit(XPRT_CLOSING, &xprt->state);
+ congested = test_bit(XPRT_CONGESTED, &xprt->state);
+ cwnd_wait = test_bit(XPRT_CWND_WAIT, &xprt->state);
+ write_space = test_bit(XPRT_WRITE_SPACE, &xprt->state);
+ offline = test_bit(XPRT_OFFLINE, &xprt->state);
+ remove = test_bit(XPRT_REMOVE, &xprt->state);
+
+ ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s %s\n",
+ locked ? "LOCKED" : "",
+ connected ? "CONNECTED" : "",
+ connecting ? "CONNECTING" : "",
+ close_wait ? "CLOSE_WAIT" : "",
+ bound ? "BOUND" : "",
+ binding ? "BOUNDING" : "",
+ closing ? "CLOSING" : "",
+ congested ? "CONGESTED" : "",
+ cwnd_wait ? "CWND_WAIT" : "",
+ write_space ? "WRITE_SPACE" : "",
+ offline ? "OFFLINE" : "",
+ remove ? "REMOVE" : "");
+ }
+
+ xprt_put(xprt);
+ return ret;
+}
+
+static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct rpc_xprt_switch *xprt_switch =
+ rpc_sysfs_xprt_switch_kobj_get_xprt(kobj);
+ ssize_t ret;
+
+ if (!xprt_switch)
+ return 0;
+ ret = sprintf(buf, "num_xprts=%u\nnum_active=%u\n"
+ "num_unique_destaddr=%u\nqueue_len=%ld\n",
+ xprt_switch->xps_nxprts, xprt_switch->xps_nactive,
+ xprt_switch->xps_nunique_destaddr_xprts,
+ atomic_long_read(&xprt_switch->xps_queuelen));
+ xprt_switch_put(xprt_switch);
+ return ret;
+}
+
+static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+ struct sockaddr *saddr;
+ char *dst_addr;
+ int port;
+ struct xprt_addr *saved_addr;
+ size_t buf_len;
+
+ if (!xprt)
+ return 0;
+ if (!(xprt->xprt_class->ident == XPRT_TRANSPORT_TCP ||
+ xprt->xprt_class->ident == XPRT_TRANSPORT_RDMA)) {
+ xprt_put(xprt);
+ return -EOPNOTSUPP;
+ }
+
+ if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) {
+ count = -EINTR;
+ goto out_put;
+ }
+ saddr = (struct sockaddr *)&xprt->addr;
+ port = rpc_get_port(saddr);
+
+ /* buf_len is the len until the first occurence of either
+ * '\n' or '\0'
+ */
+ buf_len = strcspn(buf, "\n");
+
+ dst_addr = kstrndup(buf, buf_len, GFP_KERNEL);
+ if (!dst_addr)
+ goto out_err;
+ saved_addr = kzalloc(sizeof(*saved_addr), GFP_KERNEL);
+ if (!saved_addr)
+ goto out_err_free;
+ saved_addr->addr =
+ rcu_dereference_raw(xprt->address_strings[RPC_DISPLAY_ADDR]);
+ rcu_assign_pointer(xprt->address_strings[RPC_DISPLAY_ADDR], dst_addr);
+ call_rcu(&saved_addr->rcu, free_xprt_addr);
+ xprt->addrlen = rpc_pton(xprt->xprt_net, buf, buf_len, saddr,
+ sizeof(*saddr));
+ rpc_set_port(saddr, port);
+
+ xprt_force_disconnect(xprt);
+out:
+ xprt_release_write(xprt, NULL);
+out_put:
+ xprt_put(xprt);
+ return count;
+out_err_free:
+ kfree(dst_addr);
+out_err:
+ count = -ENOMEM;
+ goto out;
+}
+
+static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+ int offline = 0, online = 0, remove = 0;
+ struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj);
+
+ if (!xprt || !xps) {
+ count = 0;
+ goto out_put;
+ }
+
+ if (!strncmp(buf, "offline", 7))
+ offline = 1;
+ else if (!strncmp(buf, "online", 6))
+ online = 1;
+ else if (!strncmp(buf, "remove", 6))
+ remove = 1;
+ else {
+ count = -EINVAL;
+ goto out_put;
+ }
+
+ if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) {
+ count = -EINTR;
+ goto out_put;
+ }
+ if (xprt->main) {
+ count = -EINVAL;
+ goto release_tasks;
+ }
+ if (offline) {
+ xprt_set_offline_locked(xprt, xps);
+ } else if (online) {
+ xprt_set_online_locked(xprt, xps);
+ } else if (remove) {
+ if (test_bit(XPRT_OFFLINE, &xprt->state))
+ xprt_delete_locked(xprt, xps);
+ else
+ count = -EINVAL;
+ }
+
+release_tasks:
+ xprt_release_write(xprt, NULL);
+out_put:
+ xprt_put(xprt);
+ xprt_switch_put(xps);
+ return count;
+}
+
+int rpc_sysfs_init(void)
+{
+ rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj);
+ if (!rpc_sunrpc_kset)
+ return -ENOMEM;
+ rpc_sunrpc_client_kobj =
+ rpc_sysfs_object_alloc("rpc-clients", rpc_sunrpc_kset, NULL);
+ if (!rpc_sunrpc_client_kobj)
+ goto err_client;
+ rpc_sunrpc_xprt_switch_kobj =
+ rpc_sysfs_object_alloc("xprt-switches", rpc_sunrpc_kset, NULL);
+ if (!rpc_sunrpc_xprt_switch_kobj)
+ goto err_switch;
+ return 0;
+err_switch:
+ kobject_put(rpc_sunrpc_client_kobj);
+ rpc_sunrpc_client_kobj = NULL;
+err_client:
+ kset_unregister(rpc_sunrpc_kset);
+ rpc_sunrpc_kset = NULL;
+ return -ENOMEM;
+}
+
+static void rpc_sysfs_client_release(struct kobject *kobj)
+{
+ struct rpc_sysfs_client *c;
+
+ c = container_of(kobj, struct rpc_sysfs_client, kobject);
+ kfree(c);
+}
+
+static void rpc_sysfs_xprt_switch_release(struct kobject *kobj)
+{
+ struct rpc_sysfs_xprt_switch *xprt_switch;
+
+ xprt_switch = container_of(kobj, struct rpc_sysfs_xprt_switch, kobject);
+ kfree(xprt_switch);
+}
+
+static void rpc_sysfs_xprt_release(struct kobject *kobj)
+{
+ struct rpc_sysfs_xprt *xprt;
+
+ xprt = container_of(kobj, struct rpc_sysfs_xprt, kobject);
+ kfree(xprt);
+}
+
+static const void *rpc_sysfs_client_namespace(struct kobject *kobj)
+{
+ return container_of(kobj, struct rpc_sysfs_client, kobject)->net;
+}
+
+static const void *rpc_sysfs_xprt_switch_namespace(struct kobject *kobj)
+{
+ return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net;
+}
+
+static const void *rpc_sysfs_xprt_namespace(struct kobject *kobj)
+{
+ return container_of(kobj, struct rpc_sysfs_xprt,
+ kobject)->xprt->xprt_net;
+}
+
+static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr,
+ 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store);
+
+static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr,
+ 0644, rpc_sysfs_xprt_srcaddr_show, NULL);
+
+static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info,
+ 0444, rpc_sysfs_xprt_info_show, NULL);
+
+static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state,
+ 0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change);
+
+static struct attribute *rpc_sysfs_xprt_attrs[] = {
+ &rpc_sysfs_xprt_dstaddr.attr,
+ &rpc_sysfs_xprt_srcaddr.attr,
+ &rpc_sysfs_xprt_info.attr,
+ &rpc_sysfs_xprt_change_state.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(rpc_sysfs_xprt);
+
+static struct kobj_attribute rpc_sysfs_xprt_switch_info =
+ __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL);
+
+static struct attribute *rpc_sysfs_xprt_switch_attrs[] = {
+ &rpc_sysfs_xprt_switch_info.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(rpc_sysfs_xprt_switch);
+
+static struct kobj_type rpc_sysfs_client_type = {
+ .release = rpc_sysfs_client_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .namespace = rpc_sysfs_client_namespace,
+};
+
+static struct kobj_type rpc_sysfs_xprt_switch_type = {
+ .release = rpc_sysfs_xprt_switch_release,
+ .default_groups = rpc_sysfs_xprt_switch_groups,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .namespace = rpc_sysfs_xprt_switch_namespace,
+};
+
+static struct kobj_type rpc_sysfs_xprt_type = {
+ .release = rpc_sysfs_xprt_release,
+ .default_groups = rpc_sysfs_xprt_groups,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .namespace = rpc_sysfs_xprt_namespace,
+};
+
+void rpc_sysfs_exit(void)
+{
+ kobject_put(rpc_sunrpc_client_kobj);
+ kobject_put(rpc_sunrpc_xprt_switch_kobj);
+ kset_unregister(rpc_sunrpc_kset);
+}
+
+static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent,
+ struct net *net,
+ int clid)
+{
+ struct rpc_sysfs_client *p;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (p) {
+ p->net = net;
+ p->kobject.kset = rpc_sunrpc_kset;
+ if (kobject_init_and_add(&p->kobject, &rpc_sysfs_client_type,
+ parent, "clnt-%d", clid) == 0)
+ return p;
+ kobject_put(&p->kobject);
+ }
+ return NULL;
+}
+
+static struct rpc_sysfs_xprt_switch *
+rpc_sysfs_xprt_switch_alloc(struct kobject *parent,
+ struct rpc_xprt_switch *xprt_switch,
+ struct net *net,
+ gfp_t gfp_flags)
+{
+ struct rpc_sysfs_xprt_switch *p;
+
+ p = kzalloc(sizeof(*p), gfp_flags);
+ if (p) {
+ p->net = net;
+ p->kobject.kset = rpc_sunrpc_kset;
+ if (kobject_init_and_add(&p->kobject,
+ &rpc_sysfs_xprt_switch_type,
+ parent, "switch-%d",
+ xprt_switch->xps_id) == 0)
+ return p;
+ kobject_put(&p->kobject);
+ }
+ return NULL;
+}
+
+static struct rpc_sysfs_xprt *rpc_sysfs_xprt_alloc(struct kobject *parent,
+ struct rpc_xprt *xprt,
+ gfp_t gfp_flags)
+{
+ struct rpc_sysfs_xprt *p;
+
+ p = kzalloc(sizeof(*p), gfp_flags);
+ if (!p)
+ goto out;
+ p->kobject.kset = rpc_sunrpc_kset;
+ if (kobject_init_and_add(&p->kobject, &rpc_sysfs_xprt_type,
+ parent, "xprt-%d-%s", xprt->id,
+ xprt->address_strings[RPC_DISPLAY_PROTO]) == 0)
+ return p;
+ kobject_put(&p->kobject);
+out:
+ return NULL;
+}
+
+void rpc_sysfs_client_setup(struct rpc_clnt *clnt,
+ struct rpc_xprt_switch *xprt_switch,
+ struct net *net)
+{
+ struct rpc_sysfs_client *rpc_client;
+ struct rpc_sysfs_xprt_switch *xswitch =
+ (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs;
+
+ if (!xswitch)
+ return;
+
+ rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj,
+ net, clnt->cl_clid);
+ if (rpc_client) {
+ char name[] = "switch";
+ int ret;
+
+ clnt->cl_sysfs = rpc_client;
+ rpc_client->clnt = clnt;
+ rpc_client->xprt_switch = xprt_switch;
+ kobject_uevent(&rpc_client->kobject, KOBJ_ADD);
+ ret = sysfs_create_link_nowarn(&rpc_client->kobject,
+ &xswitch->kobject, name);
+ if (ret)
+ pr_warn("can't create link to %s in sysfs (%d)\n",
+ name, ret);
+ }
+}
+
+void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch,
+ struct rpc_xprt *xprt,
+ gfp_t gfp_flags)
+{
+ struct rpc_sysfs_xprt_switch *rpc_xprt_switch;
+ struct net *net;
+
+ if (xprt_switch->xps_net)
+ net = xprt_switch->xps_net;
+ else
+ net = xprt->xprt_net;
+ rpc_xprt_switch =
+ rpc_sysfs_xprt_switch_alloc(rpc_sunrpc_xprt_switch_kobj,
+ xprt_switch, net, gfp_flags);
+ if (rpc_xprt_switch) {
+ xprt_switch->xps_sysfs = rpc_xprt_switch;
+ rpc_xprt_switch->xprt_switch = xprt_switch;
+ rpc_xprt_switch->xprt = xprt;
+ kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_ADD);
+ } else {
+ xprt_switch->xps_sysfs = NULL;
+ }
+}
+
+void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch,
+ struct rpc_xprt *xprt,
+ gfp_t gfp_flags)
+{
+ struct rpc_sysfs_xprt *rpc_xprt;
+ struct rpc_sysfs_xprt_switch *switch_obj =
+ (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs;
+
+ if (!switch_obj)
+ return;
+
+ rpc_xprt = rpc_sysfs_xprt_alloc(&switch_obj->kobject, xprt, gfp_flags);
+ if (rpc_xprt) {
+ xprt->xprt_sysfs = rpc_xprt;
+ rpc_xprt->xprt = xprt;
+ rpc_xprt->xprt_switch = xprt_switch;
+ kobject_uevent(&rpc_xprt->kobject, KOBJ_ADD);
+ }
+}
+
+void rpc_sysfs_client_destroy(struct rpc_clnt *clnt)
+{
+ struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs;
+
+ if (rpc_client) {
+ char name[] = "switch";
+
+ sysfs_remove_link(&rpc_client->kobject, name);
+ kobject_uevent(&rpc_client->kobject, KOBJ_REMOVE);
+ kobject_del(&rpc_client->kobject);
+ kobject_put(&rpc_client->kobject);
+ clnt->cl_sysfs = NULL;
+ }
+}
+
+void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt_switch)
+{
+ struct rpc_sysfs_xprt_switch *rpc_xprt_switch = xprt_switch->xps_sysfs;
+
+ if (rpc_xprt_switch) {
+ kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_REMOVE);
+ kobject_del(&rpc_xprt_switch->kobject);
+ kobject_put(&rpc_xprt_switch->kobject);
+ xprt_switch->xps_sysfs = NULL;
+ }
+}
+
+void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt)
+{
+ struct rpc_sysfs_xprt *rpc_xprt = xprt->xprt_sysfs;
+
+ if (rpc_xprt) {
+ kobject_uevent(&rpc_xprt->kobject, KOBJ_REMOVE);
+ kobject_del(&rpc_xprt->kobject);
+ kobject_put(&rpc_xprt->kobject);
+ xprt->xprt_sysfs = NULL;
+ }
+}
diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h
new file mode 100644
index 000000000000..6620cebd1037
--- /dev/null
+++ b/net/sunrpc/sysfs.h
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#ifndef __SUNRPC_SYSFS_H
+#define __SUNRPC_SYSFS_H
+
+struct rpc_sysfs_client {
+ struct kobject kobject;
+ struct net *net;
+ struct rpc_clnt *clnt;
+ struct rpc_xprt_switch *xprt_switch;
+};
+
+struct rpc_sysfs_xprt_switch {
+ struct kobject kobject;
+ struct net *net;
+ struct rpc_xprt_switch *xprt_switch;
+ struct rpc_xprt *xprt;
+};
+
+struct rpc_sysfs_xprt {
+ struct kobject kobject;
+ struct rpc_xprt *xprt;
+ struct rpc_xprt_switch *xprt_switch;
+};
+
+int rpc_sysfs_init(void);
+void rpc_sysfs_exit(void);
+
+void rpc_sysfs_client_setup(struct rpc_clnt *clnt,
+ struct rpc_xprt_switch *xprt_switch,
+ struct net *net);
+void rpc_sysfs_client_destroy(struct rpc_clnt *clnt);
+void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch,
+ struct rpc_xprt *xprt, gfp_t gfp_flags);
+void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt);
+void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch,
+ struct rpc_xprt *xprt, gfp_t gfp_flags);
+void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt);
+
+#endif
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index e5497dc2475b..336a7c7833e4 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -19,6 +19,9 @@
#include <linux/bvec.h>
#include <trace/events/sunrpc.h>
+static void _copy_to_pages(struct page **, size_t, const char *, size_t);
+
+
/*
* XDR functions for basic NFS types
*/
@@ -120,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace);
* @len: length of string, in bytes
*
*/
-void
-xdr_terminate_string(struct xdr_buf *buf, const u32 len)
+void xdr_terminate_string(const struct xdr_buf *buf, const u32 len)
{
char *kaddr;
@@ -131,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len)
}
EXPORT_SYMBOL_GPL(xdr_terminate_string);
-size_t
-xdr_buf_pagecount(struct xdr_buf *buf)
+size_t xdr_buf_pagecount(const struct xdr_buf *buf)
{
if (!buf->page_len)
return 0;
@@ -190,9 +191,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
tail->iov_base = buf + offset;
tail->iov_len = buflen - offset;
- if ((xdr->page_len & 3) == 0)
- tail->iov_len -= sizeof(__be32);
-
xdr->buflen += len;
}
EXPORT_SYMBOL_GPL(xdr_inline_pages);
@@ -202,6 +200,71 @@ EXPORT_SYMBOL_GPL(xdr_inline_pages);
*/
/**
+ * _shift_data_left_pages
+ * @pages: vector of pages containing both the source and dest memory area.
+ * @pgto_base: page vector address of destination
+ * @pgfrom_base: page vector address of source
+ * @len: number of bytes to copy
+ *
+ * Note: the addresses pgto_base and pgfrom_base are both calculated in
+ * the same way:
+ * if a memory area starts at byte 'base' in page 'pages[i]',
+ * then its address is given as (i << PAGE_CACHE_SHIFT) + base
+ * Alse note: pgto_base must be < pgfrom_base, but the memory areas
+ * they point to may overlap.
+ */
+static void
+_shift_data_left_pages(struct page **pages, size_t pgto_base,
+ size_t pgfrom_base, size_t len)
+{
+ struct page **pgfrom, **pgto;
+ char *vfrom, *vto;
+ size_t copy;
+
+ BUG_ON(pgfrom_base <= pgto_base);
+
+ if (!len)
+ return;
+
+ pgto = pages + (pgto_base >> PAGE_SHIFT);
+ pgfrom = pages + (pgfrom_base >> PAGE_SHIFT);
+
+ pgto_base &= ~PAGE_MASK;
+ pgfrom_base &= ~PAGE_MASK;
+
+ do {
+ if (pgto_base >= PAGE_SIZE) {
+ pgto_base = 0;
+ pgto++;
+ }
+ if (pgfrom_base >= PAGE_SIZE){
+ pgfrom_base = 0;
+ pgfrom++;
+ }
+
+ copy = len;
+ if (copy > (PAGE_SIZE - pgto_base))
+ copy = PAGE_SIZE - pgto_base;
+ if (copy > (PAGE_SIZE - pgfrom_base))
+ copy = PAGE_SIZE - pgfrom_base;
+
+ vto = kmap_atomic(*pgto);
+ if (*pgto != *pgfrom) {
+ vfrom = kmap_atomic(*pgfrom);
+ memcpy(vto + pgto_base, vfrom + pgfrom_base, copy);
+ kunmap_atomic(vfrom);
+ } else
+ memmove(vto + pgto_base, vto + pgfrom_base, copy);
+ flush_dcache_page(*pgto);
+ kunmap_atomic(vto);
+
+ pgto_base += copy;
+ pgfrom_base += copy;
+
+ } while ((len -= copy) != 0);
+}
+
+/**
* _shift_data_right_pages
* @pages: vector of pages containing both the source and dest memory area.
* @pgto_base: page vector address of destination
@@ -225,6 +288,9 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base,
BUG_ON(pgto_base <= pgfrom_base);
+ if (!len)
+ return;
+
pgto_base += len;
pgfrom_base += len;
@@ -283,6 +349,9 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
char *vto;
size_t copy;
+ if (!len)
+ return;
+
pgto = pages + (pgbase >> PAGE_SHIFT);
pgbase &= ~PAGE_MASK;
@@ -327,6 +396,9 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
char *vfrom;
size_t copy;
+ if (!len)
+ return;
+
pgfrom = pages + (pgbase >> PAGE_SHIFT);
pgbase &= ~PAGE_MASK;
@@ -350,148 +422,451 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
}
EXPORT_SYMBOL_GPL(_copy_from_pages);
+static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base,
+ unsigned int len)
+{
+ if (base >= iov->iov_len)
+ return;
+ if (len > iov->iov_len - base)
+ len = iov->iov_len - base;
+ memset(iov->iov_base + base, 0, len);
+}
+
/**
- * xdr_shrink_bufhead
+ * xdr_buf_pages_zero
* @buf: xdr_buf
- * @len: bytes to remove from buf->head[0]
- *
- * Shrinks XDR buffer's header kvec buf->head[0] by
- * 'len' bytes. The extra data is not lost, but is instead
- * moved into the inlined pages and/or the tail.
+ * @pgbase: beginning offset
+ * @len: length
*/
-static unsigned int
-xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
+static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase,
+ unsigned int len)
{
- struct kvec *head, *tail;
- size_t copy, offs;
- unsigned int pglen = buf->page_len;
- unsigned int result;
+ struct page **pages = buf->pages;
+ struct page **page;
+ char *vpage;
+ unsigned int zero;
- result = 0;
- tail = buf->tail;
- head = buf->head;
+ if (!len)
+ return;
+ if (pgbase >= buf->page_len) {
+ xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len);
+ return;
+ }
+ if (pgbase + len > buf->page_len) {
+ xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len);
+ len = buf->page_len - pgbase;
+ }
- WARN_ON_ONCE(len > head->iov_len);
- if (len > head->iov_len)
- len = head->iov_len;
-
- /* Shift the tail first */
- if (tail->iov_len != 0) {
- if (tail->iov_len > len) {
- copy = tail->iov_len - len;
- memmove((char *)tail->iov_base + len,
- tail->iov_base, copy);
- result += copy;
- }
- /* Copy from the inlined pages into the tail */
- copy = len;
- if (copy > pglen)
- copy = pglen;
- offs = len - copy;
- if (offs >= tail->iov_len)
- copy = 0;
- else if (copy > tail->iov_len - offs)
- copy = tail->iov_len - offs;
- if (copy != 0) {
- _copy_from_pages((char *)tail->iov_base + offs,
- buf->pages,
- buf->page_base + pglen + offs - len,
- copy);
- result += copy;
+ pgbase += buf->page_base;
+
+ page = pages + (pgbase >> PAGE_SHIFT);
+ pgbase &= ~PAGE_MASK;
+
+ do {
+ zero = PAGE_SIZE - pgbase;
+ if (zero > len)
+ zero = len;
+
+ vpage = kmap_atomic(*page);
+ memset(vpage + pgbase, 0, zero);
+ kunmap_atomic(vpage);
+
+ flush_dcache_page(*page);
+ pgbase = 0;
+ page++;
+
+ } while ((len -= zero) != 0);
+}
+
+static unsigned int xdr_buf_pages_fill_sparse(const struct xdr_buf *buf,
+ unsigned int buflen, gfp_t gfp)
+{
+ unsigned int i, npages, pagelen;
+
+ if (!(buf->flags & XDRBUF_SPARSE_PAGES))
+ return buflen;
+ if (buflen <= buf->head->iov_len)
+ return buflen;
+ pagelen = buflen - buf->head->iov_len;
+ if (pagelen > buf->page_len)
+ pagelen = buf->page_len;
+ npages = (pagelen + buf->page_base + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ for (i = 0; i < npages; i++) {
+ if (!buf->pages[i])
+ continue;
+ buf->pages[i] = alloc_page(gfp);
+ if (likely(buf->pages[i]))
+ continue;
+ buflen -= pagelen;
+ pagelen = i << PAGE_SHIFT;
+ if (pagelen > buf->page_base)
+ buflen += pagelen - buf->page_base;
+ break;
+ }
+ return buflen;
+}
+
+static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len)
+{
+ struct kvec *head = buf->head;
+ struct kvec *tail = buf->tail;
+ unsigned int sum = head->iov_len + buf->page_len + tail->iov_len;
+ unsigned int free_space, newlen;
+
+ if (sum > buf->len) {
+ free_space = min_t(unsigned int, sum - buf->len, len);
+ newlen = xdr_buf_pages_fill_sparse(buf, buf->len + free_space,
+ GFP_KERNEL);
+ free_space = newlen - buf->len;
+ buf->len = newlen;
+ len -= free_space;
+ if (!len)
+ return;
+ }
+
+ if (buf->buflen > sum) {
+ /* Expand the tail buffer */
+ free_space = min_t(unsigned int, buf->buflen - sum, len);
+ tail->iov_len += free_space;
+ buf->len += free_space;
+ }
+}
+
+static void xdr_buf_tail_copy_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+ unsigned int to = base + shift;
+
+ if (to >= tail->iov_len)
+ return;
+ if (len + to > tail->iov_len)
+ len = tail->iov_len - to;
+ memmove(tail->iov_base + to, tail->iov_base + base, len);
+}
+
+static void xdr_buf_pages_copy_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+ unsigned int to = base + shift;
+ unsigned int pglen = 0;
+ unsigned int talen = 0, tato = 0;
+
+ if (base >= buf->page_len)
+ return;
+ if (len > buf->page_len - base)
+ len = buf->page_len - base;
+ if (to >= buf->page_len) {
+ tato = to - buf->page_len;
+ if (tail->iov_len >= len + tato)
+ talen = len;
+ else if (tail->iov_len > tato)
+ talen = tail->iov_len - tato;
+ } else if (len + to >= buf->page_len) {
+ pglen = buf->page_len - to;
+ talen = len - pglen;
+ if (talen > tail->iov_len)
+ talen = tail->iov_len;
+ } else
+ pglen = len;
+
+ _copy_from_pages(tail->iov_base + tato, buf->pages,
+ buf->page_base + base + pglen, talen);
+ _shift_data_right_pages(buf->pages, buf->page_base + to,
+ buf->page_base + base, pglen);
+}
+
+static void xdr_buf_head_copy_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *head = buf->head;
+ const struct kvec *tail = buf->tail;
+ unsigned int to = base + shift;
+ unsigned int pglen = 0, pgto = 0;
+ unsigned int talen = 0, tato = 0;
+
+ if (base >= head->iov_len)
+ return;
+ if (len > head->iov_len - base)
+ len = head->iov_len - base;
+ if (to >= buf->page_len + head->iov_len) {
+ tato = to - buf->page_len - head->iov_len;
+ talen = len;
+ } else if (to >= head->iov_len) {
+ pgto = to - head->iov_len;
+ pglen = len;
+ if (pgto + pglen > buf->page_len) {
+ talen = pgto + pglen - buf->page_len;
+ pglen -= talen;
}
- /* Do we also need to copy data from the head into the tail ? */
- if (len > pglen) {
- offs = copy = len - pglen;
- if (copy > tail->iov_len)
- copy = tail->iov_len;
- memcpy(tail->iov_base,
- (char *)head->iov_base +
- head->iov_len - offs,
- copy);
- result += copy;
+ } else {
+ pglen = len - to;
+ if (pglen > buf->page_len) {
+ talen = pglen - buf->page_len;
+ pglen = buf->page_len;
}
}
- /* Now handle pages */
- if (pglen != 0) {
- if (pglen > len)
- _shift_data_right_pages(buf->pages,
- buf->page_base + len,
- buf->page_base,
- pglen - len);
- copy = len;
- if (len > pglen)
- copy = pglen;
- _copy_to_pages(buf->pages, buf->page_base,
- (char *)head->iov_base + head->iov_len - len,
- copy);
- result += copy;
+
+ len -= talen;
+ base += len;
+ if (talen + tato > tail->iov_len)
+ talen = tail->iov_len > tato ? tail->iov_len - tato : 0;
+ memcpy(tail->iov_base + tato, head->iov_base + base, talen);
+
+ len -= pglen;
+ base -= pglen;
+ _copy_to_pages(buf->pages, buf->page_base + pgto, head->iov_base + base,
+ pglen);
+
+ base -= len;
+ memmove(head->iov_base + to, head->iov_base + base, len);
+}
+
+static void xdr_buf_tail_shift_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+
+ if (base >= tail->iov_len || !shift || !len)
+ return;
+ xdr_buf_tail_copy_right(buf, base, len, shift);
+}
+
+static void xdr_buf_pages_shift_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ if (!shift || !len)
+ return;
+ if (base >= buf->page_len) {
+ xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift);
+ return;
}
- head->iov_len -= len;
- buf->buflen -= len;
- /* Have we truncated the message? */
- if (buf->len > buf->buflen)
- buf->len = buf->buflen;
+ if (base + len > buf->page_len)
+ xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len,
+ shift);
+ xdr_buf_pages_copy_right(buf, base, len, shift);
+}
+
+static void xdr_buf_head_shift_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *head = buf->head;
- return result;
+ if (!shift)
+ return;
+ if (base >= head->iov_len) {
+ xdr_buf_pages_shift_right(buf, head->iov_len - base, len,
+ shift);
+ return;
+ }
+ if (base + len > head->iov_len)
+ xdr_buf_pages_shift_right(buf, 0, base + len - head->iov_len,
+ shift);
+ xdr_buf_head_copy_right(buf, base, len, shift);
+}
+
+static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base,
+ unsigned int len, unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+
+ if (base >= tail->iov_len)
+ return;
+ if (len > tail->iov_len - base)
+ len = tail->iov_len - base;
+ /* Shift data into head */
+ if (shift > buf->page_len + base) {
+ const struct kvec *head = buf->head;
+ unsigned int hdto =
+ head->iov_len + buf->page_len + base - shift;
+ unsigned int hdlen = len;
+
+ if (WARN_ONCE(shift > head->iov_len + buf->page_len + base,
+ "SUNRPC: Misaligned data.\n"))
+ return;
+ if (hdto + hdlen > head->iov_len)
+ hdlen = head->iov_len - hdto;
+ memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen);
+ base += hdlen;
+ len -= hdlen;
+ if (!len)
+ return;
+ }
+ /* Shift data into pages */
+ if (shift > base) {
+ unsigned int pgto = buf->page_len + base - shift;
+ unsigned int pglen = len;
+
+ if (pgto + pglen > buf->page_len)
+ pglen = buf->page_len - pgto;
+ _copy_to_pages(buf->pages, buf->page_base + pgto,
+ tail->iov_base + base, pglen);
+ base += pglen;
+ len -= pglen;
+ if (!len)
+ return;
+ }
+ memmove(tail->iov_base + base - shift, tail->iov_base + base, len);
+}
+
+static void xdr_buf_pages_copy_left(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ unsigned int pgto;
+
+ if (base >= buf->page_len)
+ return;
+ if (len > buf->page_len - base)
+ len = buf->page_len - base;
+ /* Shift data into head */
+ if (shift > base) {
+ const struct kvec *head = buf->head;
+ unsigned int hdto = head->iov_len + base - shift;
+ unsigned int hdlen = len;
+
+ if (WARN_ONCE(shift > head->iov_len + base,
+ "SUNRPC: Misaligned data.\n"))
+ return;
+ if (hdto + hdlen > head->iov_len)
+ hdlen = head->iov_len - hdto;
+ _copy_from_pages(head->iov_base + hdto, buf->pages,
+ buf->page_base + base, hdlen);
+ base += hdlen;
+ len -= hdlen;
+ if (!len)
+ return;
+ }
+ pgto = base - shift;
+ _shift_data_left_pages(buf->pages, buf->page_base + pgto,
+ buf->page_base + base, len);
+}
+
+static void xdr_buf_tail_shift_left(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ if (!shift || !len)
+ return;
+ xdr_buf_tail_copy_left(buf, base, len, shift);
+}
+
+static void xdr_buf_pages_shift_left(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ if (!shift || !len)
+ return;
+ if (base >= buf->page_len) {
+ xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift);
+ return;
+ }
+ xdr_buf_pages_copy_left(buf, base, len, shift);
+ len += base;
+ if (len <= buf->page_len)
+ return;
+ xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift);
+}
+
+static void xdr_buf_head_shift_left(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *head = buf->head;
+ unsigned int bytes;
+
+ if (!shift || !len)
+ return;
+
+ if (shift > base) {
+ bytes = (shift - base);
+ if (bytes >= len)
+ return;
+ base += bytes;
+ len -= bytes;
+ }
+
+ if (base < head->iov_len) {
+ bytes = min_t(unsigned int, len, head->iov_len - base);
+ memmove(head->iov_base + (base - shift),
+ head->iov_base + base, bytes);
+ base += bytes;
+ len -= bytes;
+ }
+ xdr_buf_pages_shift_left(buf, base - head->iov_len, len, shift);
+}
+
+/**
+ * xdr_shrink_bufhead
+ * @buf: xdr_buf
+ * @len: new length of buf->head[0]
+ *
+ * Shrinks XDR buffer's header kvec buf->head[0], setting it to
+ * 'len' bytes. The extra data is not lost, but is instead
+ * moved into the inlined pages and/or the tail.
+ */
+static unsigned int xdr_shrink_bufhead(struct xdr_buf *buf, unsigned int len)
+{
+ struct kvec *head = buf->head;
+ unsigned int shift, buflen = max(buf->len, len);
+
+ WARN_ON_ONCE(len > head->iov_len);
+ if (head->iov_len > buflen) {
+ buf->buflen -= head->iov_len - buflen;
+ head->iov_len = buflen;
+ }
+ if (len >= head->iov_len)
+ return 0;
+ shift = head->iov_len - len;
+ xdr_buf_try_expand(buf, shift);
+ xdr_buf_head_shift_right(buf, len, buflen - len, shift);
+ head->iov_len = len;
+ buf->buflen -= shift;
+ buf->len -= shift;
+ return shift;
}
/**
- * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes
+ * xdr_shrink_pagelen - shrinks buf->pages to @len bytes
* @buf: xdr_buf
- * @len: bytes to remove from buf->pages
+ * @len: new page buffer length
*
* The extra data is not lost, but is instead moved into buf->tail.
* Returns the actual number of bytes moved.
*/
-static unsigned int
-xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
+static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len)
{
- struct kvec *tail;
- size_t copy;
- unsigned int pglen = buf->page_len;
- unsigned int tailbuf_len;
- unsigned int result;
-
- result = 0;
- tail = buf->tail;
- if (len > buf->page_len)
- len = buf-> page_len;
- tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
-
- /* Shift the tail first */
- if (tailbuf_len != 0) {
- unsigned int free_space = tailbuf_len - tail->iov_len;
-
- if (len < free_space)
- free_space = len;
- tail->iov_len += free_space;
-
- copy = len;
- if (tail->iov_len > len) {
- char *p = (char *)tail->iov_base + len;
- memmove(p, tail->iov_base, tail->iov_len - len);
- result += tail->iov_len - len;
- } else
- copy = tail->iov_len;
- /* Copy from the inlined pages into the tail */
- _copy_from_pages((char *)tail->iov_base,
- buf->pages, buf->page_base + pglen - len,
- copy);
- result += copy;
+ unsigned int shift, buflen = buf->len - buf->head->iov_len;
+
+ WARN_ON_ONCE(len > buf->page_len);
+ if (buf->head->iov_len >= buf->len || len > buflen)
+ buflen = len;
+ if (buf->page_len > buflen) {
+ buf->buflen -= buf->page_len - buflen;
+ buf->page_len = buflen;
}
- buf->page_len -= len;
- buf->buflen -= len;
- /* Have we truncated the message? */
- if (buf->len > buf->buflen)
- buf->len = buf->buflen;
-
- return result;
+ if (len >= buf->page_len)
+ return 0;
+ shift = buf->page_len - len;
+ xdr_buf_try_expand(buf, shift);
+ xdr_buf_pages_shift_right(buf, len, buflen - len, shift);
+ buf->page_len = len;
+ buf->len -= shift;
+ buf->buflen -= shift;
+ return shift;
}
void
xdr_shift_buf(struct xdr_buf *buf, size_t len)
{
- xdr_shrink_bufhead(buf, len);
+ xdr_shrink_bufhead(buf, buf->head->iov_len - len);
}
EXPORT_SYMBOL_GPL(xdr_shift_buf);
@@ -505,6 +880,31 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr)
}
EXPORT_SYMBOL_GPL(xdr_stream_pos);
+static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos)
+{
+ unsigned int blen = xdr->buf->len;
+
+ xdr->nwords = blen > pos ? XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0;
+}
+
+static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos)
+{
+ xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len);
+}
+
+/**
+ * xdr_page_pos - Return the current offset from the start of the xdr pages
+ * @xdr: pointer to struct xdr_stream
+ */
+unsigned int xdr_page_pos(const struct xdr_stream *xdr)
+{
+ unsigned int pos = xdr_stream_pos(xdr);
+
+ WARN_ON(pos < xdr->buf->head[0].iov_len);
+ return pos - xdr->buf->head[0].iov_len;
+}
+EXPORT_SYMBOL_GPL(xdr_page_pos);
+
/**
* xdr_init_encode - Initialize a struct xdr_stream for sending data.
* @xdr: pointer to xdr_stream struct
@@ -525,7 +925,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
struct kvec *iov = buf->head;
int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
- xdr_set_scratch_buffer(xdr, NULL, 0);
+ xdr_reset_scratch_buffer(xdr);
BUG_ON(scratch_len < 0);
xdr->buf = buf;
xdr->iov = iov;
@@ -547,7 +947,29 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
EXPORT_SYMBOL_GPL(xdr_init_encode);
/**
- * xdr_commit_encode - Ensure all data is written to buffer
+ * xdr_init_encode_pages - Initialize an xdr_stream for encoding into pages
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to XDR buffer into which to encode data
+ * @pages: list of pages to decode into
+ * @rqst: pointer to controlling rpc_rqst, for debugging
+ *
+ */
+void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
+ struct page **pages, struct rpc_rqst *rqst)
+{
+ xdr_reset_scratch_buffer(xdr);
+
+ xdr->buf = buf;
+ xdr->page_ptr = pages;
+ xdr->iov = NULL;
+ xdr->p = page_address(*pages);
+ xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
+ xdr->rqst = rqst;
+}
+EXPORT_SYMBOL_GPL(xdr_init_encode_pages);
+
+/**
+ * __xdr_commit_encode - Ensure all data is written to buffer
* @xdr: pointer to xdr_stream
*
* We handle encoding across page boundaries by giving the caller a
@@ -559,26 +981,29 @@ EXPORT_SYMBOL_GPL(xdr_init_encode);
* required at the end of encoding, or any other time when the xdr_buf
* data might be read.
*/
-inline void xdr_commit_encode(struct xdr_stream *xdr)
+void __xdr_commit_encode(struct xdr_stream *xdr)
{
- int shift = xdr->scratch.iov_len;
+ size_t shift = xdr->scratch.iov_len;
void *page;
- if (shift == 0)
- return;
page = page_address(*xdr->page_ptr);
memcpy(xdr->scratch.iov_base, page, shift);
memmove(page, page + shift, (void *)xdr->p - page);
- xdr->scratch.iov_len = 0;
+ xdr_reset_scratch_buffer(xdr);
}
-EXPORT_SYMBOL_GPL(xdr_commit_encode);
+EXPORT_SYMBOL_GPL(__xdr_commit_encode);
-static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
- size_t nbytes)
+/*
+ * The buffer space to be reserved crosses the boundary between
+ * xdr->buf->head and xdr->buf->pages, or between two pages
+ * in xdr->buf->pages.
+ */
+static noinline __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
+ size_t nbytes)
{
- __be32 *p;
int space_left;
int frag1bytes, frag2bytes;
+ void *p;
if (nbytes > PAGE_SIZE)
goto out_overflow; /* Bigger buffers require special handling */
@@ -592,6 +1017,7 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
xdr->buf->page_len += frag1bytes;
xdr->page_ptr++;
xdr->iov = NULL;
+
/*
* If the last encode didn't end exactly on a page boundary, the
* next one will straddle boundaries. Encode into the next
@@ -599,16 +1025,20 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
* the "scratch" iov to track any temporarily unused fragment of
* space at the end of the previous buffer:
*/
- xdr->scratch.iov_base = xdr->p;
- xdr->scratch.iov_len = frag1bytes;
- p = page_address(*xdr->page_ptr);
+ xdr_set_scratch_buffer(xdr, xdr->p, frag1bytes);
+
/*
- * Note this is where the next encode will start after we've
- * shifted this one back:
+ * xdr->p is where the next encode will start after
+ * xdr_commit_encode() has shifted this one back:
*/
- xdr->p = (void *)p + frag2bytes;
+ p = page_address(*xdr->page_ptr);
+ xdr->p = p + frag2bytes;
space_left = xdr->buf->buflen - xdr->buf->len;
- xdr->end = (void *)p + min_t(int, space_left, PAGE_SIZE);
+ if (space_left - frag1bytes >= PAGE_SIZE)
+ xdr->end = p + PAGE_SIZE;
+ else
+ xdr->end = p + space_left - frag1bytes;
+
xdr->buf->page_len += frag2bytes;
xdr->buf->len += nbytes;
return p;
@@ -648,6 +1078,51 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
}
EXPORT_SYMBOL_GPL(xdr_reserve_space);
+
+/**
+ * xdr_reserve_space_vec - Reserves a large amount of buffer space for sending
+ * @xdr: pointer to xdr_stream
+ * @vec: pointer to a kvec array
+ * @nbytes: number of bytes to reserve
+ *
+ * Reserves enough buffer space to encode 'nbytes' of data and stores the
+ * pointers in 'vec'. The size argument passed to xdr_reserve_space() is
+ * determined based on the number of bytes remaining in the current page to
+ * avoid invalidating iov_base pointers when xdr_commit_encode() is called.
+ */
+int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes)
+{
+ int thislen;
+ int v = 0;
+ __be32 *p;
+
+ /*
+ * svcrdma requires every READ payload to start somewhere
+ * in xdr->pages.
+ */
+ if (xdr->iov == xdr->buf->head) {
+ xdr->iov = NULL;
+ xdr->end = xdr->p;
+ }
+
+ while (nbytes) {
+ thislen = xdr->buf->page_len % PAGE_SIZE;
+ thislen = min_t(size_t, nbytes, PAGE_SIZE - thislen);
+
+ p = xdr_reserve_space(xdr, thislen);
+ if (!p)
+ return -EIO;
+
+ vec[v].iov_base = p;
+ vec[v].iov_len = thislen;
+ v++;
+ nbytes -= thislen;
+ }
+
+ return v;
+}
+EXPORT_SYMBOL_GPL(xdr_reserve_space_vec);
+
/**
* xdr_truncate_encode - truncate an encode buffer
* @xdr: pointer to xdr_stream
@@ -658,7 +1133,7 @@ EXPORT_SYMBOL_GPL(xdr_reserve_space);
* head, tail, and page lengths are adjusted to correspond.
*
* If this means moving xdr->p to a different buffer, we assume that
- * that the end pointer should be set to the end of the current page,
+ * the end pointer should be set to the end of the current page,
* except in the case of the head buffer when we assume the head
* buffer's current length represents the end of the available buffer.
*
@@ -781,19 +1256,31 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b
}
EXPORT_SYMBOL_GPL(xdr_write_pages);
-static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
- unsigned int len)
+static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
+ unsigned int base, unsigned int len)
{
if (len > iov->iov_len)
len = iov->iov_len;
- xdr->p = (__be32*)iov->iov_base;
+ if (unlikely(base > len))
+ base = len;
+ xdr->p = (__be32*)(iov->iov_base + base);
xdr->end = (__be32*)(iov->iov_base + len);
xdr->iov = iov;
xdr->page_ptr = NULL;
+ return len - base;
}
-static int xdr_set_page_base(struct xdr_stream *xdr,
- unsigned int base, unsigned int len)
+static unsigned int xdr_set_tail_base(struct xdr_stream *xdr,
+ unsigned int base, unsigned int len)
+{
+ struct xdr_buf *buf = xdr->buf;
+
+ xdr_stream_set_pos(xdr, base + buf->page_len + buf->head->iov_len);
+ return xdr_set_iov(xdr, buf->tail, base, len);
+}
+
+static unsigned int xdr_set_page_base(struct xdr_stream *xdr,
+ unsigned int base, unsigned int len)
{
unsigned int pgnr;
unsigned int maxlen;
@@ -803,11 +1290,13 @@ static int xdr_set_page_base(struct xdr_stream *xdr,
maxlen = xdr->buf->page_len;
if (base >= maxlen)
- return -EINVAL;
- maxlen -= base;
+ return 0;
+ else
+ maxlen -= base;
if (len > maxlen)
len = maxlen;
+ xdr_stream_page_set_pos(xdr, base);
base += xdr->buf->page_base;
pgnr = base >> PAGE_SHIFT;
@@ -822,7 +1311,16 @@ static int xdr_set_page_base(struct xdr_stream *xdr,
pgend = PAGE_SIZE;
xdr->end = (__be32*)(kaddr + pgend);
xdr->iov = NULL;
- return 0;
+ return len;
+}
+
+static void xdr_set_page(struct xdr_stream *xdr, unsigned int base,
+ unsigned int len)
+{
+ if (xdr_set_page_base(xdr, base, len) == 0) {
+ base -= xdr->buf->page_len;
+ xdr_set_tail_base(xdr, base, len);
+ }
}
static void xdr_set_next_page(struct xdr_stream *xdr)
@@ -831,19 +1329,18 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT;
newbase -= xdr->buf->page_base;
-
- if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
+ if (newbase < xdr->buf->page_len)
+ xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr));
+ else
+ xdr_set_tail_base(xdr, 0, xdr_stream_remaining(xdr));
}
static bool xdr_set_next_buffer(struct xdr_stream *xdr)
{
if (xdr->page_ptr != NULL)
xdr_set_next_page(xdr);
- else if (xdr->iov == xdr->buf->head) {
- if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
- }
+ else if (xdr->iov == xdr->buf->head)
+ xdr_set_page(xdr, 0, xdr_stream_remaining(xdr));
return xdr->p != xdr->end;
}
@@ -858,15 +1355,11 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
struct rpc_rqst *rqst)
{
xdr->buf = buf;
- xdr->scratch.iov_base = NULL;
- xdr->scratch.iov_len = 0;
+ xdr_reset_scratch_buffer(xdr);
xdr->nwords = XDR_QUADLEN(buf->len);
- if (buf->head[0].iov_len != 0)
- xdr_set_iov(xdr, buf->head, buf->len);
- else if (buf->page_len != 0)
- xdr_set_page_base(xdr, 0, buf->len);
- else
- xdr_set_iov(xdr, buf->head, buf->len);
+ if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 &&
+ xdr_set_page_base(xdr, 0, buf->len) == 0)
+ xdr_set_iov(xdr, buf->tail, 0, buf->len);
if (p != NULL && p > xdr->p && xdr->end >= p) {
xdr->nwords -= p - xdr->p;
xdr->p = p;
@@ -907,24 +1400,6 @@ static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
return p;
}
-/**
- * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
- * @xdr: pointer to xdr_stream struct
- * @buf: pointer to an empty buffer
- * @buflen: size of 'buf'
- *
- * The scratch buffer is used when decoding from an array of pages.
- * If an xdr_inline_decode() call spans across page boundaries, then
- * we copy the data into the scratch buffer in order to allow linear
- * access.
- */
-void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen)
-{
- xdr->scratch.iov_base = buf;
- xdr->scratch.iov_len = buflen;
-}
-EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
-
static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
{
__be32 *p;
@@ -979,26 +1454,31 @@ out_overflow:
}
EXPORT_SYMBOL_GPL(xdr_inline_decode);
-static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
+static void xdr_realign_pages(struct xdr_stream *xdr)
{
struct xdr_buf *buf = xdr->buf;
- struct kvec *iov;
- unsigned int nwords = XDR_QUADLEN(len);
+ struct kvec *iov = buf->head;
unsigned int cur = xdr_stream_pos(xdr);
- unsigned int copied, offset;
-
- if (xdr->nwords == 0)
- return 0;
+ unsigned int copied;
/* Realign pages to current pointer position */
- iov = buf->head;
if (iov->iov_len > cur) {
- offset = iov->iov_len - cur;
- copied = xdr_shrink_bufhead(buf, offset);
- trace_rpc_xdr_alignment(xdr, offset, copied);
- xdr->nwords = XDR_QUADLEN(buf->len - cur);
+ copied = xdr_shrink_bufhead(buf, cur);
+ trace_rpc_xdr_alignment(xdr, cur, copied);
+ xdr_set_page(xdr, 0, buf->page_len);
}
+}
+
+static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
+{
+ struct xdr_buf *buf = xdr->buf;
+ unsigned int nwords = XDR_QUADLEN(len);
+ unsigned int copied;
+
+ if (xdr->nwords == 0)
+ return 0;
+ xdr_realign_pages(xdr);
if (nwords > xdr->nwords) {
nwords = xdr->nwords;
len = nwords << 2;
@@ -1007,57 +1487,72 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
len = buf->page_len;
else if (nwords < xdr->nwords) {
/* Truncate page data and move it into the tail */
- offset = buf->page_len - len;
- copied = xdr_shrink_pagelen(buf, offset);
- trace_rpc_xdr_alignment(xdr, offset, copied);
- xdr->nwords = XDR_QUADLEN(buf->len - cur);
+ copied = xdr_shrink_pagelen(buf, len);
+ trace_rpc_xdr_alignment(xdr, len, copied);
}
return len;
}
/**
- * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position
+ * xdr_read_pages - align page-based XDR data to current pointer position
* @xdr: pointer to xdr_stream struct
* @len: number of bytes of page data
*
* Moves data beyond the current pointer position from the XDR head[] buffer
- * into the page list. Any data that lies beyond current position + "len"
- * bytes is moved into the XDR tail[].
+ * into the page list. Any data that lies beyond current position + @len
+ * bytes is moved into the XDR tail[]. The xdr_stream current position is
+ * then advanced past that data to align to the next XDR object in the tail.
*
* Returns the number of XDR encoded bytes now contained in the pages
*/
unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
{
- struct xdr_buf *buf = xdr->buf;
- struct kvec *iov;
- unsigned int nwords;
- unsigned int end;
- unsigned int padding;
+ unsigned int nwords = XDR_QUADLEN(len);
+ unsigned int base, end, pglen;
- len = xdr_align_pages(xdr, len);
- if (len == 0)
+ pglen = xdr_align_pages(xdr, nwords << 2);
+ if (pglen == 0)
return 0;
- nwords = XDR_QUADLEN(len);
- padding = (nwords << 2) - len;
- xdr->iov = iov = buf->tail;
- /* Compute remaining message length. */
- end = ((xdr->nwords - nwords) << 2) + padding;
- if (end > iov->iov_len)
- end = iov->iov_len;
- /*
- * Position current pointer at beginning of tail, and
- * set remaining message length.
- */
- xdr->p = (__be32 *)((char *)iov->iov_base + padding);
- xdr->end = (__be32 *)((char *)iov->iov_base + end);
- xdr->page_ptr = NULL;
- xdr->nwords = XDR_QUADLEN(end - padding);
- return len;
+ base = (nwords << 2) - pglen;
+ end = xdr_stream_remaining(xdr) - pglen;
+
+ xdr_set_tail_base(xdr, base, end);
+ return len <= pglen ? len : pglen;
}
EXPORT_SYMBOL_GPL(xdr_read_pages);
/**
+ * xdr_set_pagelen - Sets the length of the XDR pages
+ * @xdr: pointer to xdr_stream struct
+ * @len: new length of the XDR page data
+ *
+ * Either grows or shrinks the length of the xdr pages by setting pagelen to
+ * @len bytes. When shrinking, any extra data is moved into buf->tail, whereas
+ * when growing any data beyond the current pointer is moved into the tail.
+ *
+ * Returns True if the operation was successful, and False otherwise.
+ */
+void xdr_set_pagelen(struct xdr_stream *xdr, unsigned int len)
+{
+ struct xdr_buf *buf = xdr->buf;
+ size_t remaining = xdr_stream_remaining(xdr);
+ size_t base = 0;
+
+ if (len < buf->page_len) {
+ base = buf->page_len - len;
+ xdr_shrink_pagelen(buf, len);
+ } else {
+ xdr_buf_head_shift_right(buf, xdr_stream_pos(xdr),
+ buf->page_len, remaining);
+ if (len > buf->page_len)
+ xdr_buf_try_expand(buf, len - buf->page_len);
+ }
+ xdr_set_tail_base(xdr, base, remaining);
+}
+EXPORT_SYMBOL_GPL(xdr_set_pagelen);
+
+/**
* xdr_enter_page - decode data from the XDR page
* @xdr: pointer to xdr_stream struct
* @len: number of bytes of page data
@@ -1081,8 +1576,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page);
static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
-void
-xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
+void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf)
{
buf->head[0] = *iov;
buf->tail[0] = empty_iov;
@@ -1103,11 +1597,10 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov);
*
* @buf and @subbuf may be pointers to the same struct xdr_buf.
*
- * Returns -1 if base of length are out of bounds.
+ * Returns -1 if base or length are out of bounds.
*/
-int
-xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
- unsigned int base, unsigned int len)
+int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf,
+ unsigned int base, unsigned int len)
{
subbuf->buflen = subbuf->len = len;
if (base < buf->head[0].iov_len) {
@@ -1118,6 +1611,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
base = 0;
} else {
base -= buf->head[0].iov_len;
+ subbuf->head[0].iov_base = buf->head[0].iov_base;
subbuf->head[0].iov_len = 0;
}
@@ -1130,6 +1624,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
base = 0;
} else {
base -= buf->page_len;
+ subbuf->pages = buf->pages;
+ subbuf->page_base = 0;
subbuf->page_len = 0;
}
@@ -1141,6 +1637,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
base = 0;
} else {
base -= buf->tail[0].iov_len;
+ subbuf->tail[0].iov_base = buf->tail[0].iov_base;
subbuf->tail[0].iov_len = 0;
}
@@ -1150,7 +1647,150 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
}
EXPORT_SYMBOL_GPL(xdr_buf_subsegment);
-static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+/**
+ * xdr_stream_subsegment - set @subbuf to a portion of @xdr
+ * @xdr: an xdr_stream set up for decoding
+ * @subbuf: the result buffer
+ * @nbytes: length of @xdr to extract, in bytes
+ *
+ * Sets up @subbuf to represent a portion of @xdr. The portion
+ * starts at the current offset in @xdr, and extends for a length
+ * of @nbytes. If this is successful, @xdr is advanced to the next
+ * XDR data item following that portion.
+ *
+ * Return values:
+ * %true: @subbuf has been initialized, and @xdr has been advanced.
+ * %false: a bounds error has occurred
+ */
+bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
+ unsigned int nbytes)
+{
+ unsigned int start = xdr_stream_pos(xdr);
+ unsigned int remaining, len;
+
+ /* Extract @subbuf and bounds-check the fn arguments */
+ if (xdr_buf_subsegment(xdr->buf, subbuf, start, nbytes))
+ return false;
+
+ /* Advance @xdr by @nbytes */
+ for (remaining = nbytes; remaining;) {
+ if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr))
+ return false;
+
+ len = (char *)xdr->end - (char *)xdr->p;
+ if (remaining <= len) {
+ xdr->p = (__be32 *)((char *)xdr->p +
+ (remaining + xdr_pad_size(nbytes)));
+ break;
+ }
+
+ xdr->p = (__be32 *)((char *)xdr->p + len);
+ xdr->end = xdr->p;
+ remaining -= len;
+ }
+
+ xdr_stream_set_pos(xdr, start + nbytes);
+ return true;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_subsegment);
+
+/**
+ * xdr_stream_move_subsegment - Move part of a stream to another position
+ * @xdr: the source xdr_stream
+ * @offset: the source offset of the segment
+ * @target: the target offset of the segment
+ * @length: the number of bytes to move
+ *
+ * Moves @length bytes from @offset to @target in the xdr_stream, overwriting
+ * anything in its space. Returns the number of bytes in the segment.
+ */
+unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset,
+ unsigned int target, unsigned int length)
+{
+ struct xdr_buf buf;
+ unsigned int shift;
+
+ if (offset < target) {
+ shift = target - offset;
+ if (xdr_buf_subsegment(xdr->buf, &buf, offset, shift + length) < 0)
+ return 0;
+ xdr_buf_head_shift_right(&buf, 0, length, shift);
+ } else if (offset > target) {
+ shift = offset - target;
+ if (xdr_buf_subsegment(xdr->buf, &buf, target, shift + length) < 0)
+ return 0;
+ xdr_buf_head_shift_left(&buf, shift, length, shift);
+ }
+ return length;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_move_subsegment);
+
+/**
+ * xdr_stream_zero - zero out a portion of an xdr_stream
+ * @xdr: an xdr_stream to zero out
+ * @offset: the starting point in the stream
+ * @length: the number of bytes to zero
+ */
+unsigned int xdr_stream_zero(struct xdr_stream *xdr, unsigned int offset,
+ unsigned int length)
+{
+ struct xdr_buf buf;
+
+ if (xdr_buf_subsegment(xdr->buf, &buf, offset, length) < 0)
+ return 0;
+ if (buf.head[0].iov_len)
+ xdr_buf_iov_zero(buf.head, 0, buf.head[0].iov_len);
+ if (buf.page_len > 0)
+ xdr_buf_pages_zero(&buf, 0, buf.page_len);
+ if (buf.tail[0].iov_len)
+ xdr_buf_iov_zero(buf.tail, 0, buf.tail[0].iov_len);
+ return length;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_zero);
+
+/**
+ * xdr_buf_trim - lop at most "len" bytes off the end of "buf"
+ * @buf: buf to be trimmed
+ * @len: number of bytes to reduce "buf" by
+ *
+ * Trim an xdr_buf by the given number of bytes by fixing up the lengths. Note
+ * that it's possible that we'll trim less than that amount if the xdr_buf is
+ * too small, or if (for instance) it's all in the head and the parser has
+ * already read too far into it.
+ */
+void xdr_buf_trim(struct xdr_buf *buf, unsigned int len)
+{
+ size_t cur;
+ unsigned int trim = len;
+
+ if (buf->tail[0].iov_len) {
+ cur = min_t(size_t, buf->tail[0].iov_len, trim);
+ buf->tail[0].iov_len -= cur;
+ trim -= cur;
+ if (!trim)
+ goto fix_len;
+ }
+
+ if (buf->page_len) {
+ cur = min_t(unsigned int, buf->page_len, trim);
+ buf->page_len -= cur;
+ trim -= cur;
+ if (!trim)
+ goto fix_len;
+ }
+
+ if (buf->head[0].iov_len) {
+ cur = min_t(size_t, buf->head[0].iov_len, trim);
+ buf->head[0].iov_len -= cur;
+ trim -= cur;
+ }
+fix_len:
+ buf->len -= (len - trim);
+}
+EXPORT_SYMBOL_GPL(xdr_buf_trim);
+
+static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf,
+ void *obj, unsigned int len)
{
unsigned int this_len;
@@ -1159,8 +1799,7 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->page_len);
- if (this_len)
- _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len);
+ _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len);
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len);
@@ -1168,7 +1807,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne
}
/* obj is assumed to point to allocated memory of size at least len: */
-int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base,
+ void *obj, unsigned int len)
{
struct xdr_buf subbuf;
int status;
@@ -1181,7 +1821,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u
}
EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf);
-static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf,
+ void *obj, unsigned int len)
{
unsigned int this_len;
@@ -1190,8 +1831,7 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->page_len);
- if (this_len)
- _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len);
+ _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len);
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len);
@@ -1199,7 +1839,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned
}
/* obj is assumed to point to allocated memory of size at least len: */
-int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base,
+ void *obj, unsigned int len)
{
struct xdr_buf subbuf;
int status;
@@ -1212,8 +1853,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un
}
EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf);
-int
-xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
+int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj)
{
__be32 raw;
int status;
@@ -1226,8 +1866,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
}
EXPORT_SYMBOL_GPL(xdr_decode_word);
-int
-xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
+int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj)
{
__be32 raw = cpu_to_be32(obj);
@@ -1235,65 +1874,9 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
}
EXPORT_SYMBOL_GPL(xdr_encode_word);
-/**
- * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf
- * @buf: pointer to buffer containing a mic
- * @mic: on success, returns the address of the mic
- * @offset: the offset in buf where mic may be found
- *
- * This function may modify the xdr buf if the mic is found to be straddling
- * a boundary between head, pages, and tail. On success the mic can be read
- * from the address returned. There is no need to free the mic.
- *
- * Return: Success returns 0, otherwise an integer error.
- */
-int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset)
-{
- struct xdr_buf subbuf;
- unsigned int boundary;
-
- if (xdr_decode_word(buf, offset, &mic->len))
- return -EFAULT;
- offset += 4;
-
- /* Is the mic partially in the head? */
- boundary = buf->head[0].iov_len;
- if (offset < boundary && (offset + mic->len) > boundary)
- xdr_shift_buf(buf, boundary - offset);
-
- /* Is the mic partially in the pages? */
- boundary += buf->page_len;
- if (offset < boundary && (offset + mic->len) > boundary)
- xdr_shrink_pagelen(buf, boundary - offset);
-
- if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len))
- return -EFAULT;
-
- /* Is the mic contained entirely in the head? */
- mic->data = subbuf.head[0].iov_base;
- if (subbuf.head[0].iov_len == mic->len)
- return 0;
- /* ..or is the mic contained entirely in the tail? */
- mic->data = subbuf.tail[0].iov_base;
- if (subbuf.tail[0].iov_len == mic->len)
- return 0;
-
- /* Find a contiguous area in @buf to hold all of @mic */
- if (mic->len > buf->buflen - buf->len)
- return -ENOMEM;
- if (buf->tail[0].iov_len != 0)
- mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
- else
- mic->data = buf->head[0].iov_base + buf->head[0].iov_len;
- __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xdr_buf_read_mic);
-
/* Returns 0 on success, or else a negative error code. */
-static int
-xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
- struct xdr_array2_desc *desc, int encode)
+static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc, int encode)
{
char *elem = NULL, *c;
unsigned int copied = 0, todo, avail_here;
@@ -1485,9 +2068,8 @@ out:
return err;
}
-int
-xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
- struct xdr_array2_desc *desc)
+int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc)
{
if (base >= buf->len)
return -EINVAL;
@@ -1496,9 +2078,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
}
EXPORT_SYMBOL_GPL(xdr_decode_array2);
-int
-xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
- struct xdr_array2_desc *desc)
+int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc)
{
if ((unsigned long) base + 4 + desc->array_len * desc->elem_size >
buf->head->iov_len + buf->page_len + buf->tail->iov_len)
@@ -1508,9 +2089,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
}
EXPORT_SYMBOL_GPL(xdr_encode_array2);
-int
-xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
- int (*actor)(struct scatterlist *, void *), void *data)
+int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset,
+ unsigned int len,
+ int (*actor)(struct scatterlist *, void *), void *data)
{
int i, ret = 0;
unsigned int page_len, thislen, page_offset;
@@ -1678,10 +2259,8 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
if (ret > 0) {
- char *s = kmalloc(ret + 1, gfp_flags);
+ char *s = kmemdup_nul(p, ret, gfp_flags);
if (s != NULL) {
- memcpy(s, p, ret);
- s[ret] = '\0';
*str = s;
return strlen(s);
}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 1aafe8d3f3f4..656cec208371 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -55,6 +55,8 @@
#include <trace/events/sunrpc.h>
#include "sunrpc.h"
+#include "sysfs.h"
+#include "fail.h"
/*
* Local variables
@@ -67,9 +69,11 @@
/*
* Local functions
*/
-static void xprt_init(struct rpc_xprt *xprt, struct net *net);
+static void xprt_init(struct rpc_xprt *xprt, struct net *net);
static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
-static void xprt_destroy(struct rpc_xprt *xprt);
+static void xprt_destroy(struct rpc_xprt *xprt);
+static void xprt_request_init(struct rpc_task *task);
+static int xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf);
static DEFINE_SPINLOCK(xprt_list_lock);
static LIST_HEAD(xprt_list);
@@ -151,42 +155,101 @@ out:
}
EXPORT_SYMBOL_GPL(xprt_unregister_transport);
-/**
- * xprt_load_transport - load a transport implementation
- * @transport_name: transport to load
- *
- * Returns:
- * 0: transport successfully loaded
- * -ENOENT: transport module not available
- */
-int xprt_load_transport(const char *transport_name)
+static void
+xprt_class_release(const struct xprt_class *t)
{
- struct xprt_class *t;
- int result;
+ module_put(t->owner);
+}
+
+static const struct xprt_class *
+xprt_class_find_by_ident_locked(int ident)
+{
+ const struct xprt_class *t;
+
+ list_for_each_entry(t, &xprt_list, list) {
+ if (t->ident != ident)
+ continue;
+ if (!try_module_get(t->owner))
+ continue;
+ return t;
+ }
+ return NULL;
+}
+
+static const struct xprt_class *
+xprt_class_find_by_ident(int ident)
+{
+ const struct xprt_class *t;
- result = 0;
spin_lock(&xprt_list_lock);
+ t = xprt_class_find_by_ident_locked(ident);
+ spin_unlock(&xprt_list_lock);
+ return t;
+}
+
+static const struct xprt_class *
+xprt_class_find_by_netid_locked(const char *netid)
+{
+ const struct xprt_class *t;
+ unsigned int i;
+
list_for_each_entry(t, &xprt_list, list) {
- if (strcmp(t->name, transport_name) == 0) {
- spin_unlock(&xprt_list_lock);
- goto out;
+ for (i = 0; t->netid[i][0] != '\0'; i++) {
+ if (strcmp(t->netid[i], netid) != 0)
+ continue;
+ if (!try_module_get(t->owner))
+ continue;
+ return t;
}
}
+ return NULL;
+}
+
+static const struct xprt_class *
+xprt_class_find_by_netid(const char *netid)
+{
+ const struct xprt_class *t;
+
+ spin_lock(&xprt_list_lock);
+ t = xprt_class_find_by_netid_locked(netid);
+ if (!t) {
+ spin_unlock(&xprt_list_lock);
+ request_module("rpc%s", netid);
+ spin_lock(&xprt_list_lock);
+ t = xprt_class_find_by_netid_locked(netid);
+ }
spin_unlock(&xprt_list_lock);
- result = request_module("xprt%s", transport_name);
-out:
- return result;
+ return t;
}
-EXPORT_SYMBOL_GPL(xprt_load_transport);
+
+/**
+ * xprt_find_transport_ident - convert a netid into a transport identifier
+ * @netid: transport to load
+ *
+ * Returns:
+ * > 0: transport identifier
+ * -ENOENT: transport module not available
+ */
+int xprt_find_transport_ident(const char *netid)
+{
+ const struct xprt_class *t;
+ int ret;
+
+ t = xprt_class_find_by_netid(netid);
+ if (!t)
+ return -ENOENT;
+ ret = t->ident;
+ xprt_class_release(t);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xprt_find_transport_ident);
static void xprt_clear_locked(struct rpc_xprt *xprt)
{
xprt->snd_task = NULL;
- if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
- smp_mb__before_atomic();
- clear_bit(XPRT_LOCKED, &xprt->state);
- smp_mb__after_atomic();
- } else
+ if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state))
+ clear_bit_unlock(XPRT_LOCKED, &xprt->state);
+ else
queue_work(xprtiod_workqueue, &xprt->task_cleanup);
}
@@ -381,7 +444,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
}
EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
-static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
+void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
{
if (xprt->snd_task != task)
return;
@@ -607,6 +670,11 @@ static void xprt_reset_majortimeo(struct rpc_rqst *req)
req->rq_majortimeo += xprt_calc_majortimeo(req);
}
+static void xprt_reset_minortimeo(struct rpc_rqst *req)
+{
+ req->rq_minortimeo += req->rq_timeout;
+}
+
static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req)
{
unsigned long time_init;
@@ -618,6 +686,7 @@ static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req)
time_init = xprt_abs_ktime_to_jiffies(task->tk_start);
req->rq_timeout = task->tk_client->cl_timeout->to_initval;
req->rq_majortimeo = time_init + xprt_calc_majortimeo(req);
+ req->rq_minortimeo = time_init + req->rq_timeout;
}
/**
@@ -632,6 +701,8 @@ int xprt_adjust_timeout(struct rpc_rqst *req)
int status = 0;
if (time_before(jiffies, req->rq_majortimeo)) {
+ if (time_before(jiffies, req->rq_minortimeo))
+ return status;
if (to->to_exponential)
req->rq_timeout <<= 1;
else
@@ -649,6 +720,7 @@ int xprt_adjust_timeout(struct rpc_rqst *req)
spin_unlock(&xprt->transport_lock);
status = -ETIMEDOUT;
}
+ xprt_reset_minortimeo(req);
if (req->rq_timeout == 0) {
printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n");
@@ -663,6 +735,9 @@ static void xprt_autoclose(struct work_struct *work)
container_of(work, struct rpc_xprt, task_cleanup);
unsigned int pflags = memalloc_nofs_save();
+ trace_xprt_disconnect_auto(xprt);
+ xprt->connect_cookie++;
+ smp_mb__before_atomic();
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
xprt->ops->close(xprt);
xprt_release_write(xprt, NULL);
@@ -677,7 +752,7 @@ static void xprt_autoclose(struct work_struct *work)
*/
void xprt_disconnect_done(struct rpc_xprt *xprt)
{
- dprintk("RPC: disconnected transport %p\n", xprt);
+ trace_xprt_disconnect_done(xprt);
spin_lock(&xprt->transport_lock);
xprt_clear_connected(xprt);
xprt_clear_write_space_locked(xprt);
@@ -688,21 +763,32 @@ void xprt_disconnect_done(struct rpc_xprt *xprt)
EXPORT_SYMBOL_GPL(xprt_disconnect_done);
/**
+ * xprt_schedule_autoclose_locked - Try to schedule an autoclose RPC call
+ * @xprt: transport to disconnect
+ */
+static void xprt_schedule_autoclose_locked(struct rpc_xprt *xprt)
+{
+ if (test_and_set_bit(XPRT_CLOSE_WAIT, &xprt->state))
+ return;
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
+ else if (xprt->snd_task && !test_bit(XPRT_SND_IS_COOKIE, &xprt->state))
+ rpc_wake_up_queued_task_set_status(&xprt->pending,
+ xprt->snd_task, -ENOTCONN);
+}
+
+/**
* xprt_force_disconnect - force a transport to disconnect
* @xprt: transport to disconnect
*
*/
void xprt_force_disconnect(struct rpc_xprt *xprt)
{
+ trace_xprt_disconnect_force(xprt);
+
/* Don't race with the test_bit() in xprt_clear_locked() */
spin_lock(&xprt->transport_lock);
- set_bit(XPRT_CLOSE_WAIT, &xprt->state);
- /* Try to schedule an autoclose RPC call */
- if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
- queue_work(xprtiod_workqueue, &xprt->task_cleanup);
- else if (xprt->snd_task)
- rpc_wake_up_queued_task_set_status(&xprt->pending,
- xprt->snd_task, -ENOTCONN);
+ xprt_schedule_autoclose_locked(xprt);
spin_unlock(&xprt->transport_lock);
}
EXPORT_SYMBOL_GPL(xprt_force_disconnect);
@@ -742,11 +828,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
goto out;
if (test_bit(XPRT_CLOSING, &xprt->state))
goto out;
- set_bit(XPRT_CLOSE_WAIT, &xprt->state);
- /* Try to schedule an autoclose RPC call */
- if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
- queue_work(xprtiod_workqueue, &xprt->task_cleanup);
- xprt_wake_pending_tasks(xprt, -EAGAIN);
+ xprt_schedule_autoclose_locked(xprt);
out:
spin_unlock(&xprt->transport_lock);
}
@@ -780,6 +862,19 @@ xprt_init_autodisconnect(struct timer_list *t)
queue_work(xprtiod_workqueue, &xprt->task_cleanup);
}
+#if IS_ENABLED(CONFIG_FAIL_SUNRPC)
+static void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+ if (!fail_sunrpc.ignore_client_disconnect &&
+ should_fail(&fail_sunrpc.attr, 1))
+ xprt->ops->inject_disconnect(xprt);
+}
+#else
+static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+}
+#endif
+
bool xprt_lock_connect(struct rpc_xprt *xprt,
struct rpc_task *task,
void *cookie)
@@ -791,12 +886,14 @@ bool xprt_lock_connect(struct rpc_xprt *xprt,
goto out;
if (xprt->snd_task != task)
goto out;
+ set_bit(XPRT_SND_IS_COOKIE, &xprt->state);
xprt->snd_task = cookie;
ret = true;
out:
spin_unlock(&xprt->transport_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(xprt_lock_connect);
void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)
{
@@ -806,12 +903,14 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)
if (!test_bit(XPRT_LOCKED, &xprt->state))
goto out;
xprt->snd_task =NULL;
+ clear_bit(XPRT_SND_IS_COOKIE, &xprt->state);
xprt->ops->release_xprt(xprt, NULL);
xprt_schedule_autodisconnect(xprt);
out:
spin_unlock(&xprt->transport_lock);
wake_up_bit(&xprt->state, XPRT_LOCKED);
}
+EXPORT_SYMBOL_GPL(xprt_unlock_connect);
/**
* xprt_connect - schedule a transport connect operation
@@ -822,8 +921,7 @@ void xprt_connect(struct rpc_task *task)
{
struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
- dprintk("RPC: %5u xprt_connect xprt %p %s connected\n", task->tk_pid,
- xprt, (xprt_connected(xprt) ? "is" : "is not"));
+ trace_xprt_connect(xprt);
if (!xprt_bound(xprt)) {
task->tk_status = -EAGAIN;
@@ -832,10 +930,7 @@ void xprt_connect(struct rpc_task *task)
if (!xprt_lock_write(xprt, task))
return;
- if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state))
- xprt->ops->close(xprt);
-
- if (!xprt_connected(xprt)) {
+ if (!xprt_connected(xprt) && !test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie;
rpc_sleep_on_timeout(&xprt->pending, task, NULL,
xprt_request_timeout(task->tk_rqstp));
@@ -1044,16 +1139,19 @@ xprt_request_need_enqueue_receive(struct rpc_task *task, struct rpc_rqst *req)
* @task: RPC task
*
*/
-void
+int
xprt_request_enqueue_receive(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
+ int ret;
if (!xprt_request_need_enqueue_receive(task, req))
- return;
+ return 0;
- xprt_request_prepare(task->tk_rqstp);
+ ret = xprt_request_prepare(task->tk_rqstp, &req->rq_rcv_buf);
+ if (ret)
+ return ret;
spin_lock(&xprt->queue_lock);
/* Update the softirq receive buffer */
@@ -1067,6 +1165,7 @@ xprt_request_enqueue_receive(struct rpc_task *task)
/* Turn off autodisconnect */
del_singleshot_timer_sync(&xprt->timer);
+ return 0;
}
/**
@@ -1117,12 +1216,10 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- dprintk("RPC: %5u xid %08x complete (%d bytes received)\n",
- task->tk_pid, ntohl(req->rq_xid), copied);
- trace_xprt_complete_rqst(xprt, req->rq_xid, copied);
-
xprt->stat.recvs++;
+ xdr_free_bvec(&req->rq_rcv_buf);
+ req->rq_private_buf.bvec = NULL;
req->rq_private_buf.len = copied;
/* Ensure all writes are done before we update */
/* req->rq_reply_bytes_recvd */
@@ -1241,8 +1338,14 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
{
struct rpc_rqst *pos, *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
+ int ret;
if (xprt_request_need_enqueue_transmit(task, req)) {
+ ret = xprt_request_prepare(task->tk_rqstp, &req->rq_snd_buf);
+ if (ret) {
+ task->tk_status = ret;
+ return;
+ }
req->rq_bytes_sent = 0;
spin_lock(&xprt->queue_lock);
/*
@@ -1257,19 +1360,6 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
/* Note: req is added _before_ pos */
list_add_tail(&req->rq_xmit, &pos->rq_xmit);
INIT_LIST_HEAD(&req->rq_xmit2);
- trace_xprt_enq_xmit(task, 1);
- goto out;
- }
- } else if (RPC_IS_SWAPPER(task)) {
- list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
- if (pos->rq_cong || pos->rq_bytes_sent)
- continue;
- if (RPC_IS_SWAPPER(pos->rq_task))
- continue;
- /* Note: req is added _before_ pos */
- list_add_tail(&req->rq_xmit, &pos->rq_xmit);
- INIT_LIST_HEAD(&req->rq_xmit2);
- trace_xprt_enq_xmit(task, 2);
goto out;
}
} else if (!req->rq_seqno) {
@@ -1278,14 +1368,13 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
continue;
list_add_tail(&req->rq_xmit2, &pos->rq_xmit2);
INIT_LIST_HEAD(&req->rq_xmit);
- trace_xprt_enq_xmit(task, 3);
goto out;
}
}
list_add_tail(&req->rq_xmit, &xprt->xmit_queue);
INIT_LIST_HEAD(&req->rq_xmit2);
- trace_xprt_enq_xmit(task, 4);
out:
+ atomic_long_inc(&xprt->xmit_queuelen);
set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
spin_unlock(&xprt->queue_lock);
}
@@ -1315,6 +1404,8 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task)
}
} else
list_del(&req->rq_xmit2);
+ atomic_long_dec(&req->rq_xprt->xmit_queuelen);
+ xdr_free_bvec(&req->rq_snd_buf);
}
/**
@@ -1351,8 +1442,6 @@ xprt_request_dequeue_xprt(struct rpc_task *task)
test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
xprt_is_pinned_rqst(req)) {
spin_lock(&xprt->queue_lock);
- xprt_request_dequeue_transmit_locked(task);
- xprt_request_dequeue_receive_locked(task);
while (xprt_is_pinned_rqst(req)) {
set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
spin_unlock(&xprt->queue_lock);
@@ -1360,24 +1449,30 @@ xprt_request_dequeue_xprt(struct rpc_task *task)
spin_lock(&xprt->queue_lock);
clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
}
+ xprt_request_dequeue_transmit_locked(task);
+ xprt_request_dequeue_receive_locked(task);
spin_unlock(&xprt->queue_lock);
+ xdr_free_bvec(&req->rq_rcv_buf);
}
}
/**
* xprt_request_prepare - prepare an encoded request for transport
* @req: pointer to rpc_rqst
+ * @buf: pointer to send/rcv xdr_buf
*
* Calls into the transport layer to do whatever is needed to prepare
* the request for transmission or receive.
+ * Returns error, or zero.
*/
-void
-xprt_request_prepare(struct rpc_rqst *req)
+static int
+xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf)
{
struct rpc_xprt *xprt = req->rq_xprt;
if (xprt->ops->prepare_request)
- xprt->ops->prepare_request(req);
+ return xprt->ops->prepare_request(req, buf);
+ return 0;
}
/**
@@ -1402,8 +1497,6 @@ bool xprt_prepare_transmit(struct rpc_task *task)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
-
if (!xprt_lock_write(xprt, task)) {
/* Race breaker: someone may have transmitted us */
if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
@@ -1412,12 +1505,18 @@ bool xprt_prepare_transmit(struct rpc_task *task)
return false;
}
+ if (atomic_read(&xprt->swapper))
+ /* This will be clear in __rpc_execute */
+ current->flags |= PF_MEMALLOC;
return true;
}
void xprt_end_transmit(struct rpc_task *task)
{
- xprt_release_write(task->tk_rqstp->rq_xprt, task);
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+
+ xprt_inject_disconnect(xprt);
+ xprt_release_write(xprt, task);
}
/**
@@ -1462,6 +1561,7 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
*/
req->rq_ntrans++;
+ trace_rpc_xdr_sendto(task, &req->rq_snd_buf);
connect_cookie = xprt->connect_cookie;
status = xprt->ops->send_request(req);
if (status != 0) {
@@ -1470,8 +1570,10 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
return status;
}
- if (is_retrans)
+ if (is_retrans) {
task->tk_client->cl_stats->rpcretrans++;
+ trace_xprt_retransmit(req);
+ }
xprt_inject_disconnect(xprt);
@@ -1510,39 +1612,66 @@ xprt_transmit(struct rpc_task *task)
int status;
spin_lock(&xprt->queue_lock);
- while (!list_empty(&xprt->xmit_queue)) {
- next = list_first_entry(&xprt->xmit_queue,
- struct rpc_rqst, rq_xmit);
+ for (;;) {
+ next = list_first_entry_or_null(&xprt->xmit_queue,
+ struct rpc_rqst, rq_xmit);
+ if (!next)
+ break;
xprt_pin_rqst(next);
spin_unlock(&xprt->queue_lock);
status = xprt_request_transmit(next, task);
if (status == -EBADMSG && next != req)
status = 0;
- cond_resched();
spin_lock(&xprt->queue_lock);
xprt_unpin_rqst(next);
- if (status == 0) {
- if (!xprt_request_data_received(task) ||
- test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
- continue;
- } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
- task->tk_status = status;
- break;
+ if (status < 0) {
+ if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+ task->tk_status = status;
+ break;
+ }
+ /* Was @task transmitted, and has it received a reply? */
+ if (xprt_request_data_received(task) &&
+ !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+ break;
+ cond_resched_lock(&xprt->queue_lock);
}
spin_unlock(&xprt->queue_lock);
}
-static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
+static void xprt_complete_request_init(struct rpc_task *task)
+{
+ if (task->tk_rqstp)
+ xprt_request_init(task);
+}
+
+void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
{
set_bit(XPRT_CONGESTED, &xprt->state);
- rpc_sleep_on(&xprt->backlog, task, NULL);
+ rpc_sleep_on(&xprt->backlog, task, xprt_complete_request_init);
+}
+EXPORT_SYMBOL_GPL(xprt_add_backlog);
+
+static bool __xprt_set_rq(struct rpc_task *task, void *data)
+{
+ struct rpc_rqst *req = data;
+
+ if (task->tk_rqstp == NULL) {
+ memset(req, 0, sizeof(*req)); /* mark unused */
+ task->tk_rqstp = req;
+ return true;
+ }
+ return false;
}
-static void xprt_wake_up_backlog(struct rpc_xprt *xprt)
+bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
- if (rpc_wake_up_next(&xprt->backlog) == NULL)
+ if (rpc_wake_up_first(&xprt->backlog, __xprt_set_rq, req) == NULL) {
clear_bit(XPRT_CONGESTED, &xprt->state);
+ return false;
+ }
+ return true;
}
+EXPORT_SYMBOL_GPL(xprt_wake_up_backlog);
static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task)
{
@@ -1552,7 +1681,7 @@ static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task
goto out;
spin_lock(&xprt->reserve_lock);
if (test_bit(XPRT_CONGESTED, &xprt->state)) {
- rpc_sleep_on(&xprt->backlog, task, NULL);
+ xprt_add_backlog(xprt, task);
ret = true;
}
spin_unlock(&xprt->reserve_lock);
@@ -1568,7 +1697,7 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt)
goto out;
++xprt->num_reqs;
spin_unlock(&xprt->reserve_lock);
- req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS);
+ req = kzalloc(sizeof(*req), rpc_task_gfp_mask());
spin_lock(&xprt->reserve_lock);
if (req != NULL)
goto out;
@@ -1610,7 +1739,7 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
case -EAGAIN:
xprt_add_backlog(xprt, task);
dprintk("RPC: waiting for request slot\n");
- /* fall through */
+ fallthrough;
default:
task->tk_status = -EAGAIN;
}
@@ -1629,11 +1758,11 @@ EXPORT_SYMBOL_GPL(xprt_alloc_slot);
void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
spin_lock(&xprt->reserve_lock);
- if (!xprt_dynamic_free_slot(xprt, req)) {
+ if (!xprt_wake_up_backlog(xprt, req) &&
+ !xprt_dynamic_free_slot(xprt, req)) {
memset(req, 0, sizeof(*req)); /* mark unused */
list_add(&req->rq_list, &xprt->free);
}
- xprt_wake_up_backlog(xprt);
spin_unlock(&xprt->reserve_lock);
}
EXPORT_SYMBOL_GPL(xprt_free_slot);
@@ -1648,6 +1777,30 @@ static void xprt_free_all_slots(struct rpc_xprt *xprt)
}
}
+static DEFINE_IDA(rpc_xprt_ids);
+
+void xprt_cleanup_ids(void)
+{
+ ida_destroy(&rpc_xprt_ids);
+}
+
+static int xprt_alloc_id(struct rpc_xprt *xprt)
+{
+ int id;
+
+ id = ida_alloc(&rpc_xprt_ids, GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ xprt->id = id;
+ return 0;
+}
+
+static void xprt_free_id(struct rpc_xprt *xprt)
+{
+ ida_free(&rpc_xprt_ids, xprt->id);
+}
+
struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
unsigned int num_prealloc,
unsigned int max_alloc)
@@ -1660,6 +1813,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
if (xprt == NULL)
goto out;
+ xprt_alloc_id(xprt);
xprt_init(xprt, net);
for (i = 0; i < num_prealloc; i++) {
@@ -1668,10 +1822,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
goto out_free;
list_add(&req->rq_list, &xprt->free);
}
- if (max_alloc > num_prealloc)
- xprt->max_reqs = max_alloc;
- else
- xprt->max_reqs = num_prealloc;
+ xprt->max_reqs = max_t(unsigned int, max_alloc, num_prealloc);
xprt->min_reqs = num_prealloc;
xprt->num_reqs = num_prealloc;
@@ -1686,8 +1837,10 @@ EXPORT_SYMBOL_GPL(xprt_alloc);
void xprt_free(struct rpc_xprt *xprt)
{
- put_net(xprt->xprt_net);
+ put_net_track(xprt->xprt_net, &xprt->ns_tracker);
xprt_free_all_slots(xprt);
+ xprt_free_id(xprt);
+ rpc_sysfs_xprt_destroy(xprt);
kfree_rcu(xprt, rcu);
}
EXPORT_SYMBOL_GPL(xprt_free);
@@ -1712,7 +1865,7 @@ xprt_alloc_xid(struct rpc_xprt *xprt)
static void
xprt_init_xid(struct rpc_xprt *xprt)
{
- xprt->xid = prandom_u32();
+ xprt->xid = get_random_u32();
}
static void
@@ -1734,8 +1887,8 @@ xprt_request_init(struct rpc_task *task)
req->rq_rcv_buf.bvec = NULL;
req->rq_release_snd_buf = NULL;
xprt_init_majortimeo(task, req);
- dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
- req, ntohl(req->rq_xid));
+
+ trace_xprt_reserve(req);
}
static void
@@ -1816,16 +1969,12 @@ void xprt_release(struct rpc_task *task)
spin_unlock(&xprt->transport_lock);
if (req->rq_buffer)
xprt->ops->buf_free(task);
- xprt_inject_disconnect(xprt);
- xdr_free_bvec(&req->rq_rcv_buf);
- xdr_free_bvec(&req->rq_snd_buf);
if (req->rq_cred != NULL)
put_rpccred(req->rq_cred);
- task->tk_rqstp = NULL;
if (req->rq_release_snd_buf)
req->rq_release_snd_buf(req);
- dprintk("RPC: %5u release request %p\n", task->tk_pid, req);
+ task->tk_rqstp = NULL;
if (likely(!bc_prealloc(req)))
xprt->ops->free_slot(xprt, req);
else
@@ -1878,7 +2027,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
xprt_init_xid(xprt);
- xprt->xprt_net = get_net(net);
+ xprt->xprt_net = get_net_track(net, &xprt->ns_tracker, GFP_KERNEL);
}
/**
@@ -1889,26 +2038,19 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
{
struct rpc_xprt *xprt;
- struct xprt_class *t;
+ const struct xprt_class *t;
- spin_lock(&xprt_list_lock);
- list_for_each_entry(t, &xprt_list, list) {
- if (t->ident == args->ident) {
- spin_unlock(&xprt_list_lock);
- goto found;
- }
+ t = xprt_class_find_by_ident(args->ident);
+ if (!t) {
+ dprintk("RPC: transport (%d) not supported\n", args->ident);
+ return ERR_PTR(-EIO);
}
- spin_unlock(&xprt_list_lock);
- dprintk("RPC: transport (%d) not supported\n", args->ident);
- return ERR_PTR(-EIO);
-found:
xprt = t->setup(args);
- if (IS_ERR(xprt)) {
- dprintk("RPC: xprt_create_transport: failed, %ld\n",
- -PTR_ERR(xprt));
+ xprt_class_release(t);
+
+ if (IS_ERR(xprt))
goto out;
- }
if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT)
xprt->idle_timeout = 0;
INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
@@ -1929,8 +2071,7 @@ found:
rpc_xprt_debugfs_register(xprt);
- dprintk("RPC: created transport %p with %u slots\n", xprt,
- xprt->max_reqs);
+ trace_xprt_create(xprt);
out:
return xprt;
}
@@ -1940,6 +2081,8 @@ static void xprt_destroy_cb(struct work_struct *work)
struct rpc_xprt *xprt =
container_of(work, struct rpc_xprt, task_cleanup);
+ trace_xprt_destroy(xprt);
+
rpc_xprt_debugfs_unregister(xprt);
rpc_destroy_wait_queue(&xprt->binding);
rpc_destroy_wait_queue(&xprt->pending);
@@ -1964,14 +2107,19 @@ static void xprt_destroy_cb(struct work_struct *work)
*/
static void xprt_destroy(struct rpc_xprt *xprt)
{
- dprintk("RPC: destroying transport %p\n", xprt);
-
/*
* Exclude transport connect/disconnect handlers and autoclose
*/
wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_UNINTERRUPTIBLE);
+ /*
+ * xprt_schedule_autodisconnect() can run after XPRT_LOCKED
+ * is cleared. We use ->transport_lock to ensure the mod_timer()
+ * can only run *before* del_time_sync(), never after.
+ */
+ spin_lock(&xprt->transport_lock);
del_timer_sync(&xprt->timer);
+ spin_unlock(&xprt->transport_lock);
/*
* Destroy sockets etc from the system workqueue so they can
@@ -2010,3 +2158,35 @@ void xprt_put(struct rpc_xprt *xprt)
kref_put(&xprt->kref, xprt_destroy_kref);
}
EXPORT_SYMBOL_GPL(xprt_put);
+
+void xprt_set_offline_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps)
+{
+ if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) {
+ spin_lock(&xps->xps_lock);
+ xps->xps_nactive--;
+ spin_unlock(&xps->xps_lock);
+ }
+}
+
+void xprt_set_online_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps)
+{
+ if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) {
+ spin_lock(&xps->xps_lock);
+ xps->xps_nactive++;
+ spin_unlock(&xps->xps_lock);
+ }
+}
+
+void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps)
+{
+ if (test_and_set_bit(XPRT_REMOVE, &xprt->state))
+ return;
+
+ xprt_force_disconnect(xprt);
+ if (!test_bit(XPRT_CONNECTED, &xprt->state))
+ return;
+
+ if (!xprt->sending.qlen && !xprt->pending.qlen &&
+ !xprt->backlog.qlen && !atomic_long_read(&xprt->queuelen))
+ rpc_xprt_switch_remove_xprt(xps, xprt, true);
+}
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 78c075a68c04..701250b305db 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -7,24 +7,27 @@
* Trond Myklebust <trond.myklebust@primarydata.com>
*
*/
+#include <linux/atomic.h>
#include <linux/types.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/slab.h>
-#include <asm/cmpxchg.h>
#include <linux/spinlock.h>
#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/xprtmultipath.h>
+#include "sysfs.h"
+
typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct rpc_xprt_switch *xps,
const struct rpc_xprt *cur);
static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular;
static const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin;
static const struct rpc_xprt_iter_ops rpc_xprt_iter_listall;
+static const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline;
static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps,
struct rpc_xprt *xprt)
@@ -55,14 +58,16 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
xprt_switch_add_xprt_locked(xps, xprt);
spin_unlock(&xps->xps_lock);
+ rpc_sysfs_xprt_setup(xps, xprt, GFP_KERNEL);
}
static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
- struct rpc_xprt *xprt)
+ struct rpc_xprt *xprt, bool offline)
{
if (unlikely(xprt == NULL))
return;
- xps->xps_nactive--;
+ if (!test_bit(XPRT_OFFLINE, &xprt->state) && offline)
+ xps->xps_nactive--;
xps->xps_nxprts--;
if (xps->xps_nxprts == 0)
xps->xps_net = NULL;
@@ -74,18 +79,43 @@ static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
* rpc_xprt_switch_remove_xprt - Removes an rpc_xprt from a rpc_xprt_switch
* @xps: pointer to struct rpc_xprt_switch
* @xprt: pointer to struct rpc_xprt
+ * @offline: indicates if the xprt that's being removed is in an offline state
*
* Removes xprt from the list of struct rpc_xprt in xps.
*/
void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps,
- struct rpc_xprt *xprt)
+ struct rpc_xprt *xprt, bool offline)
{
spin_lock(&xps->xps_lock);
- xprt_switch_remove_xprt_locked(xps, xprt);
+ xprt_switch_remove_xprt_locked(xps, xprt, offline);
spin_unlock(&xps->xps_lock);
xprt_put(xprt);
}
+static DEFINE_IDA(rpc_xprtswitch_ids);
+
+void xprt_multipath_cleanup_ids(void)
+{
+ ida_destroy(&rpc_xprtswitch_ids);
+}
+
+static int xprt_switch_alloc_id(struct rpc_xprt_switch *xps, gfp_t gfp_flags)
+{
+ int id;
+
+ id = ida_alloc(&rpc_xprtswitch_ids, gfp_flags);
+ if (id < 0)
+ return id;
+
+ xps->xps_id = id;
+ return 0;
+}
+
+static void xprt_switch_free_id(struct rpc_xprt_switch *xps)
+{
+ ida_free(&rpc_xprtswitch_ids, xps->xps_id);
+}
+
/**
* xprt_switch_alloc - Allocate a new struct rpc_xprt_switch
* @xprt: pointer to struct rpc_xprt
@@ -103,12 +133,16 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt,
if (xps != NULL) {
spin_lock_init(&xps->xps_lock);
kref_init(&xps->xps_kref);
+ xprt_switch_alloc_id(xps, gfp_flags);
xps->xps_nxprts = xps->xps_nactive = 0;
atomic_long_set(&xps->xps_queuelen, 0);
xps->xps_net = NULL;
INIT_LIST_HEAD(&xps->xps_xprt_list);
xps->xps_iter_ops = &rpc_xprt_iter_singular;
+ rpc_sysfs_xprt_switch_setup(xps, xprt, gfp_flags);
xprt_switch_add_xprt_locked(xps, xprt);
+ xps->xps_nunique_destaddr_xprts = 1;
+ rpc_sysfs_xprt_setup(xps, xprt, gfp_flags);
}
return xps;
@@ -122,7 +156,7 @@ static void xprt_switch_free_entries(struct rpc_xprt_switch *xps)
xprt = list_first_entry(&xps->xps_xprt_list,
struct rpc_xprt, xprt_switch);
- xprt_switch_remove_xprt_locked(xps, xprt);
+ xprt_switch_remove_xprt_locked(xps, xprt, true);
spin_unlock(&xps->xps_lock);
xprt_put(xprt);
spin_lock(&xps->xps_lock);
@@ -136,6 +170,8 @@ static void xprt_switch_free(struct kref *kref)
struct rpc_xprt_switch, xps_kref);
xprt_switch_free_entries(xps);
+ rpc_sysfs_xprt_switch_destroy(xps);
+ xprt_switch_free_id(xps);
kfree_rcu(xps, xps_rcu);
}
@@ -198,7 +234,8 @@ void xprt_iter_default_rewind(struct rpc_xprt_iter *xpi)
static
bool xprt_is_active(const struct rpc_xprt *xprt)
{
- return kref_read(&xprt->kref) != 0;
+ return (kref_read(&xprt->kref) != 0 &&
+ !test_bit(XPRT_OFFLINE, &xprt->state));
}
static
@@ -214,6 +251,18 @@ struct rpc_xprt *xprt_switch_find_first_entry(struct list_head *head)
}
static
+struct rpc_xprt *xprt_switch_find_first_entry_offline(struct list_head *head)
+{
+ struct rpc_xprt *pos;
+
+ list_for_each_entry_rcu(pos, head, xprt_switch) {
+ if (!xprt_is_active(pos))
+ return pos;
+ }
+ return NULL;
+}
+
+static
struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi)
{
struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
@@ -224,8 +273,9 @@ struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi)
}
static
-struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head,
- const struct rpc_xprt *cur)
+struct rpc_xprt *_xprt_switch_find_current_entry(struct list_head *head,
+ const struct rpc_xprt *cur,
+ bool find_active)
{
struct rpc_xprt *pos;
bool found = false;
@@ -233,14 +283,25 @@ struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head,
list_for_each_entry_rcu(pos, head, xprt_switch) {
if (cur == pos)
found = true;
- if (found && xprt_is_active(pos))
+ if (found && ((find_active && xprt_is_active(pos)) ||
+ (!find_active && xprt_is_active(pos))))
return pos;
}
return NULL;
}
static
-struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
+struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head,
+ const struct rpc_xprt *cur)
+{
+ return _xprt_switch_find_current_entry(head, cur, true);
+}
+
+static
+struct rpc_xprt * _xprt_iter_current_entry(struct rpc_xprt_iter *xpi,
+ struct rpc_xprt *first_entry(struct list_head *head),
+ struct rpc_xprt *current_entry(struct list_head *head,
+ const struct rpc_xprt *cur))
{
struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
struct list_head *head;
@@ -249,8 +310,30 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
return NULL;
head = &xps->xps_xprt_list;
if (xpi->xpi_cursor == NULL || xps->xps_nxprts < 2)
- return xprt_switch_find_first_entry(head);
- return xprt_switch_find_current_entry(head, xpi->xpi_cursor);
+ return first_entry(head);
+ return current_entry(head, xpi->xpi_cursor);
+}
+
+static
+struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
+{
+ return _xprt_iter_current_entry(xpi, xprt_switch_find_first_entry,
+ xprt_switch_find_current_entry);
+}
+
+static
+struct rpc_xprt *xprt_switch_find_current_entry_offline(struct list_head *head,
+ const struct rpc_xprt *cur)
+{
+ return _xprt_switch_find_current_entry(head, cur, false);
+}
+
+static
+struct rpc_xprt *xprt_iter_current_entry_offline(struct rpc_xprt_iter *xpi)
+{
+ return _xprt_iter_current_entry(xpi,
+ xprt_switch_find_first_entry_offline,
+ xprt_switch_find_current_entry_offline);
}
bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
@@ -275,7 +358,7 @@ bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
static
struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
- const struct rpc_xprt *cur)
+ const struct rpc_xprt *cur, bool check_active)
{
struct rpc_xprt *pos, *prev = NULL;
bool found = false;
@@ -283,7 +366,12 @@ struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
list_for_each_entry_rcu(pos, head, xprt_switch) {
if (cur == prev)
found = true;
- if (found && xprt_is_active(pos))
+ /* for request to return active transports return only
+ * active, for request to return offline transports
+ * return only offline
+ */
+ if (found && ((check_active && xprt_is_active(pos)) ||
+ (!check_active && !xprt_is_active(pos))))
return pos;
prev = pos;
}
@@ -320,7 +408,7 @@ struct rpc_xprt *__xprt_switch_find_next_entry_roundrobin(struct list_head *head
{
struct rpc_xprt *ret;
- ret = xprt_switch_find_next_entry(head, cur);
+ ret = xprt_switch_find_next_entry(head, cur, true);
if (ret != NULL)
return ret;
return xprt_switch_find_first_entry(head);
@@ -362,7 +450,14 @@ static
struct rpc_xprt *xprt_switch_find_next_entry_all(struct rpc_xprt_switch *xps,
const struct rpc_xprt *cur)
{
- return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur);
+ return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, true);
+}
+
+static
+struct rpc_xprt *xprt_switch_find_next_entry_offline(struct rpc_xprt_switch *xps,
+ const struct rpc_xprt *cur)
+{
+ return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, false);
}
static
@@ -372,6 +467,13 @@ struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi)
xprt_switch_find_next_entry_all);
}
+static
+struct rpc_xprt *xprt_iter_next_entry_offline(struct rpc_xprt_iter *xpi)
+{
+ return xprt_iter_next_entry_multiple(xpi,
+ xprt_switch_find_next_entry_offline);
+}
+
/*
* xprt_iter_rewind - Resets the xprt iterator
* @xpi: pointer to rpc_xprt_iter
@@ -379,7 +481,6 @@ struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi)
* Resets xpi to ensure that it points to the first entry in the list
* of transports.
*/
-static
void xprt_iter_rewind(struct rpc_xprt_iter *xpi)
{
rcu_read_lock();
@@ -425,6 +526,12 @@ void xprt_iter_init_listall(struct rpc_xprt_iter *xpi,
__xprt_iter_init(xpi, xps, &rpc_xprt_iter_listall);
}
+void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi,
+ struct rpc_xprt_switch *xps)
+{
+ __xprt_iter_init(xpi, xps, &rpc_xprt_iter_listoffline);
+}
+
/**
* xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch
* @xpi: pointer to rpc_xprt_iter
@@ -539,3 +646,10 @@ const struct rpc_xprt_iter_ops rpc_xprt_iter_listall = {
.xpi_xprt = xprt_iter_current_entry,
.xpi_next = xprt_iter_next_entry_all,
};
+
+static
+const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline = {
+ .xpi_rewind = xprt_iter_default_rewind,
+ .xpi_xprt = xprt_iter_current_entry_offline,
+ .xpi_next = xprt_iter_next_entry_offline,
+};
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 8ed0377d7a18..55b21bae866d 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -4,5 +4,5 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \
- module.o
+ svc_rdma_pcl.o module.o
rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 1a0ae0c61353..e4d84a13c566 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (c) 2015 Oracle. All rights reserved.
+ * Copyright (c) 2015-2020, Oracle and/or its affiliates.
*
- * Support for backward direction RPCs on RPC/RDMA.
+ * Support for reverse-direction RPCs on RPC/RDMA.
*/
#include <linux/sunrpc/xprt.h>
@@ -13,10 +13,6 @@
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_TRANS
-#endif
-
#undef RPCRDMA_BACKCHANNEL_DEBUG
/**
@@ -44,10 +40,10 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
size_t maxmsg;
- maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv);
+ maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv);
maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
return maxmsg - RPCRDMA_HDRLEN_MIN;
}
@@ -82,7 +78,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
&rqst->rq_snd_buf, rpcrdma_noch_pullup))
return -EIO;
- trace_xprtrdma_cb_reply(rqst);
+ trace_xprtrdma_cb_reply(r_xprt, rqst);
return 0;
}
@@ -115,7 +111,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
if (rc < 0)
goto failed_marshal;
- if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ if (frwr_send(r_xprt, req))
goto drop_connection;
return 0;
@@ -155,9 +151,11 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs)
void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
{
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct rpcrdma_rep *rep = req->rl_reply;
struct rpc_xprt *xprt = rqst->rq_xprt;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- rpcrdma_recv_buffer_put(req->rl_reply);
+ rpcrdma_rep_put(&r_xprt->rx_buf, rep);
req->rl_reply = NULL;
spin_lock(&xprt->bc_pa_lock);
@@ -190,8 +188,8 @@ create_req:
if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS)
return NULL;
- size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE);
- req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL);
+ size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE);
+ req = rpcrdma_req_create(r_xprt, size);
if (!req)
return NULL;
if (rpcrdma_req_setup(r_xprt, req)) {
@@ -208,7 +206,7 @@ create_req:
}
/**
- * rpcrdma_bc_receive_call - Handle a backward direction call
+ * rpcrdma_bc_receive_call - Handle a reverse-direction Call
* @r_xprt: transport receiving the call
* @rep: receive buffer containing the call
*
@@ -260,7 +258,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
*/
req = rpcr_to_rdmar(rqst);
req->rl_reply = rep;
- trace_xprtrdma_cb_call(rqst);
+ trace_xprtrdma_cb_call(r_xprt, rqst);
/* Queue rqst for ULP's callback service */
bc_serv = xprt->bc_serv;
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 125297c9aa3e..ffbf99894970 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -40,51 +40,56 @@
* New MRs are created on demand.
*/
-#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_TRANS
-#endif
+static void frwr_cid_init(struct rpcrdma_ep *ep,
+ struct rpcrdma_mr *mr)
+{
+ struct rpc_rdma_cid *cid = &mr->mr_cid;
+
+ cid->ci_queue_id = ep->re_attr.send_cq->res.id;
+ cid->ci_completion_id = mr->mr_ibmr->res.id;
+}
+
+static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+{
+ if (mr->mr_device) {
+ trace_xprtrdma_mr_unmap(mr);
+ ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents,
+ mr->mr_dir);
+ mr->mr_device = NULL;
+ }
+}
/**
- * frwr_release_mr - Destroy one MR
- * @mr: MR allocated by frwr_init_mr
+ * frwr_mr_release - Destroy one MR
+ * @mr: MR allocated by frwr_mr_init
*
*/
-void frwr_release_mr(struct rpcrdma_mr *mr)
+void frwr_mr_release(struct rpcrdma_mr *mr)
{
int rc;
- rc = ib_dereg_mr(mr->frwr.fr_mr);
+ frwr_mr_unmap(mr->mr_xprt, mr);
+
+ rc = ib_dereg_mr(mr->mr_ibmr);
if (rc)
trace_xprtrdma_frwr_dereg(mr, rc);
kfree(mr->mr_sg);
kfree(mr);
}
-static void frwr_mr_recycle(struct rpcrdma_mr *mr)
+static void frwr_mr_put(struct rpcrdma_mr *mr)
{
- struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-
- trace_xprtrdma_mr_recycle(mr);
-
- if (mr->mr_dir != DMA_NONE) {
- trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
- mr->mr_sg, mr->mr_nents, mr->mr_dir);
- mr->mr_dir = DMA_NONE;
- }
-
- spin_lock(&r_xprt->rx_buf.rb_lock);
- list_del(&mr->mr_all);
- r_xprt->rx_stats.mrs_recycled++;
- spin_unlock(&r_xprt->rx_buf.rb_lock);
+ frwr_mr_unmap(mr->mr_xprt, mr);
- frwr_release_mr(mr);
+ /* The MR is returned to the req's MR free list instead
+ * of to the xprt's MR free list. No spinlock is needed.
+ */
+ rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
}
/* frwr_reset - Place MRs back on the free list
@@ -102,76 +107,70 @@ void frwr_reset(struct rpcrdma_req *req)
struct rpcrdma_mr *mr;
while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
- rpcrdma_mr_put(mr);
+ frwr_mr_put(mr);
}
/**
- * frwr_init_mr - Initialize one MR
- * @ia: interface adapter
+ * frwr_mr_init - Initialize one MR
+ * @r_xprt: controlling transport instance
* @mr: generic MR to prepare for FRWR
*
* Returns zero if successful. Otherwise a negative errno
* is returned.
*/
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
{
- unsigned int depth = ia->ri_max_frwr_depth;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ unsigned int depth = ep->re_max_fr_depth;
struct scatterlist *sg;
struct ib_mr *frmr;
- int rc;
- frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
+ sg = kcalloc_node(depth, sizeof(*sg), XPRTRDMA_GFP_FLAGS,
+ ibdev_to_node(ep->re_id->device));
+ if (!sg)
+ return -ENOMEM;
+
+ frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth);
if (IS_ERR(frmr))
goto out_mr_err;
- sg = kcalloc(depth, sizeof(*sg), GFP_NOFS);
- if (!sg)
- goto out_list_err;
-
- mr->frwr.fr_mr = frmr;
- mr->mr_dir = DMA_NONE;
+ mr->mr_xprt = r_xprt;
+ mr->mr_ibmr = frmr;
+ mr->mr_device = NULL;
INIT_LIST_HEAD(&mr->mr_list);
- init_completion(&mr->frwr.fr_linv_done);
+ init_completion(&mr->mr_linv_done);
+ frwr_cid_init(ep, mr);
sg_init_table(sg, depth);
mr->mr_sg = sg;
return 0;
out_mr_err:
- rc = PTR_ERR(frmr);
- trace_xprtrdma_frwr_alloc(mr, rc);
- return rc;
-
-out_list_err:
- ib_dereg_mr(frmr);
- return -ENOMEM;
+ kfree(sg);
+ trace_xprtrdma_frwr_alloc(mr, PTR_ERR(frmr));
+ return PTR_ERR(frmr);
}
/**
* frwr_query_device - Prepare a transport for use with FRWR
- * @r_xprt: controlling transport instance
+ * @ep: endpoint to fill in
* @device: RDMA device to query
*
* On success, sets:
- * ep->rep_attr
- * ep->rep_max_requests
- * ia->ri_max_rdma_segs
- *
- * And these FRWR-related fields:
- * ia->ri_max_frwr_depth
- * ia->ri_mrtype
+ * ep->re_attr
+ * ep->re_max_requests
+ * ep->re_max_rdma_segs
+ * ep->re_max_fr_depth
+ * ep->re_mrtype
*
* Return values:
* On success, returns zero.
* %-EINVAL - the device does not support FRWR memory registration
* %-ENOMEM - the device is not sufficiently capable for NFS/RDMA
*/
-int frwr_query_device(struct rpcrdma_xprt *r_xprt,
- const struct ib_device *device)
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
{
const struct ib_device_attr *attrs = &device->attrs;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
int max_qp_wr, depth, delta;
unsigned int max_sge;
@@ -188,23 +187,23 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge);
return -ENOMEM;
}
- ep->rep_attr.cap.max_send_sge = max_sge;
- ep->rep_attr.cap.max_recv_sge = 1;
+ ep->re_attr.cap.max_send_sge = max_sge;
+ ep->re_attr.cap.max_recv_sge = 1;
- ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
- if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
- ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
+ ep->re_mrtype = IB_MR_TYPE_MEM_REG;
+ if (attrs->kernel_cap_flags & IBK_SG_GAPS_REG)
+ ep->re_mrtype = IB_MR_TYPE_SG_GAPS;
/* Quirk: Some devices advertise a large max_fast_reg_page_list_len
* capability, but perform optimally when the MRs are not larger
* than a page.
*/
if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS)
- ia->ri_max_frwr_depth = attrs->max_sge_rd;
+ ep->re_max_fr_depth = attrs->max_sge_rd;
else
- ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
- if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
- ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
+ ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len;
+ if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS)
+ ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS;
/* Add room for frwr register and invalidate WRs.
* 1. FRWR reg WR for head
@@ -220,11 +219,11 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
/* Calculate N if the device max FRWR depth is smaller than
* RPCRDMA_MAX_DATA_SEGS.
*/
- if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) {
- delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth;
+ if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) {
+ delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth;
do {
depth += 2; /* FRWR reg + invalidate */
- delta -= ia->ri_max_frwr_depth;
+ delta -= ep->re_max_fr_depth;
} while (delta > 0);
}
@@ -233,34 +232,35 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
max_qp_wr -= 1;
if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
return -ENOMEM;
- if (ep->rep_max_requests > max_qp_wr)
- ep->rep_max_requests = max_qp_wr;
- ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
- if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
- ep->rep_max_requests = max_qp_wr / depth;
- if (!ep->rep_max_requests)
+ if (ep->re_max_requests > max_qp_wr)
+ ep->re_max_requests = max_qp_wr;
+ ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
+ if (ep->re_attr.cap.max_send_wr > max_qp_wr) {
+ ep->re_max_requests = max_qp_wr / depth;
+ if (!ep->re_max_requests)
return -ENOMEM;
- ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
+ ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
}
- ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
- ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests;
- ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
-
- ia->ri_max_rdma_segs =
- DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
+ ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+ ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
+ ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
+ ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+ ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH;
+ ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
+
+ ep->re_max_rdma_segs =
+ DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth);
/* Reply chunks require segments for head and tail buffers */
- ia->ri_max_rdma_segs += 2;
- if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
- ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
+ ep->re_max_rdma_segs += 2;
+ if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
+ ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
/* Ensure the underlying device is capable of conveying the
* largest r/wsize NFS will ask for. This guarantees that
* failing over from one RDMA device to another will not
* break NFS I/O.
*/
- if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS)
+ if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS)
return -ENOMEM;
return 0;
@@ -286,41 +286,36 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr *mr)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_reg_wr *reg_wr;
int i, n, dma_nents;
struct ib_mr *ibmr;
u8 key;
- if (nsegs > ia->ri_max_frwr_depth)
- nsegs = ia->ri_max_frwr_depth;
+ if (nsegs > ep->re_max_fr_depth)
+ nsegs = ep->re_max_fr_depth;
for (i = 0; i < nsegs;) {
- if (seg->mr_page)
- sg_set_page(&mr->mr_sg[i],
- seg->mr_page,
- seg->mr_len,
- offset_in_page(seg->mr_offset));
- else
- sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
- seg->mr_len);
+ sg_set_page(&mr->mr_sg[i], seg->mr_page,
+ seg->mr_len, seg->mr_offset);
++seg;
++i;
- if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
+ if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
continue;
- if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+ if ((i < nsegs && seg->mr_offset) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
mr->mr_dir = rpcrdma_data_dir(writing);
mr->mr_nents = i;
- dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents,
+ dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents,
mr->mr_dir);
if (!dma_nents)
goto out_dmamap_err;
+ mr->mr_device = ep->re_id->device;
- ibmr = mr->frwr.fr_mr;
+ ibmr = mr->mr_ibmr;
n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
if (n != dma_nents)
goto out_mapmr_err;
@@ -330,7 +325,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
key = (u8)(ibmr->rkey & 0x000000FF);
ib_update_fast_reg_key(ibmr, ++key);
- reg_wr = &mr->frwr.fr_regwr;
+ reg_wr = &mr->mr_regwr;
reg_wr->mr = ibmr;
reg_wr->key = ibmr->rkey;
reg_wr->access = writing ?
@@ -345,7 +340,6 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
return seg;
out_dmamap_err:
- mr->mr_dir = DMA_NONE;
trace_xprtrdma_frwr_sgerr(mr, i);
return ERR_PTR(-EIO);
@@ -356,54 +350,73 @@ out_mapmr_err:
/**
* frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed FastReg WR
*
+ * Each flushed MR gets destroyed after the QP has drained.
*/
static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
- struct rpcrdma_frwr *frwr =
- container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+ struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_fastreg(wc, frwr);
- /* The MR will get recycled when the associated req is retransmitted */
+ trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid);
+
+ rpcrdma_flush_disconnect(cq->cq_context, wc);
}
/**
- * frwr_send - post Send WR containing the RPC Call message
- * @ia: interface adapter
- * @req: Prepared RPC Call
+ * frwr_send - post Send WRs containing the RPC Call message
+ * @r_xprt: controlling transport instance
+ * @req: prepared RPC Call
*
* For FRWR, chain any FastReg WRs to the Send WR. Only a
* single ib_post_send call is needed to register memory
* and then post the Send WR.
*
- * Returns the result of ib_post_send.
+ * Returns the return code from ib_post_send.
+ *
+ * Caller must hold the transport send lock to ensure that the
+ * pointers to the transport's rdma_cm_id and QP are stable.
*/
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
- struct ib_send_wr *post_wr;
+ struct ib_send_wr *post_wr, *send_wr = &req->rl_wr;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct rpcrdma_mr *mr;
+ unsigned int num_wrs;
+ int ret;
- post_wr = &req->rl_wr;
+ num_wrs = 1;
+ post_wr = send_wr;
list_for_each_entry(mr, &req->rl_registered, mr_list) {
- struct rpcrdma_frwr *frwr;
-
- frwr = &mr->frwr;
-
- frwr->fr_cqe.done = frwr_wc_fastreg;
- frwr->fr_regwr.wr.next = post_wr;
- frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
- frwr->fr_regwr.wr.num_sge = 0;
- frwr->fr_regwr.wr.opcode = IB_WR_REG_MR;
- frwr->fr_regwr.wr.send_flags = 0;
+ trace_xprtrdma_mr_fastreg(mr);
+
+ mr->mr_cqe.done = frwr_wc_fastreg;
+ mr->mr_regwr.wr.next = post_wr;
+ mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe;
+ mr->mr_regwr.wr.num_sge = 0;
+ mr->mr_regwr.wr.opcode = IB_WR_REG_MR;
+ mr->mr_regwr.wr.send_flags = 0;
+ post_wr = &mr->mr_regwr.wr;
+ ++num_wrs;
+ }
- post_wr = &frwr->fr_regwr.wr;
+ if ((kref_read(&req->rl_kref) > 1) || num_wrs > ep->re_send_count) {
+ send_wr->send_flags |= IB_SEND_SIGNALED;
+ ep->re_send_count = min_t(unsigned int, ep->re_send_batch,
+ num_wrs - ep->re_send_count);
+ } else {
+ send_wr->send_flags &= ~IB_SEND_SIGNALED;
+ ep->re_send_count -= num_wrs;
}
- return ib_post_send(ia->ri_id->qp, post_wr, NULL);
+ trace_xprtrdma_post_send(req);
+ ret = ib_post_send(ep->re_id->qp, post_wr, NULL);
+ if (ret)
+ trace_xprtrdma_post_send_err(r_xprt, req, ret);
+ return ret;
}
/**
@@ -419,56 +432,54 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
list_for_each_entry(mr, mrs, mr_list)
if (mr->mr_handle == rep->rr_inv_rkey) {
list_del_init(&mr->mr_list);
- trace_xprtrdma_mr_remoteinv(mr);
- rpcrdma_mr_put(mr);
+ trace_xprtrdma_mr_reminv(mr);
+ frwr_mr_put(mr);
break; /* only one invalidated MR per RPC */
}
}
-static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
+static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
- if (wc->status != IB_WC_SUCCESS)
- frwr_mr_recycle(mr);
- else
- rpcrdma_mr_put(mr);
+ if (likely(wc->status == IB_WC_SUCCESS))
+ frwr_mr_put(mr);
}
/**
* frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
*/
static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
- struct rpcrdma_frwr *frwr =
- container_of(cqe, struct rpcrdma_frwr, fr_cqe);
- struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+ struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_li(wc, frwr);
- __frwr_release_mr(wc, mr);
+ trace_xprtrdma_wc_li(wc, &mr->mr_cid);
+ frwr_mr_done(wc, mr);
+
+ rpcrdma_flush_disconnect(cq->cq_context, wc);
}
/**
* frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
* Awaken anyone waiting for an MR to finish being fenced.
*/
static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
- struct rpcrdma_frwr *frwr =
- container_of(cqe, struct rpcrdma_frwr, fr_cqe);
- struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+ struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_li_wake(wc, frwr);
- __frwr_release_mr(wc, mr);
- complete(&frwr->fr_linv_done);
+ trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid);
+ frwr_mr_done(wc, mr);
+ complete(&mr->mr_linv_done);
+
+ rpcrdma_flush_disconnect(cq->cq_context, wc);
}
/**
@@ -485,8 +496,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *first, **prev, *last;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
const struct ib_send_wr *bad_wr;
- struct rpcrdma_frwr *frwr;
struct rpcrdma_mr *mr;
int rc;
@@ -495,85 +506,86 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* Chain the LOCAL_INV Work Requests and post them with
* a single ib_post_send() call.
*/
- frwr = NULL;
prev = &first;
- while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
-
+ mr = rpcrdma_mr_pop(&req->rl_registered);
+ do {
trace_xprtrdma_mr_localinv(mr);
r_xprt->rx_stats.local_inv_needed++;
- frwr = &mr->frwr;
- frwr->fr_cqe.done = frwr_wc_localinv;
- last = &frwr->fr_invwr;
+ last = &mr->mr_invwr;
last->next = NULL;
- last->wr_cqe = &frwr->fr_cqe;
+ last->wr_cqe = &mr->mr_cqe;
last->sg_list = NULL;
last->num_sge = 0;
last->opcode = IB_WR_LOCAL_INV;
last->send_flags = IB_SEND_SIGNALED;
last->ex.invalidate_rkey = mr->mr_handle;
+ last->wr_cqe->done = frwr_wc_localinv;
+
*prev = last;
prev = &last->next;
- }
+ } while ((mr = rpcrdma_mr_pop(&req->rl_registered)));
+
+ mr = container_of(last, struct rpcrdma_mr, mr_invwr);
/* Strong send queue ordering guarantees that when the
* last WR in the chain completes, all WRs in the chain
* are complete.
*/
- frwr->fr_cqe.done = frwr_wc_localinv_wake;
- reinit_completion(&frwr->fr_linv_done);
+ last->wr_cqe->done = frwr_wc_localinv_wake;
+ reinit_completion(&mr->mr_linv_done);
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
- * unless ri_id->qp is a valid pointer.
+ * unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(ep->re_id->qp, first, &bad_wr);
/* The final LOCAL_INV WR in the chain is supposed to
* do the wake. If it was never posted, the wake will
* not happen, so don't wait in that case.
*/
if (bad_wr != first)
- wait_for_completion(&frwr->fr_linv_done);
+ wait_for_completion(&mr->mr_linv_done);
if (!rc)
return;
- /* Recycle MRs in the LOCAL_INV chain that did not get posted.
+ /* On error, the MRs get destroyed once the QP has drained. */
+ trace_xprtrdma_post_linv_err(req, rc);
+
+ /* Force a connection loss to ensure complete recovery.
*/
- trace_xprtrdma_post_linv(req, rc);
- while (bad_wr) {
- frwr = container_of(bad_wr, struct rpcrdma_frwr,
- fr_invwr);
- mr = container_of(frwr, struct rpcrdma_mr, frwr);
- bad_wr = bad_wr->next;
-
- list_del_init(&mr->mr_list);
- frwr_mr_recycle(mr);
- }
+ rpcrdma_force_disconnect(ep);
}
/**
* frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
*/
static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
- struct rpcrdma_frwr *frwr =
- container_of(cqe, struct rpcrdma_frwr, fr_cqe);
- struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
- struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
+ struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
+ struct rpcrdma_rep *rep;
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_li_done(wc, frwr);
- __frwr_release_mr(wc, mr);
+ trace_xprtrdma_wc_li_done(wc, &mr->mr_cid);
- /* Ensure @rep is generated before __frwr_release_mr */
+ /* Ensure that @rep is generated before the MR is released */
+ rep = mr->mr_req->rl_reply;
smp_rmb();
+
+ if (wc->status != IB_WC_SUCCESS) {
+ if (rep)
+ rpcrdma_unpin_rqst(rep);
+ rpcrdma_flush_disconnect(cq->cq_context, wc);
+ return;
+ }
+ frwr_mr_put(mr);
rpcrdma_complete_rqst(rep);
}
@@ -590,66 +602,95 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *first, *last, **prev;
- const struct ib_send_wr *bad_wr;
- struct rpcrdma_frwr *frwr;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct rpcrdma_mr *mr;
int rc;
/* Chain the LOCAL_INV Work Requests and post them with
* a single ib_post_send() call.
*/
- frwr = NULL;
prev = &first;
- while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
-
+ mr = rpcrdma_mr_pop(&req->rl_registered);
+ do {
trace_xprtrdma_mr_localinv(mr);
r_xprt->rx_stats.local_inv_needed++;
- frwr = &mr->frwr;
- frwr->fr_cqe.done = frwr_wc_localinv;
- last = &frwr->fr_invwr;
+ last = &mr->mr_invwr;
last->next = NULL;
- last->wr_cqe = &frwr->fr_cqe;
+ last->wr_cqe = &mr->mr_cqe;
last->sg_list = NULL;
last->num_sge = 0;
last->opcode = IB_WR_LOCAL_INV;
last->send_flags = IB_SEND_SIGNALED;
last->ex.invalidate_rkey = mr->mr_handle;
+ last->wr_cqe->done = frwr_wc_localinv;
+
*prev = last;
prev = &last->next;
- }
+ } while ((mr = rpcrdma_mr_pop(&req->rl_registered)));
/* Strong send queue ordering guarantees that when the
* last WR in the chain completes, all WRs in the chain
* are complete. The last completion will wake up the
* RPC waiter.
*/
- frwr->fr_cqe.done = frwr_wc_localinv_done;
+ last->wr_cqe->done = frwr_wc_localinv_done;
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
- * unless ri_id->qp is a valid pointer.
+ * unless re_id->qp is a valid pointer.
*/
- bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(ep->re_id->qp, first, NULL);
if (!rc)
return;
- /* Recycle MRs in the LOCAL_INV chain that did not get posted.
- */
- trace_xprtrdma_post_linv(req, rc);
- while (bad_wr) {
- frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
- mr = container_of(frwr, struct rpcrdma_mr, frwr);
- bad_wr = bad_wr->next;
-
- frwr_mr_recycle(mr);
- }
+ /* On error, the MRs get destroyed once the QP has drained. */
+ trace_xprtrdma_post_linv_err(req, rc);
/* The final LOCAL_INV WR in the chain is supposed to
- * do the wake. If it was never posted, the wake will
- * not happen, so wake here in that case.
+ * do the wake. If it was never posted, the wake does
+ * not happen. Unpin the rqst in preparation for its
+ * retransmission.
+ */
+ rpcrdma_unpin_rqst(req->rl_reply);
+
+ /* Force a connection loss to ensure complete recovery.
*/
- rpcrdma_complete_rqst(req->rl_reply);
+ rpcrdma_force_disconnect(ep);
+}
+
+/**
+ * frwr_wp_create - Create an MR for padding Write chunks
+ * @r_xprt: transport resources to use
+ *
+ * Return 0 on success, negative errno on failure.
+ */
+int frwr_wp_create(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct rpcrdma_mr_seg seg;
+ struct rpcrdma_mr *mr;
+
+ mr = rpcrdma_mr_get(r_xprt);
+ if (!mr)
+ return -EAGAIN;
+ mr->mr_req = NULL;
+ ep->re_write_pad_mr = mr;
+
+ seg.mr_len = XDR_UNIT;
+ seg.mr_page = virt_to_page(ep->re_write_pad);
+ seg.mr_offset = offset_in_page(ep->re_write_pad);
+ if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr)))
+ return -EIO;
+ trace_xprtrdma_mr_fastreg(mr);
+
+ mr->mr_cqe.done = frwr_wc_fastreg;
+ mr->mr_regwr.wr.next = NULL;
+ mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe;
+ mr->mr_regwr.wr.num_sge = 0;
+ mr->mr_regwr.wr.opcode = IB_WR_REG_MR;
+ mr->mr_regwr.wr.send_flags = 0;
+
+ return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL);
}
diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c
index 620327c01302..45c5b41ac8dc 100644
--- a/net/sunrpc/xprtrdma/module.c
+++ b/net/sunrpc/xprtrdma/module.c
@@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("svcrdma");
MODULE_ALIAS("xprtrdma");
+MODULE_ALIAS("rpcrdma6");
static void __exit rpc_rdma_cleanup(void)
{
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 28020ec104d4..190a4de239c8 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
- * Copyright (c) 2014-2017 Oracle. All rights reserved.
+ * Copyright (c) 2014-2020, Oracle and/or its affiliates.
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
@@ -54,10 +54,6 @@
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_TRANS
-#endif
-
/* Returns size of largest RPC-over-RDMA header in a Call message
*
* The largest Call header contains a full-size Read list and a
@@ -71,7 +67,7 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
size = RPCRDMA_HDRLEN_MIN;
/* Maximum Read list size */
- size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);
+ size += maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);
/* Minimal Read chunk size */
size += sizeof(__be32); /* segment count */
@@ -94,7 +90,7 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
size = RPCRDMA_HDRLEN_MIN;
/* Maximum Write list size */
- size = sizeof(__be32); /* segment count */
+ size += sizeof(__be32); /* segment count */
size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
size += sizeof(__be32); /* list discriminator */
@@ -103,21 +99,20 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
/**
* rpcrdma_set_max_header_sizes - Initialize inline payload sizes
- * @r_xprt: transport instance to initialize
+ * @ep: endpoint to initialize
*
* The max_inline fields contain the maximum size of an RPC message
* so the marshaling code doesn't have to repeat this calculation
* for every RPC.
*/
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
{
- unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ unsigned int maxsegs = ep->re_max_rdma_segs;
- ep->rep_max_inline_send =
- ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs);
- ep->rep_max_inline_recv =
- ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
+ ep->re_max_inline_send =
+ ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs);
+ ep->re_max_inline_recv =
+ ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
}
/* The client can send a request inline as long as the RPCRDMA header
@@ -132,9 +127,10 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
struct xdr_buf *xdr = &rqst->rq_snd_buf;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned int count, remaining, offset;
- if (xdr->len > r_xprt->rx_ep.rep_max_inline_send)
+ if (xdr->len > ep->re_max_inline_send)
return false;
if (xdr->page_len) {
@@ -145,7 +141,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
remaining -= min_t(unsigned int,
PAGE_SIZE - offset, remaining);
offset = 0;
- if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge)
+ if (++count > ep->re_attr.cap.max_send_sge)
return false;
}
}
@@ -162,7 +158,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
- return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv;
+ return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv;
}
/* The client is required to provide a Reply chunk if the maximum
@@ -176,12 +172,35 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
const struct xdr_buf *buf = &rqst->rq_rcv_buf;
return (buf->head[0].iov_len + buf->tail[0].iov_len) <
- r_xprt->rx_ep.rep_max_inline_recv;
+ r_xprt->rx_ep->re_max_inline_recv;
+}
+
+/* ACL likes to be lazy in allocating pages. For TCP, these
+ * pages can be allocated during receive processing. Not true
+ * for RDMA, which must always provision receive buffers
+ * up front.
+ */
+static noinline int
+rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
+{
+ struct page **ppages;
+ int len;
+
+ len = buf->page_len;
+ ppages = buf->pages + (buf->page_base >> PAGE_SHIFT);
+ while (len > 0) {
+ if (!*ppages)
+ *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
+ if (!*ppages)
+ return -ENOBUFS;
+ ppages++;
+ len -= PAGE_SIZE;
+ }
+
+ return 0;
}
-/* Split @vec on page boundaries into SGEs. FMR registers pages, not
- * a byte range. Other modes coalesce these SGEs into a single MR
- * when they can.
+/* Convert @vec to a single SGL element.
*
* Returns pointer to next available SGE, and bumps the total number
* of SGEs consumed.
@@ -190,22 +209,11 @@ static struct rpcrdma_mr_seg *
rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
unsigned int *n)
{
- u32 remaining, page_offset;
- char *base;
-
- base = vec->iov_base;
- page_offset = offset_in_page(base);
- remaining = vec->iov_len;
- while (remaining) {
- seg->mr_page = NULL;
- seg->mr_offset = base;
- seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
- remaining -= seg->mr_len;
- base += seg->mr_len;
- ++seg;
- ++(*n);
- page_offset = 0;
- }
+ seg->mr_page = virt_to_page(vec->iov_base);
+ seg->mr_offset = offset_in_page(vec->iov_base);
+ seg->mr_len = vec->iov_len;
+ ++seg;
+ ++(*n);
return seg;
}
@@ -233,17 +241,8 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
page_base = offset_in_page(xdrbuf->page_base);
while (len) {
- /* ACL likes to be lazy in allocating pages - ACLs
- * are small by default but can get huge.
- */
- if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) {
- if (!*ppages)
- *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
- if (!*ppages)
- return -ENOBUFS;
- }
seg->mr_page = *ppages;
- seg->mr_offset = (char *)page_base;
+ seg->mr_offset = page_base;
seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
len -= seg->mr_len;
++ppages;
@@ -252,22 +251,11 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
page_base = 0;
}
- /* When encoding a Read chunk, the tail iovec contains an
- * XDR pad and may be omitted.
- */
- if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
- goto out;
-
- /* When encoding a Write chunk, some servers need to see an
- * extra segment for non-XDR-aligned Write chunks. The upper
- * layer provides space in the tail iovec that may be used
- * for this purpose.
- */
- if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
+ if (type == rpcrdma_readch || type == rpcrdma_writech)
goto out;
if (xdrbuf->tail[0].iov_len)
- seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
+ rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
out:
if (unlikely(n > RPCRDMA_MAX_SEGS))
@@ -275,40 +263,6 @@ out:
return n;
}
-static inline int
-encode_item_present(struct xdr_stream *xdr)
-{
- __be32 *p;
-
- p = xdr_reserve_space(xdr, sizeof(*p));
- if (unlikely(!p))
- return -EMSGSIZE;
-
- *p = xdr_one;
- return 0;
-}
-
-static inline int
-encode_item_not_present(struct xdr_stream *xdr)
-{
- __be32 *p;
-
- p = xdr_reserve_space(xdr, sizeof(*p));
- if (unlikely(!p))
- return -EMSGSIZE;
-
- *p = xdr_zero;
- return 0;
-}
-
-static void
-xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr)
-{
- *iptr++ = cpu_to_be32(mr->mr_handle);
- *iptr++ = cpu_to_be32(mr->mr_length);
- xdr_encode_hyper(iptr, mr->mr_offset);
-}
-
static int
encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
{
@@ -318,7 +272,7 @@ encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
if (unlikely(!p))
return -EMSGSIZE;
- xdr_encode_rdma_segment(p, mr);
+ xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset);
return 0;
}
@@ -333,8 +287,8 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
return -EMSGSIZE;
*p++ = xdr_one; /* Item present */
- *p++ = cpu_to_be32(position);
- xdr_encode_rdma_segment(p, mr);
+ xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length,
+ mr->mr_offset);
return 0;
}
@@ -349,7 +303,6 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
*mr = rpcrdma_mr_get(r_xprt);
if (!*mr)
goto out_getmr_err;
- trace_xprtrdma_mr_get(req);
(*mr)->mr_req = req;
}
@@ -357,7 +310,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
out_getmr_err:
- trace_xprtrdma_nomrs(req);
+ trace_xprtrdma_nomrs_err(r_xprt, req);
xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
rpcrdma_mrs_refresh(r_xprt);
return ERR_PTR(-EAGAIN);
@@ -414,7 +367,9 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
} while (nsegs);
done:
- return encode_item_not_present(xdr);
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return -EMSGSIZE;
+ return 0;
}
/* Register and XDR encode the Write list. Supports encoding a list
@@ -438,6 +393,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
enum rpcrdma_chunktype wtype)
{
struct xdr_stream *xdr = &req->rl_stream;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct rpcrdma_mr_seg *seg;
struct rpcrdma_mr *mr;
int nsegs, nchunks;
@@ -453,7 +409,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
if (nsegs < 0)
return nsegs;
- if (encode_item_present(xdr) < 0)
+ if (xdr_stream_encode_item_present(xdr) < 0)
return -EMSGSIZE;
segcount = xdr_reserve_space(xdr, sizeof(*segcount));
if (unlikely(!segcount))
@@ -476,11 +432,25 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
nsegs -= mr->mr_nents;
} while (nsegs);
+ if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) {
+ if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0)
+ return -EMSGSIZE;
+
+ trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr,
+ nsegs);
+ r_xprt->rx_stats.write_chunk_count++;
+ r_xprt->rx_stats.total_rdma_request += mr->mr_length;
+ nchunks++;
+ nsegs -= mr->mr_nents;
+ }
+
/* Update count of segments in this Write chunk */
*segcount = cpu_to_be32(nchunks);
done:
- return encode_item_not_present(xdr);
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return -EMSGSIZE;
+ return 0;
}
/* Register and XDR encode the Reply chunk. Supports encoding an array
@@ -506,15 +476,18 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
int nsegs, nchunks;
__be32 *segcount;
- if (wtype != rpcrdma_replych)
- return encode_item_not_present(xdr);
+ if (wtype != rpcrdma_replych) {
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return -EMSGSIZE;
+ return 0;
+ }
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
return nsegs;
- if (encode_item_present(xdr) < 0)
+ if (xdr_stream_encode_item_present(xdr) < 0)
return -EMSGSIZE;
segcount = xdr_reserve_space(xdr, sizeof(*segcount));
if (unlikely(!segcount))
@@ -894,6 +867,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
__be32 *p;
int ret;
+ if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) {
+ ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf);
+ if (ret)
+ return ret;
+ }
+
rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf),
rqst);
@@ -911,8 +890,8 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
* or privacy, direct data placement of individual data items
* is not allowed.
*/
- ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
- RPCAUTH_AUTH_DATATOUCH);
+ ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH,
+ &rqst->rq_cred->cr_auth->au_flags);
/*
* Chunks needed for results?
@@ -1142,6 +1121,7 @@ static bool
rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
{
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
struct xdr_stream *xdr = &rep->rr_stream;
__be32 *p;
@@ -1152,11 +1132,11 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
p = xdr_inline_decode(xdr, 0);
/* Chunk lists */
- if (*p++ != xdr_zero)
+ if (xdr_item_is_present(p++))
return false;
- if (*p++ != xdr_zero)
+ if (xdr_item_is_present(p++))
return false;
- if (*p++ != xdr_zero)
+ if (xdr_item_is_present(p++))
return false;
/* RPC header */
@@ -1165,19 +1145,19 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
if (*p != cpu_to_be32(RPC_CALL))
return false;
+ /* No bc service. */
+ if (xprt->bc_serv == NULL)
+ return false;
+
/* Now that we are sure this is a backchannel call,
* advance to the RPC header.
*/
p = xdr_inline_decode(xdr, 3 * sizeof(*p));
if (unlikely(!p))
- goto out_short;
+ return true;
rpcrdma_bc_receive_call(r_xprt, rep);
return true;
-
-out_short:
- pr_warn("RPC/RDMA short backward direction call\n");
- return true;
}
#else /* CONFIG_SUNRPC_BACKCHANNEL */
{
@@ -1195,10 +1175,7 @@ static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
if (unlikely(!p))
return -EIO;
- handle = be32_to_cpup(p++);
- *length = be32_to_cpup(p++);
- xdr_decode_hyper(p, &offset);
-
+ xdr_decode_rdma_segment(p, &handle, length, &offset);
trace_xprtrdma_decode_seg(handle, *length, offset);
return 0;
}
@@ -1234,7 +1211,7 @@ static int decode_read_list(struct xdr_stream *xdr)
p = xdr_inline_decode(xdr, sizeof(*p));
if (unlikely(!p))
return -EIO;
- if (unlikely(*p != xdr_zero))
+ if (unlikely(xdr_item_is_present(p)))
return -EIO;
return 0;
}
@@ -1253,7 +1230,7 @@ static int decode_write_list(struct xdr_stream *xdr, u32 *length)
p = xdr_inline_decode(xdr, sizeof(*p));
if (unlikely(!p))
return -EIO;
- if (*p == xdr_zero)
+ if (xdr_item_is_absent(p))
break;
if (!first)
return -EIO;
@@ -1275,7 +1252,7 @@ static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
return -EIO;
*length = 0;
- if (*p != xdr_zero)
+ if (xdr_item_is_present(p))
if (decode_write_chunk(xdr, length))
return -EIO;
return 0;
@@ -1352,29 +1329,47 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
p = xdr_inline_decode(xdr, 2 * sizeof(*p));
if (!p)
break;
- dprintk("RPC: %s: server reports "
- "version error (%u-%u), xid %08x\n", __func__,
- be32_to_cpup(p), be32_to_cpu(*(p + 1)),
- be32_to_cpu(rep->rr_xid));
+ trace_xprtrdma_err_vers(rqst, p, p + 1);
break;
case err_chunk:
- dprintk("RPC: %s: server reports "
- "header decoding error, xid %08x\n", __func__,
- be32_to_cpu(rep->rr_xid));
+ trace_xprtrdma_err_chunk(rqst);
break;
default:
- dprintk("RPC: %s: server reports "
- "unrecognized error %d, xid %08x\n", __func__,
- be32_to_cpup(p), be32_to_cpu(rep->rr_xid));
+ trace_xprtrdma_err_unrecognized(rqst, p);
}
- r_xprt->rx_stats.bad_reply_count++;
- return -EREMOTEIO;
+ return -EIO;
+}
+
+/**
+ * rpcrdma_unpin_rqst - Release rqst without completing it
+ * @rep: RPC/RDMA Receive context
+ *
+ * This is done when a connection is lost so that a Reply
+ * can be dropped and its matching Call can be subsequently
+ * retransmitted on a new connection.
+ */
+void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep)
+{
+ struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt;
+ struct rpc_rqst *rqst = rep->rr_rqst;
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+
+ req->rl_reply = NULL;
+ rep->rr_rqst = NULL;
+
+ spin_lock(&xprt->queue_lock);
+ xprt_unpin_rqst(rqst);
+ spin_unlock(&xprt->queue_lock);
}
-/* Perform XID lookup, reconstruction of the RPC reply, and
- * RPC completion while holding the transport lock to ensure
- * the rep, rqst, and rq_task pointers remain stable.
+/**
+ * rpcrdma_complete_rqst - Pass completed rqst back to RPC
+ * @rep: RPC/RDMA Receive context
+ *
+ * Reconstruct the RPC reply and complete the transaction
+ * while @rqst is still pinned to ensure the rep, rqst, and
+ * rq_task pointers remain stable.
*/
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
{
@@ -1406,13 +1401,11 @@ out:
spin_unlock(&xprt->queue_lock);
return;
-/* If the incoming reply terminated a pending RPC, the next
- * RPC call will post a replacement receive buffer as it is
- * being marshaled.
- */
out_badheader:
- trace_xprtrdma_reply_hdr(rep);
+ trace_xprtrdma_reply_hdr_err(rep);
r_xprt->rx_stats.bad_reply_count++;
+ rqst->rq_task->tk_status = status;
+ status = 0;
goto out;
}
@@ -1476,21 +1469,20 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
if (credits == 0)
credits = 1; /* don't deadlock */
- else if (credits > r_xprt->rx_ep.rep_max_requests)
- credits = r_xprt->rx_ep.rep_max_requests;
+ else if (credits > r_xprt->rx_ep->re_max_requests)
+ credits = r_xprt->rx_ep->re_max_requests;
+ rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1),
+ false);
if (buf->rb_credits != credits)
rpcrdma_update_cwnd(r_xprt, credits);
- rpcrdma_post_recvs(r_xprt, false);
req = rpcr_to_rdmar(rqst);
- if (req->rl_reply) {
- trace_xprtrdma_leaked_rep(rqst, req->rl_reply);
- rpcrdma_recv_buffer_put(req->rl_reply);
- }
+ if (unlikely(req->rl_reply))
+ rpcrdma_rep_put(buf, req->rl_reply);
req->rl_reply = rep;
rep->rr_rqst = rqst;
- trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
+ trace_xprtrdma_reply(rqst->rq_task, rep, credits);
if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
frwr_reminv(rep, &req->rl_registered);
@@ -1502,17 +1494,17 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
return;
out_badversion:
- trace_xprtrdma_reply_vers(rep);
+ trace_xprtrdma_reply_vers_err(rep);
goto out;
out_norqst:
spin_unlock(&xprt->queue_lock);
- trace_xprtrdma_reply_rqst(rep);
+ trace_xprtrdma_reply_rqst_err(rep);
goto out;
out_shortreply:
- trace_xprtrdma_reply_short(rep);
+ trace_xprtrdma_reply_short_err(rep);
out:
- rpcrdma_recv_buffer_put(rep);
+ rpcrdma_rep_put(buf, rep);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 97bca509a391..5bc20e9d09cd 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -62,52 +62,47 @@ static unsigned int max_max_requests = 16384;
unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
+static unsigned int svcrdma_stat_unused;
+static unsigned int zero;
-atomic_t rdma_stat_recv;
-atomic_t rdma_stat_read;
-atomic_t rdma_stat_write;
-atomic_t rdma_stat_sq_starve;
-atomic_t rdma_stat_rq_starve;
-atomic_t rdma_stat_rq_poll;
-atomic_t rdma_stat_rq_prod;
-atomic_t rdma_stat_sq_poll;
-atomic_t rdma_stat_sq_prod;
+struct percpu_counter svcrdma_stat_read;
+struct percpu_counter svcrdma_stat_recv;
+struct percpu_counter svcrdma_stat_sq_starve;
+struct percpu_counter svcrdma_stat_write;
-/*
- * This function implements reading and resetting an atomic_t stat
- * variable through read/write to a proc file. Any write to the file
- * resets the associated statistic to zero. Any read returns it's
- * current value.
- */
-static int read_reset_stat(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+enum {
+ SVCRDMA_COUNTER_BUFSIZ = sizeof(unsigned long long),
+};
+
+static int svcrdma_counter_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- atomic_t *stat = (atomic_t *)table->data;
-
- if (!stat)
- return -EINVAL;
-
- if (write)
- atomic_set(stat, 0);
- else {
- char str_buf[32];
- int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
- if (len >= 32)
- return -EFAULT;
- len = strlen(str_buf);
- if (*ppos > len) {
- *lenp = 0;
- return 0;
- }
- len -= *ppos;
- if (len > *lenp)
- len = *lenp;
- if (len && copy_to_user(buffer, str_buf, len))
- return -EFAULT;
- *lenp = len;
- *ppos += len;
+ struct percpu_counter *stat = (struct percpu_counter *)table->data;
+ char tmp[SVCRDMA_COUNTER_BUFSIZ + 1];
+ int len;
+
+ if (write) {
+ percpu_counter_set(stat, 0);
+ return 0;
}
+
+ len = snprintf(tmp, SVCRDMA_COUNTER_BUFSIZ, "%lld\n",
+ percpu_counter_sum_positive(stat));
+ if (len >= SVCRDMA_COUNTER_BUFSIZ)
+ return -EFAULT;
+ len = strlen(tmp);
+ if (*ppos > len) {
+ *lenp = 0;
+ return 0;
+ }
+ len -= *ppos;
+ if (len > *lenp)
+ len = *lenp;
+ if (len)
+ memcpy(buffer, tmp, len);
+ *lenp = len;
+ *ppos += len;
+
return 0;
}
@@ -143,66 +138,76 @@ static struct ctl_table svcrdma_parm_table[] = {
{
.procname = "rdma_stat_read",
- .data = &rdma_stat_read,
- .maxlen = sizeof(atomic_t),
+ .data = &svcrdma_stat_read,
+ .maxlen = SVCRDMA_COUNTER_BUFSIZ,
.mode = 0644,
- .proc_handler = read_reset_stat,
+ .proc_handler = svcrdma_counter_handler,
},
{
.procname = "rdma_stat_recv",
- .data = &rdma_stat_recv,
- .maxlen = sizeof(atomic_t),
+ .data = &svcrdma_stat_recv,
+ .maxlen = SVCRDMA_COUNTER_BUFSIZ,
.mode = 0644,
- .proc_handler = read_reset_stat,
+ .proc_handler = svcrdma_counter_handler,
},
{
.procname = "rdma_stat_write",
- .data = &rdma_stat_write,
- .maxlen = sizeof(atomic_t),
+ .data = &svcrdma_stat_write,
+ .maxlen = SVCRDMA_COUNTER_BUFSIZ,
.mode = 0644,
- .proc_handler = read_reset_stat,
+ .proc_handler = svcrdma_counter_handler,
},
{
.procname = "rdma_stat_sq_starve",
- .data = &rdma_stat_sq_starve,
- .maxlen = sizeof(atomic_t),
+ .data = &svcrdma_stat_sq_starve,
+ .maxlen = SVCRDMA_COUNTER_BUFSIZ,
.mode = 0644,
- .proc_handler = read_reset_stat,
+ .proc_handler = svcrdma_counter_handler,
},
{
.procname = "rdma_stat_rq_starve",
- .data = &rdma_stat_rq_starve,
- .maxlen = sizeof(atomic_t),
+ .data = &svcrdma_stat_unused,
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = read_reset_stat,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &zero,
},
{
.procname = "rdma_stat_rq_poll",
- .data = &rdma_stat_rq_poll,
- .maxlen = sizeof(atomic_t),
+ .data = &svcrdma_stat_unused,
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = read_reset_stat,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &zero,
},
{
.procname = "rdma_stat_rq_prod",
- .data = &rdma_stat_rq_prod,
- .maxlen = sizeof(atomic_t),
+ .data = &svcrdma_stat_unused,
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = read_reset_stat,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &zero,
},
{
.procname = "rdma_stat_sq_poll",
- .data = &rdma_stat_sq_poll,
- .maxlen = sizeof(atomic_t),
+ .data = &svcrdma_stat_unused,
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = read_reset_stat,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &zero,
},
{
.procname = "rdma_stat_sq_prod",
- .data = &rdma_stat_sq_prod,
- .maxlen = sizeof(atomic_t),
+ .data = &svcrdma_stat_unused,
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = read_reset_stat,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &zero,
},
{ },
};
@@ -225,27 +230,69 @@ static struct ctl_table svcrdma_root_table[] = {
{ },
};
+static void svc_rdma_proc_cleanup(void)
+{
+ if (!svcrdma_table_header)
+ return;
+ unregister_sysctl_table(svcrdma_table_header);
+ svcrdma_table_header = NULL;
+
+ percpu_counter_destroy(&svcrdma_stat_write);
+ percpu_counter_destroy(&svcrdma_stat_sq_starve);
+ percpu_counter_destroy(&svcrdma_stat_recv);
+ percpu_counter_destroy(&svcrdma_stat_read);
+}
+
+static int svc_rdma_proc_init(void)
+{
+ int rc;
+
+ if (svcrdma_table_header)
+ return 0;
+
+ rc = percpu_counter_init(&svcrdma_stat_read, 0, GFP_KERNEL);
+ if (rc)
+ goto out_err;
+ rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL);
+ if (rc)
+ goto out_err;
+ rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL);
+ if (rc)
+ goto out_err;
+ rc = percpu_counter_init(&svcrdma_stat_write, 0, GFP_KERNEL);
+ if (rc)
+ goto out_err;
+
+ svcrdma_table_header = register_sysctl_table(svcrdma_root_table);
+ return 0;
+
+out_err:
+ percpu_counter_destroy(&svcrdma_stat_sq_starve);
+ percpu_counter_destroy(&svcrdma_stat_recv);
+ percpu_counter_destroy(&svcrdma_stat_read);
+ return rc;
+}
+
void svc_rdma_cleanup(void)
{
dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
- if (svcrdma_table_header) {
- unregister_sysctl_table(svcrdma_table_header);
- svcrdma_table_header = NULL;
- }
svc_unreg_xprt_class(&svc_rdma_class);
+ svc_rdma_proc_cleanup();
}
int svc_rdma_init(void)
{
+ int rc;
+
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
- if (!svcrdma_table_header)
- svcrdma_table_header =
- register_sysctl_table(svcrdma_root_table);
+ rc = svc_rdma_proc_init();
+ if (rc)
+ return rc;
/* Register RDMA with the SVC transport switch */
svc_reg_xprt_class(&svc_rdma_class);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 908e78bb87c6..aa2227a7e552 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -2,7 +2,7 @@
/*
* Copyright (c) 2015-2018 Oracle. All rights reserved.
*
- * Support for backward direction RPCs on RPC/RDMA (server-side).
+ * Support for reverse-direction RPCs on RPC/RDMA (server-side).
*/
#include <linux/sunrpc/svc_rdma.h>
@@ -10,59 +10,34 @@
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
-#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-
-#undef SVCRDMA_BACKCHANNEL_DEBUG
-
/**
- * svc_rdma_handle_bc_reply - Process incoming backchannel reply
- * @xprt: controlling backchannel transport
- * @rdma_resp: pointer to incoming transport header
- * @rcvbuf: XDR buffer into which to decode the reply
+ * svc_rdma_handle_bc_reply - Process incoming backchannel Reply
+ * @rqstp: resources for handling the Reply
+ * @rctxt: Received message
*
- * Returns:
- * %0 if @rcvbuf is filled in, xprt_complete_rqst called,
- * %-EAGAIN if server should call ->recvfrom again.
*/
-int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
- struct xdr_buf *rcvbuf)
+void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp,
+ struct svc_rdma_recv_ctxt *rctxt)
{
+ struct svc_xprt *sxprt = rqstp->rq_xprt;
+ struct rpc_xprt *xprt = sxprt->xpt_bc_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct xdr_buf *rcvbuf = &rqstp->rq_arg;
struct kvec *dst, *src = &rcvbuf->head[0];
+ __be32 *rdma_resp = rctxt->rc_recv_buf;
struct rpc_rqst *req;
u32 credits;
- size_t len;
- __be32 xid;
- __be32 *p;
- int ret;
-
- p = (__be32 *)src->iov_base;
- len = src->iov_len;
- xid = *rdma_resp;
-
-#ifdef SVCRDMA_BACKCHANNEL_DEBUG
- pr_info("%s: xid=%08x, length=%zu\n",
- __func__, be32_to_cpu(xid), len);
- pr_info("%s: RPC/RDMA: %*ph\n",
- __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp);
- pr_info("%s: RPC: %*ph\n",
- __func__, (int)len, p);
-#endif
-
- ret = -EAGAIN;
- if (src->iov_len < 24)
- goto out_shortreply;
spin_lock(&xprt->queue_lock);
- req = xprt_lookup_rqst(xprt, xid);
+ req = xprt_lookup_rqst(xprt, *rdma_resp);
if (!req)
- goto out_notfound;
+ goto out_unlock;
dst = &req->rq_private_buf.head[0];
memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
- if (dst->iov_len < len)
+ if (dst->iov_len < src->iov_len)
goto out_unlock;
- memcpy(dst->iov_base, p, len);
+ memcpy(dst->iov_base, src->iov_base, src->iov_len);
xprt_pin_rqst(req);
spin_unlock(&xprt->queue_lock);
@@ -71,34 +46,20 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
credits = r_xprt->rx_buf.rb_bc_max_requests;
-
spin_lock(&xprt->transport_lock);
xprt->cwnd = credits << RPC_CWNDSHIFT;
spin_unlock(&xprt->transport_lock);
spin_lock(&xprt->queue_lock);
- ret = 0;
xprt_complete_rqst(req->rq_task, rcvbuf->len);
xprt_unpin_rqst(req);
rcvbuf->len = 0;
out_unlock:
spin_unlock(&xprt->queue_lock);
-out:
- return ret;
-
-out_shortreply:
- dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n",
- xprt, src->iov_len);
- goto out;
-
-out_notfound:
- dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n",
- xprt, be32_to_cpu(xid));
- goto out_unlock;
}
-/* Send a backwards direction RPC call.
+/* Send a reverse-direction RPC Call.
*
* Caller holds the connection's mutex and has already marshaled
* the RPC/RDMA request.
@@ -113,11 +74,17 @@ out_notfound:
*/
static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
struct rpc_rqst *rqst,
- struct svc_rdma_send_ctxt *ctxt)
+ struct svc_rdma_send_ctxt *sctxt)
{
+ struct svc_rdma_recv_ctxt *rctxt;
int ret;
- ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqst->rq_snd_buf, NULL);
+ rctxt = svc_rdma_recv_ctxt_get(rdma);
+ if (!rctxt)
+ return -EIO;
+
+ ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf);
+ svc_rdma_recv_ctxt_put(rdma, rctxt);
if (ret < 0)
return -EIO;
@@ -125,8 +92,14 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
* the rq_buffer before all retransmits are complete.
*/
get_page(virt_to_page(rqst->rq_buffer));
- ctxt->sc_send_wr.opcode = IB_WR_SEND;
- return svc_rdma_send(rdma, &ctxt->sc_send_wr);
+ sctxt->sc_send_wr.opcode = IB_WR_SEND;
+ ret = svc_rdma_send(rdma, sctxt);
+ if (ret < 0)
+ return ret;
+
+ ret = wait_for_completion_killable(&sctxt->sc_done);
+ svc_rdma_send_ctxt_put(rdma, sctxt);
+ return ret;
}
/* Server-side transport endpoint wants a whole page for its send
@@ -146,12 +119,12 @@ xprt_rdma_bc_allocate(struct rpc_task *task)
return -EINVAL;
}
- page = alloc_page(RPCRDMA_DEF_GFP);
+ page = alloc_page(GFP_NOIO | __GFP_NOWARN);
if (!page)
return -ENOMEM;
rqst->rq_buffer = page_address(page);
- rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP);
+ rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, GFP_NOIO | __GFP_NOWARN);
if (!rqst->rq_rbuffer) {
put_page(page);
return -ENOMEM;
@@ -181,7 +154,9 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
if (!ctxt)
goto drop_connection;
- p = ctxt->sc_xprt_buf;
+ p = xdr_reserve_space(&ctxt->sc_stream, RPCRDMA_HDRLEN_MIN);
+ if (!p)
+ goto put_ctxt;
*p++ = rqst->rq_xid;
*p++ = rpcrdma_version;
*p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
@@ -189,67 +164,55 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
*p++ = xdr_zero;
*p++ = xdr_zero;
*p = xdr_zero;
- svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_MIN);
-
-#ifdef SVCRDMA_BACKCHANNEL_DEBUG
- pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
-#endif
rqst->rq_xtime = ktime_get();
rc = svc_rdma_bc_sendto(rdma, rqst, ctxt);
- if (rc) {
- svc_rdma_send_ctxt_put(rdma, ctxt);
- goto drop_connection;
- }
+ if (rc)
+ goto put_ctxt;
return 0;
+put_ctxt:
+ svc_rdma_send_ctxt_put(rdma, ctxt);
+
drop_connection:
- dprintk("svcrdma: failed to send bc call\n");
return -ENOTCONN;
}
-/* Send an RPC call on the passive end of a transport
- * connection.
+/**
+ * xprt_rdma_bc_send_request - Send a reverse-direction Call
+ * @rqst: rpc_rqst containing Call message to be sent
+ *
+ * Return values:
+ * %0 if the message was sent successfully
+ * %ENOTCONN if the message was not sent
*/
-static int
-xprt_rdma_bc_send_request(struct rpc_rqst *rqst)
+static int xprt_rdma_bc_send_request(struct rpc_rqst *rqst)
{
struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
- struct svcxprt_rdma *rdma;
+ struct svcxprt_rdma *rdma =
+ container_of(sxprt, struct svcxprt_rdma, sc_xprt);
int ret;
- dprintk("svcrdma: sending bc call with xid: %08x\n",
- be32_to_cpu(rqst->rq_xid));
-
- mutex_lock(&sxprt->xpt_mutex);
-
- ret = -ENOTCONN;
- rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
- if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) {
- ret = rpcrdma_bc_send_request(rdma, rqst);
- if (ret == -ENOTCONN)
- svc_close_xprt(sxprt);
- }
-
- mutex_unlock(&sxprt->xpt_mutex);
+ if (test_bit(XPT_DEAD, &sxprt->xpt_flags))
+ return -ENOTCONN;
- if (ret < 0)
- return ret;
- return 0;
+ ret = rpcrdma_bc_send_request(rdma, rqst);
+ if (ret == -ENOTCONN)
+ svc_xprt_close(sxprt);
+ return ret;
}
static void
xprt_rdma_bc_close(struct rpc_xprt *xprt)
{
- dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+ xprt_disconnect_done(xprt);
xprt->cwnd = RPC_CWNDSHIFT;
}
static void
xprt_rdma_bc_put(struct rpc_xprt *xprt)
{
- dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
-
+ xprt_rdma_free_addresses(xprt);
xprt_free(xprt);
}
@@ -283,26 +246,21 @@ xprt_setup_rdma_bc(struct xprt_create *args)
struct rpc_xprt *xprt;
struct rpcrdma_xprt *new_xprt;
- if (args->addrlen > sizeof(xprt->addr)) {
- dprintk("RPC: %s: address too large\n", __func__);
+ if (args->addrlen > sizeof(xprt->addr))
return ERR_PTR(-EBADF);
- }
xprt = xprt_alloc(args->net, sizeof(*new_xprt),
RPCRDMA_MAX_BC_REQUESTS,
RPCRDMA_MAX_BC_REQUESTS);
- if (!xprt) {
- dprintk("RPC: %s: couldn't allocate rpc_xprt\n",
- __func__);
+ if (!xprt)
return ERR_PTR(-ENOMEM);
- }
xprt->timeout = &xprt_rdma_bc_timeout;
xprt_set_bound(xprt);
xprt_set_connected(xprt);
- xprt->bind_timeout = RPCRDMA_BIND_TO;
- xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
- xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
+ xprt->bind_timeout = 0;
+ xprt->reestablish_timeout = 0;
+ xprt->idle_timeout = 0;
xprt->prot = XPRT_TRANSPORT_BC_RDMA;
xprt->ops = &xprt_rdma_bc_procs;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
new file mode 100644
index 000000000000..b63cfeaa2923
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Oracle. All rights reserved.
+ */
+
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/rpc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+/**
+ * pcl_free - Release all memory associated with a parsed chunk list
+ * @pcl: parsed chunk list
+ *
+ */
+void pcl_free(struct svc_rdma_pcl *pcl)
+{
+ while (!list_empty(&pcl->cl_chunks)) {
+ struct svc_rdma_chunk *chunk;
+
+ chunk = pcl_first_chunk(pcl);
+ list_del(&chunk->ch_list);
+ kfree(chunk);
+ }
+}
+
+static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position)
+{
+ struct svc_rdma_chunk *chunk;
+
+ chunk = kmalloc(struct_size(chunk, ch_segments, segcount), GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ chunk->ch_position = position;
+ chunk->ch_length = 0;
+ chunk->ch_payload_length = 0;
+ chunk->ch_segcount = 0;
+ return chunk;
+}
+
+static struct svc_rdma_chunk *
+pcl_lookup_position(struct svc_rdma_pcl *pcl, u32 position)
+{
+ struct svc_rdma_chunk *pos;
+
+ pcl_for_each_chunk(pos, pcl) {
+ if (pos->ch_position == position)
+ return pos;
+ }
+ return NULL;
+}
+
+static void pcl_insert_position(struct svc_rdma_pcl *pcl,
+ struct svc_rdma_chunk *chunk)
+{
+ struct svc_rdma_chunk *pos;
+
+ pcl_for_each_chunk(pos, pcl) {
+ if (pos->ch_position > chunk->ch_position)
+ break;
+ }
+ __list_add(&chunk->ch_list, pos->ch_list.prev, &pos->ch_list);
+ pcl->cl_count++;
+}
+
+static void pcl_set_read_segment(const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_chunk *chunk,
+ u32 handle, u32 length, u64 offset)
+{
+ struct svc_rdma_segment *segment;
+
+ segment = &chunk->ch_segments[chunk->ch_segcount];
+ segment->rs_handle = handle;
+ segment->rs_length = length;
+ segment->rs_offset = offset;
+
+ trace_svcrdma_decode_rseg(&rctxt->rc_cid, chunk, segment);
+
+ chunk->ch_length += length;
+ chunk->ch_segcount++;
+}
+
+/**
+ * pcl_alloc_call - Construct a parsed chunk list for the Call body
+ * @rctxt: Ingress receive context
+ * @p: Start of an un-decoded Read list
+ *
+ * Assumptions:
+ * - The incoming Read list has already been sanity checked.
+ * - cl_count is already set to the number of segments in
+ * the un-decoded list.
+ * - The list might not be in order by position.
+ *
+ * Return values:
+ * %true: Parsed chunk list was successfully constructed, and
+ * cl_count is updated to be the number of chunks (ie.
+ * unique positions) in the Read list.
+ * %false: Memory allocation failed.
+ */
+bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+ struct svc_rdma_pcl *pcl = &rctxt->rc_call_pcl;
+ unsigned int i, segcount = pcl->cl_count;
+
+ pcl->cl_count = 0;
+ for (i = 0; i < segcount; i++) {
+ struct svc_rdma_chunk *chunk;
+ u32 position, handle, length;
+ u64 offset;
+
+ p++; /* skip the list discriminator */
+ p = xdr_decode_read_segment(p, &position, &handle,
+ &length, &offset);
+ if (position != 0)
+ continue;
+
+ if (pcl_is_empty(pcl)) {
+ chunk = pcl_alloc_chunk(segcount, position);
+ if (!chunk)
+ return false;
+ pcl_insert_position(pcl, chunk);
+ } else {
+ chunk = list_first_entry(&pcl->cl_chunks,
+ struct svc_rdma_chunk,
+ ch_list);
+ }
+
+ pcl_set_read_segment(rctxt, chunk, handle, length, offset);
+ }
+
+ return true;
+}
+
+/**
+ * pcl_alloc_read - Construct a parsed chunk list for normal Read chunks
+ * @rctxt: Ingress receive context
+ * @p: Start of an un-decoded Read list
+ *
+ * Assumptions:
+ * - The incoming Read list has already been sanity checked.
+ * - cl_count is already set to the number of segments in
+ * the un-decoded list.
+ * - The list might not be in order by position.
+ *
+ * Return values:
+ * %true: Parsed chunk list was successfully constructed, and
+ * cl_count is updated to be the number of chunks (ie.
+ * unique position values) in the Read list.
+ * %false: Memory allocation failed.
+ *
+ * TODO:
+ * - Check for chunk range overlaps
+ */
+bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+ struct svc_rdma_pcl *pcl = &rctxt->rc_read_pcl;
+ unsigned int i, segcount = pcl->cl_count;
+
+ pcl->cl_count = 0;
+ for (i = 0; i < segcount; i++) {
+ struct svc_rdma_chunk *chunk;
+ u32 position, handle, length;
+ u64 offset;
+
+ p++; /* skip the list discriminator */
+ p = xdr_decode_read_segment(p, &position, &handle,
+ &length, &offset);
+ if (position == 0)
+ continue;
+
+ chunk = pcl_lookup_position(pcl, position);
+ if (!chunk) {
+ chunk = pcl_alloc_chunk(segcount, position);
+ if (!chunk)
+ return false;
+ pcl_insert_position(pcl, chunk);
+ }
+
+ pcl_set_read_segment(rctxt, chunk, handle, length, offset);
+ }
+
+ return true;
+}
+
+/**
+ * pcl_alloc_write - Construct a parsed chunk list from a Write list
+ * @rctxt: Ingress receive context
+ * @pcl: Parsed chunk list to populate
+ * @p: Start of an un-decoded Write list
+ *
+ * Assumptions:
+ * - The incoming Write list has already been sanity checked, and
+ * - cl_count is set to the number of chunks in the un-decoded list.
+ *
+ * Return values:
+ * %true: Parsed chunk list was successfully constructed.
+ * %false: Memory allocation failed.
+ */
+bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_pcl *pcl, __be32 *p)
+{
+ struct svc_rdma_segment *segment;
+ struct svc_rdma_chunk *chunk;
+ unsigned int i, j;
+ u32 segcount;
+
+ for (i = 0; i < pcl->cl_count; i++) {
+ p++; /* skip the list discriminator */
+ segcount = be32_to_cpup(p++);
+
+ chunk = pcl_alloc_chunk(segcount, 0);
+ if (!chunk)
+ return false;
+ list_add_tail(&chunk->ch_list, &pcl->cl_chunks);
+
+ for (j = 0; j < segcount; j++) {
+ segment = &chunk->ch_segments[j];
+ p = xdr_decode_rdma_segment(p, &segment->rs_handle,
+ &segment->rs_length,
+ &segment->rs_offset);
+ trace_svcrdma_decode_wseg(&rctxt->rc_cid, chunk, j);
+
+ chunk->ch_length += segment->rs_length;
+ chunk->ch_segcount++;
+ }
+ }
+ return true;
+}
+
+static int pcl_process_region(const struct xdr_buf *xdr,
+ unsigned int offset, unsigned int length,
+ int (*actor)(const struct xdr_buf *, void *),
+ void *data)
+{
+ struct xdr_buf subbuf;
+
+ if (!length)
+ return 0;
+ if (xdr_buf_subsegment(xdr, &subbuf, offset, length))
+ return -EMSGSIZE;
+ return actor(&subbuf, data);
+}
+
+/**
+ * pcl_process_nonpayloads - Process non-payload regions inside @xdr
+ * @pcl: Chunk list to process
+ * @xdr: xdr_buf to process
+ * @actor: Function to invoke on each non-payload region
+ * @data: Arguments for @actor
+ *
+ * This mechanism must ignore not only result payloads that were already
+ * sent via RDMA Write, but also XDR padding for those payloads that
+ * the upper layer has added.
+ *
+ * Assumptions:
+ * The xdr->len and ch_position fields are aligned to 4-byte multiples.
+ *
+ * Returns:
+ * On success, zero,
+ * %-EMSGSIZE on XDR buffer overflow, or
+ * The return value of @actor
+ */
+int pcl_process_nonpayloads(const struct svc_rdma_pcl *pcl,
+ const struct xdr_buf *xdr,
+ int (*actor)(const struct xdr_buf *, void *),
+ void *data)
+{
+ struct svc_rdma_chunk *chunk, *next;
+ unsigned int start;
+ int ret;
+
+ chunk = pcl_first_chunk(pcl);
+
+ /* No result payloads were generated */
+ if (!chunk || !chunk->ch_payload_length)
+ return actor(xdr, data);
+
+ /* Process the region before the first result payload */
+ ret = pcl_process_region(xdr, 0, chunk->ch_position, actor, data);
+ if (ret < 0)
+ return ret;
+
+ /* Process the regions between each middle result payload */
+ while ((next = pcl_next_chunk(pcl, chunk))) {
+ if (!next->ch_payload_length)
+ break;
+
+ start = pcl_chunk_end_offset(chunk);
+ ret = pcl_process_region(xdr, start, next->ch_position - start,
+ actor, data);
+ if (ret < 0)
+ return ret;
+
+ chunk = next;
+ }
+
+ /* Process the region after the last result payload */
+ start = pcl_chunk_end_offset(chunk);
+ ret = pcl_process_region(xdr, start, xdr->len - start, actor, data);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 96bccd398469..5242ad121450 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -89,10 +89,10 @@
* svc_rdma_recvfrom call returns.
*
* During the second svc_rdma_recvfrom call, RDMA Read sink pages
- * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst
- * (see rdma_read_complete() below).
+ * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst.
*/
+#include <linux/slab.h>
#include <linux/spinlock.h>
#include <asm/unaligned.h>
#include <rdma/ib_verbs.h>
@@ -106,8 +106,6 @@
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
-#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-
static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
static inline struct svc_rdma_recv_ctxt *
@@ -117,6 +115,13 @@ svc_rdma_next_recv_ctxt(struct list_head *list)
rc_list);
}
+static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
+ struct rpc_rdma_cid *cid)
+{
+ cid->ci_queue_id = rdma->sc_rq_cq->res.id;
+ cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
@@ -135,6 +140,12 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
goto fail2;
+ svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
+ pcl_init(&ctxt->rc_call_pcl);
+ pcl_init(&ctxt->rc_read_pcl);
+ pcl_init(&ctxt->rc_write_pcl);
+ pcl_init(&ctxt->rc_reply_pcl);
+
ctxt->rc_recv_wr.next = NULL;
ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe;
ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge;
@@ -180,8 +191,13 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
}
}
-static struct svc_rdma_recv_ctxt *
-svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
+/**
+ * svc_rdma_recv_ctxt_get - Allocate a recv_ctxt
+ * @rdma: controlling svcxprt_rdma
+ *
+ * Returns a recv_ctxt or (rarely) NULL if none are available.
+ */
+struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;
struct llist_node *node;
@@ -211,10 +227,10 @@ out_empty:
void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *ctxt)
{
- unsigned int i;
-
- for (i = 0; i < ctxt->rc_page_count; i++)
- put_page(ctxt->rc_pages[i]);
+ pcl_free(&ctxt->rc_call_pcl);
+ pcl_free(&ctxt->rc_read_pcl);
+ pcl_free(&ctxt->rc_write_pcl);
+ pcl_free(&ctxt->rc_reply_pcl);
if (!ctxt->rc_temp)
llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
@@ -222,32 +238,68 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
svc_rdma_recv_ctxt_destroy(rdma, ctxt);
}
-static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma,
- struct svc_rdma_recv_ctxt *ctxt)
+/**
+ * svc_rdma_release_rqst - Release transport-specific per-rqst resources
+ * @rqstp: svc_rqst being released
+ *
+ * Ensure that the recv_ctxt is released whether or not a Reply
+ * was sent. For example, the client could close the connection,
+ * or svc_process could drop an RPC, before the Reply is sent.
+ */
+void svc_rdma_release_rqst(struct svc_rqst *rqstp)
{
- int ret;
-
- svc_xprt_get(&rdma->sc_xprt);
- ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL);
- trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret);
- if (ret)
- goto err_post;
- return 0;
+ struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt;
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svcxprt_rdma *rdma =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
-err_post:
- svc_rdma_recv_ctxt_put(rdma, ctxt);
- svc_xprt_put(&rdma->sc_xprt);
- return ret;
+ rqstp->rq_xprt_ctxt = NULL;
+ if (ctxt)
+ svc_rdma_recv_ctxt_put(rdma, ctxt);
}
-static int svc_rdma_post_recv(struct svcxprt_rdma *rdma)
+static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
+ unsigned int wanted, bool temp)
{
+ const struct ib_recv_wr *bad_wr = NULL;
struct svc_rdma_recv_ctxt *ctxt;
+ struct ib_recv_wr *recv_chain;
+ int ret;
- ctxt = svc_rdma_recv_ctxt_get(rdma);
- if (!ctxt)
- return -ENOMEM;
- return __svc_rdma_post_recv(rdma, ctxt);
+ if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+ return false;
+
+ recv_chain = NULL;
+ while (wanted--) {
+ ctxt = svc_rdma_recv_ctxt_get(rdma);
+ if (!ctxt)
+ break;
+
+ trace_svcrdma_post_recv(ctxt);
+ ctxt->rc_temp = temp;
+ ctxt->rc_recv_wr.next = recv_chain;
+ recv_chain = &ctxt->rc_recv_wr;
+ rdma->sc_pending_recvs++;
+ }
+ if (!recv_chain)
+ return false;
+
+ ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr);
+ if (ret)
+ goto err_free;
+ return true;
+
+err_free:
+ trace_svcrdma_rq_post_err(rdma, ret);
+ while (bad_wr) {
+ ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt,
+ rc_recv_wr);
+ bad_wr = bad_wr->next;
+ svc_rdma_recv_ctxt_put(rdma, ctxt);
+ }
+ /* Since we're destroying the xprt, no need to reset
+ * sc_pending_recvs. */
+ return false;
}
/**
@@ -258,20 +310,7 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma)
*/
bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
{
- struct svc_rdma_recv_ctxt *ctxt;
- unsigned int i;
- int ret;
-
- for (i = 0; i < rdma->sc_max_requests; i++) {
- ctxt = svc_rdma_recv_ctxt_get(rdma);
- if (!ctxt)
- return false;
- ctxt->rc_temp = true;
- ret = __svc_rdma_post_recv(rdma, ctxt);
- if (ret)
- return false;
- }
- return true;
+ return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true);
}
/**
@@ -279,8 +318,6 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
* @cq: Completion Queue context
* @wc: Work Completion object
*
- * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that
- * the Receive completion handler could be running.
*/
static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
@@ -288,22 +325,30 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
struct ib_cqe *cqe = wc->wr_cqe;
struct svc_rdma_recv_ctxt *ctxt;
- trace_svcrdma_wc_receive(wc);
+ rdma->sc_pending_recvs--;
/* WARNING: Only wc->wr_cqe and wc->status are reliable */
ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe);
if (wc->status != IB_WC_SUCCESS)
goto flushed;
-
- if (svc_rdma_post_recv(rdma))
- goto post_err;
+ trace_svcrdma_wc_recv(wc, &ctxt->rc_cid);
+
+ /* If receive posting fails, the connection is about to be
+ * lost anyway. The server will not be able to send a reply
+ * for this RPC, and the client will retransmit this RPC
+ * anyway when it reconnects.
+ *
+ * Therefore we drop the Receive, even if status was SUCCESS
+ * to reduce the likelihood of replayed requests once the
+ * client reconnects.
+ */
+ if (rdma->sc_pending_recvs < rdma->sc_max_requests)
+ if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false))
+ goto dropped;
/* All wc fields are now known to be valid */
ctxt->rc_byte_len = wc->byte_len;
- ib_dma_sync_single_for_cpu(rdma->sc_pd->device,
- ctxt->rc_recv_sge.addr,
- wc->byte_len, DMA_FROM_DEVICE);
spin_lock(&rdma->sc_rq_dto_lock);
list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q);
@@ -312,15 +357,16 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
spin_unlock(&rdma->sc_rq_dto_lock);
if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
svc_xprt_enqueue(&rdma->sc_xprt);
- goto out;
+ return;
flushed:
-post_err:
+ if (wc->status == IB_WC_WR_FLUSH_ERR)
+ trace_svcrdma_wc_recv_flush(wc, &ctxt->rc_cid);
+ else
+ trace_svcrdma_wc_recv_err(wc, &ctxt->rc_cid);
+dropped:
svc_rdma_recv_ctxt_put(rdma, ctxt);
- set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
- svc_xprt_enqueue(&rdma->sc_xprt);
-out:
- svc_xprt_put(&rdma->sc_xprt);
+ svc_xprt_deferred_close(&rdma->sc_xprt);
}
/**
@@ -332,10 +378,6 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;
- while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) {
- list_del(&ctxt->rc_list);
- svc_rdma_recv_ctxt_put(rdma, ctxt);
- }
while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) {
list_del(&ctxt->rc_list);
svc_rdma_recv_ctxt_put(rdma, ctxt);
@@ -357,122 +399,183 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
arg->len = ctxt->rc_byte_len;
}
-/* This accommodates the largest possible Write chunk,
- * in one segment.
+/**
+ * xdr_count_read_segments - Count number of Read segments in Read list
+ * @rctxt: Ingress receive context
+ * @p: Start of an un-decoded Read list
+ *
+ * Before allocating anything, ensure the ingress Read list is safe
+ * to use.
+ *
+ * The segment count is limited to how many segments can fit in the
+ * transport header without overflowing the buffer. That's about 40
+ * Read segments for a 1KB inline threshold.
+ *
+ * Return values:
+ * %true: Read list is valid. @rctxt's xdr_stream is updated to point
+ * to the first byte past the Read list. rc_read_pcl and
+ * rc_call_pcl cl_count fields are set to the number of
+ * Read segments in the list.
+ * %false: Read list is corrupt. @rctxt's xdr_stream is left in an
+ * unknown state.
*/
-#define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT))
+static bool xdr_count_read_segments(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+ rctxt->rc_call_pcl.cl_count = 0;
+ rctxt->rc_read_pcl.cl_count = 0;
+ while (xdr_item_is_present(p)) {
+ u32 position, handle, length;
+ u64 offset;
+
+ p = xdr_inline_decode(&rctxt->rc_stream,
+ rpcrdma_readseg_maxsz * sizeof(*p));
+ if (!p)
+ return false;
-/* This accommodates the largest possible Position-Zero
- * Read chunk or Reply chunk, in one segment.
- */
-#define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT))
+ xdr_decode_read_segment(p, &position, &handle,
+ &length, &offset);
+ if (position) {
+ if (position & 3)
+ return false;
+ ++rctxt->rc_read_pcl.cl_count;
+ } else {
+ ++rctxt->rc_call_pcl.cl_count;
+ }
+
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+ }
+ return true;
+}
/* Sanity check the Read list.
*
- * Implementation limits:
- * - This implementation supports only one Read chunk.
- *
* Sanity checks:
- * - Read list does not overflow buffer.
- * - Segment size limited by largest NFS data payload.
- *
- * The segment count is limited to how many segments can
- * fit in the transport header without overflowing the
- * buffer. That's about 40 Read segments for a 1KB inline
- * threshold.
- *
- * Returns pointer to the following Write list.
+ * - Read list does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
+ *
+ * Return values:
+ * %true: Read list is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Read list.
+ * %false: Read list is corrupt. @rctxt's xdr_stream is left
+ * in an unknown state.
*/
-static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end)
+static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt)
{
- u32 position;
- bool first;
-
- first = true;
- while (*p++ != xdr_zero) {
- if (first) {
- position = be32_to_cpup(p++);
- first = false;
- } else if (be32_to_cpup(p++) != position) {
- return NULL;
- }
- p++; /* handle */
- if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG)
- return NULL;
- p += 2; /* offset */
+ __be32 *p;
- if (p > end)
- return NULL;
- }
- return p;
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+ if (!xdr_count_read_segments(rctxt, p))
+ return false;
+ if (!pcl_alloc_call(rctxt, p))
+ return false;
+ return pcl_alloc_read(rctxt, p);
}
-/* The segment count is limited to how many segments can
- * fit in the transport header without overflowing the
- * buffer. That's about 60 Write segments for a 1KB inline
- * threshold.
- */
-static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end,
- u32 maxlen)
+static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt)
{
- u32 i, segcount;
+ u32 segcount;
+ __be32 *p;
- segcount = be32_to_cpup(p++);
- for (i = 0; i < segcount; i++) {
- p++; /* handle */
- if (be32_to_cpup(p++) > maxlen)
- return NULL;
- p += 2; /* offset */
+ if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount))
+ return false;
- if (p > end)
- return NULL;
- }
+ /* A bogus segcount causes this buffer overflow check to fail. */
+ p = xdr_inline_decode(&rctxt->rc_stream,
+ segcount * rpcrdma_segment_maxsz * sizeof(*p));
+ return p != NULL;
+}
- return p;
+/**
+ * xdr_count_write_chunks - Count number of Write chunks in Write list
+ * @rctxt: Received header and decoding state
+ * @p: start of an un-decoded Write list
+ *
+ * Before allocating anything, ensure the ingress Write list is
+ * safe to use.
+ *
+ * Return values:
+ * %true: Write list is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Write list, and
+ * the number of Write chunks is in rc_write_pcl.cl_count.
+ * %false: Write list is corrupt. @rctxt's xdr_stream is left
+ * in an indeterminate state.
+ */
+static bool xdr_count_write_chunks(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+ rctxt->rc_write_pcl.cl_count = 0;
+ while (xdr_item_is_present(p)) {
+ if (!xdr_check_write_chunk(rctxt))
+ return false;
+ ++rctxt->rc_write_pcl.cl_count;
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+ }
+ return true;
}
/* Sanity check the Write list.
*
* Implementation limits:
- * - This implementation supports only one Write chunk.
+ * - This implementation currently supports only one Write chunk.
*
* Sanity checks:
- * - Write list does not overflow buffer.
- * - Segment size limited by largest NFS data payload.
- *
- * Returns pointer to the following Reply chunk.
+ * - Write list does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
+ *
+ * Return values:
+ * %true: Write list is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Write list.
+ * %false: Write list is corrupt. @rctxt's xdr_stream is left
+ * in an unknown state.
*/
-static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end)
+static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt)
{
- u32 chcount;
+ __be32 *p;
- chcount = 0;
- while (*p++ != xdr_zero) {
- p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG);
- if (!p)
- return NULL;
- if (chcount++ > 1)
- return NULL;
- }
- return p;
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+ if (!xdr_count_write_chunks(rctxt, p))
+ return false;
+ if (!pcl_alloc_write(rctxt, &rctxt->rc_write_pcl, p))
+ return false;
+
+ rctxt->rc_cur_result_payload = pcl_first_chunk(&rctxt->rc_write_pcl);
+ return true;
}
/* Sanity check the Reply chunk.
*
* Sanity checks:
- * - Reply chunk does not overflow buffer.
- * - Segment size limited by largest NFS data payload.
- *
- * Returns pointer to the following RPC header.
+ * - Reply chunk does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
+ *
+ * Return values:
+ * %true: Reply chunk is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Reply chunk.
+ * %false: Reply chunk is corrupt. @rctxt's xdr_stream is left
+ * in an unknown state.
*/
-static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end)
+static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt)
{
- if (*p++ != xdr_zero) {
- p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG);
- if (!p)
- return NULL;
- }
- return p;
+ __be32 *p;
+
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+
+ if (!xdr_item_is_present(p))
+ return true;
+ if (!xdr_check_write_chunk(rctxt))
+ return false;
+
+ rctxt->rc_reply_pcl.cl_count = 1;
+ return pcl_alloc_write(rctxt, &rctxt->rc_reply_pcl, p);
}
/* RPC-over-RDMA Version One private extension: Remote Invalidation.
@@ -481,206 +584,147 @@ static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end)
*
* If there is exactly one distinct R_key in the received transport
* header, set rc_inv_rkey to that R_key. Otherwise, set it to zero.
- *
- * Perform this operation while the received transport header is
- * still in the CPU cache.
*/
static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *ctxt)
{
- __be32 inv_rkey, *p;
- u32 i, segcount;
+ struct svc_rdma_segment *segment;
+ struct svc_rdma_chunk *chunk;
+ u32 inv_rkey;
ctxt->rc_inv_rkey = 0;
if (!rdma->sc_snd_w_inv)
return;
- inv_rkey = xdr_zero;
- p = ctxt->rc_recv_buf;
- p += rpcrdma_fixed_maxsz;
-
- /* Read list */
- while (*p++ != xdr_zero) {
- p++; /* position */
- if (inv_rkey == xdr_zero)
- inv_rkey = *p;
- else if (inv_rkey != *p)
- return;
- p += 4;
+ inv_rkey = 0;
+ pcl_for_each_chunk(chunk, &ctxt->rc_call_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
+ return;
+ }
}
-
- /* Write list */
- while (*p++ != xdr_zero) {
- segcount = be32_to_cpup(p++);
- for (i = 0; i < segcount; i++) {
- if (inv_rkey == xdr_zero)
- inv_rkey = *p;
- else if (inv_rkey != *p)
+ pcl_for_each_chunk(chunk, &ctxt->rc_read_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
return;
- p += 4;
}
}
-
- /* Reply chunk */
- if (*p++ != xdr_zero) {
- segcount = be32_to_cpup(p++);
- for (i = 0; i < segcount; i++) {
- if (inv_rkey == xdr_zero)
- inv_rkey = *p;
- else if (inv_rkey != *p)
+ pcl_for_each_chunk(chunk, &ctxt->rc_write_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
return;
- p += 4;
}
}
-
- ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey);
+ pcl_for_each_chunk(chunk, &ctxt->rc_reply_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
+ return;
+ }
+ }
+ ctxt->rc_inv_rkey = inv_rkey;
}
-/* On entry, xdr->head[0].iov_base points to first byte in the
- * RPC-over-RDMA header.
+/**
+ * svc_rdma_xdr_decode_req - Decode the transport header
+ * @rq_arg: xdr_buf containing ingress RPC/RDMA message
+ * @rctxt: state of decoding
+ *
+ * On entry, xdr->head[0].iov_base points to first byte of the
+ * RPC-over-RDMA transport header.
*
* On successful exit, head[0] points to first byte past the
* RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
+ *
* The length of the RPC-over-RDMA header is returned.
*
* Assumptions:
* - The transport header is entirely contained in the head iovec.
*/
-static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
+static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg,
+ struct svc_rdma_recv_ctxt *rctxt)
{
- __be32 *p, *end, *rdma_argp;
+ __be32 *p, *rdma_argp;
unsigned int hdr_len;
- /* Verify that there's enough bytes for header + something */
- if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
- goto out_short;
-
rdma_argp = rq_arg->head[0].iov_base;
- if (*(rdma_argp + 1) != rpcrdma_version)
- goto out_version;
+ xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp, NULL);
- switch (*(rdma_argp + 3)) {
+ p = xdr_inline_decode(&rctxt->rc_stream,
+ rpcrdma_fixed_maxsz * sizeof(*p));
+ if (unlikely(!p))
+ goto out_short;
+ p++;
+ if (*p != rpcrdma_version)
+ goto out_version;
+ p += 2;
+ rctxt->rc_msgtype = *p;
+ switch (rctxt->rc_msgtype) {
case rdma_msg:
break;
case rdma_nomsg:
break;
-
case rdma_done:
goto out_drop;
-
case rdma_error:
goto out_drop;
-
default:
goto out_proc;
}
- end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
- p = xdr_check_read_list(rdma_argp + 4, end);
- if (!p)
+ if (!xdr_check_read_list(rctxt))
goto out_inval;
- p = xdr_check_write_list(p, end);
- if (!p)
- goto out_inval;
- p = xdr_check_reply_chunk(p, end);
- if (!p)
+ if (!xdr_check_write_list(rctxt))
goto out_inval;
- if (p > end)
+ if (!xdr_check_reply_chunk(rctxt))
goto out_inval;
- rq_arg->head[0].iov_base = p;
- hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
+ rq_arg->head[0].iov_base = rctxt->rc_stream.p;
+ hdr_len = xdr_stream_pos(&rctxt->rc_stream);
rq_arg->head[0].iov_len -= hdr_len;
rq_arg->len -= hdr_len;
- trace_svcrdma_decode_rqst(rdma_argp, hdr_len);
+ trace_svcrdma_decode_rqst(rctxt, rdma_argp, hdr_len);
return hdr_len;
out_short:
- trace_svcrdma_decode_short(rq_arg->len);
+ trace_svcrdma_decode_short_err(rctxt, rq_arg->len);
return -EINVAL;
out_version:
- trace_svcrdma_decode_badvers(rdma_argp);
+ trace_svcrdma_decode_badvers_err(rctxt, rdma_argp);
return -EPROTONOSUPPORT;
out_drop:
- trace_svcrdma_decode_drop(rdma_argp);
+ trace_svcrdma_decode_drop_err(rctxt, rdma_argp);
return 0;
out_proc:
- trace_svcrdma_decode_badproc(rdma_argp);
+ trace_svcrdma_decode_badproc_err(rctxt, rdma_argp);
return -EINVAL;
out_inval:
- trace_svcrdma_decode_parse(rdma_argp);
+ trace_svcrdma_decode_parse_err(rctxt, rdma_argp);
return -EINVAL;
}
-static void rdma_read_complete(struct svc_rqst *rqstp,
- struct svc_rdma_recv_ctxt *head)
-{
- int page_no;
-
- /* Move Read chunk pages to rqstp so that they will be released
- * when svc_process is done with them.
- */
- for (page_no = 0; page_no < head->rc_page_count; page_no++) {
- put_page(rqstp->rq_pages[page_no]);
- rqstp->rq_pages[page_no] = head->rc_pages[page_no];
- }
- head->rc_page_count = 0;
-
- /* Point rq_arg.pages past header */
- rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count];
- rqstp->rq_arg.page_len = head->rc_arg.page_len;
-
- /* rq_respages starts after the last arg page */
- rqstp->rq_respages = &rqstp->rq_pages[page_no];
- rqstp->rq_next_page = rqstp->rq_respages + 1;
-
- /* Rebuild rq_arg head and tail. */
- rqstp->rq_arg.head[0] = head->rc_arg.head[0];
- rqstp->rq_arg.tail[0] = head->rc_arg.tail[0];
- rqstp->rq_arg.len = head->rc_arg.len;
- rqstp->rq_arg.buflen = head->rc_arg.buflen;
-}
-
-static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
- __be32 *rdma_argp, int status)
+static void svc_rdma_send_error(struct svcxprt_rdma *rdma,
+ struct svc_rdma_recv_ctxt *rctxt,
+ int status)
{
- struct svc_rdma_send_ctxt *ctxt;
- unsigned int length;
- __be32 *p;
- int ret;
+ struct svc_rdma_send_ctxt *sctxt;
- ctxt = svc_rdma_send_ctxt_get(xprt);
- if (!ctxt)
+ sctxt = svc_rdma_send_ctxt_get(rdma);
+ if (!sctxt)
return;
-
- p = ctxt->sc_xprt_buf;
- *p++ = *rdma_argp;
- *p++ = *(rdma_argp + 1);
- *p++ = xprt->sc_fc_credits;
- *p++ = rdma_error;
- switch (status) {
- case -EPROTONOSUPPORT:
- *p++ = err_vers;
- *p++ = rpcrdma_version;
- *p++ = rpcrdma_version;
- trace_svcrdma_err_vers(*rdma_argp);
- break;
- default:
- *p++ = err_chunk;
- trace_svcrdma_err_chunk(*rdma_argp);
- }
- length = (unsigned long)p - (unsigned long)ctxt->sc_xprt_buf;
- svc_rdma_sync_reply_hdr(xprt, ctxt, length);
-
- ctxt->sc_send_wr.opcode = IB_WR_SEND;
- ret = svc_rdma_send(xprt, &ctxt->sc_send_wr);
- if (ret)
- svc_rdma_send_ctxt_put(xprt, ctxt);
+ svc_rdma_send_error_msg(rdma, sctxt, rctxt, status);
}
/* By convention, backchannel calls arrive via rdma_msg type
@@ -688,30 +732,28 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
* the RPC/RDMA header small and fixed in size, so it is
* straightforward to check the RPC header's direction field.
*/
-static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
- __be32 *rdma_resp)
+static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
+ struct svc_rdma_recv_ctxt *rctxt)
{
- __be32 *p;
+ __be32 *p = rctxt->rc_recv_buf;
if (!xprt->xpt_bc_xprt)
return false;
- p = rdma_resp + 3;
- if (*p++ != rdma_msg)
+ if (rctxt->rc_msgtype != rdma_msg)
return false;
- if (*p++ != xdr_zero)
+ if (!pcl_is_empty(&rctxt->rc_call_pcl))
return false;
- if (*p++ != xdr_zero)
+ if (!pcl_is_empty(&rctxt->rc_read_pcl))
return false;
- if (*p++ != xdr_zero)
+ if (!pcl_is_empty(&rctxt->rc_write_pcl))
return false;
-
- /* XID sanity */
- if (*p++ != *rdma_resp)
+ if (!pcl_is_empty(&rctxt->rc_reply_pcl))
return false;
- /* call direction */
- if (*p == cpu_to_be32(RPC_CALL))
+
+ /* RPC call direction */
+ if (*(p + 8) == cpu_to_be32(RPC_CALL))
return false;
return true;
@@ -753,29 +795,29 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
struct svcxprt_rdma *rdma_xprt =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
struct svc_rdma_recv_ctxt *ctxt;
- __be32 *p;
int ret;
+ rqstp->rq_xprt_ctxt = NULL;
+
+ ctxt = NULL;
spin_lock(&rdma_xprt->sc_rq_dto_lock);
- ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q);
- if (ctxt) {
- list_del(&ctxt->rc_list);
- spin_unlock(&rdma_xprt->sc_rq_dto_lock);
- rdma_read_complete(rqstp, ctxt);
- goto complete;
- }
ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q);
- if (!ctxt) {
+ if (ctxt)
+ list_del(&ctxt->rc_list);
+ else
/* No new incoming requests, terminate the loop */
clear_bit(XPT_DATA, &xprt->xpt_flags);
- spin_unlock(&rdma_xprt->sc_rq_dto_lock);
- return 0;
- }
- list_del(&ctxt->rc_list);
spin_unlock(&rdma_xprt->sc_rq_dto_lock);
- atomic_inc(&rdma_stat_recv);
+ /* Unblock the transport for the next receive */
+ svc_xprt_received(xprt);
+ if (!ctxt)
+ return 0;
+ percpu_counter_inc(&svcrdma_stat_recv);
+ ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device,
+ ctxt->rc_recv_sge.addr, ctxt->rc_byte_len,
+ DMA_FROM_DEVICE);
svc_rdma_build_arg_xdr(rqstp, ctxt);
/* Prevent svc_xprt_release from releasing pages in rq_pages
@@ -784,49 +826,42 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
rqstp->rq_respages = rqstp->rq_pages;
rqstp->rq_next_page = rqstp->rq_respages;
- p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
- ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
+ ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
if (ret < 0)
goto out_err;
if (ret == 0)
goto out_drop;
- rqstp->rq_xprt_hlen = ret;
- if (svc_rdma_is_backchannel_reply(xprt, p)) {
- ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p,
- &rqstp->rq_arg);
- svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
- return ret;
- }
+ if (svc_rdma_is_reverse_direction_reply(xprt, ctxt))
+ goto out_backchannel;
+
svc_rdma_get_inv_rkey(rdma_xprt, ctxt);
- p += rpcrdma_fixed_maxsz;
- if (*p != xdr_zero)
- goto out_readchunk;
+ if (!pcl_is_empty(&ctxt->rc_read_pcl) ||
+ !pcl_is_empty(&ctxt->rc_call_pcl)) {
+ ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
+ if (ret < 0)
+ goto out_readfail;
+ }
-complete:
rqstp->rq_xprt_ctxt = ctxt;
rqstp->rq_prot = IPPROTO_MAX;
svc_xprt_copy_addrs(rqstp, xprt);
return rqstp->rq_arg.len;
-out_readchunk:
- ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p);
- if (ret < 0)
- goto out_postfail;
- return 0;
-
out_err:
- svc_rdma_send_error(rdma_xprt, p, ret);
+ svc_rdma_send_error(rdma_xprt, ctxt, ret);
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
return 0;
-out_postfail:
+out_readfail:
if (ret == -EINVAL)
- svc_rdma_send_error(rdma_xprt, p, ret);
+ svc_rdma_send_error(rdma_xprt, ctxt, ret);
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
return ret;
+out_backchannel:
+ svc_rdma_handle_bc_reply(rqstp, ctxt);
out_drop:
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
return 0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 48fe3b16b0d9..11cf7c646644 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -7,15 +7,13 @@
#include <rdma/rw.h>
+#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>
-#include <linux/sunrpc/debug.h>
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
-#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-
static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc);
static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);
@@ -37,11 +35,12 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);
* controlling svcxprt_rdma is destroyed.
*/
struct svc_rdma_rw_ctxt {
+ struct llist_node rw_node;
struct list_head rw_list;
struct rdma_rw_ctx rw_ctx;
- int rw_nents;
+ unsigned int rw_nents;
struct sg_table rw_sg_table;
- struct scatterlist rw_first_sgl[0];
+ struct scatterlist rw_first_sgl[];
};
static inline struct svc_rdma_rw_ctxt *
@@ -55,41 +54,48 @@ static struct svc_rdma_rw_ctxt *
svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
{
struct svc_rdma_rw_ctxt *ctxt;
+ struct llist_node *node;
spin_lock(&rdma->sc_rw_ctxt_lock);
-
- ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts);
- if (ctxt) {
- list_del(&ctxt->rw_list);
- spin_unlock(&rdma->sc_rw_ctxt_lock);
+ node = llist_del_first(&rdma->sc_rw_ctxts);
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+ if (node) {
+ ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
} else {
- spin_unlock(&rdma->sc_rw_ctxt_lock);
ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
GFP_KERNEL);
if (!ctxt)
- goto out;
+ goto out_noctx;
+
INIT_LIST_HEAD(&ctxt->rw_list);
}
ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
ctxt->rw_sg_table.sgl,
- SG_CHUNK_SIZE)) {
- kfree(ctxt);
- ctxt = NULL;
- }
-out:
+ SG_CHUNK_SIZE))
+ goto out_free;
return ctxt;
+
+out_free:
+ kfree(ctxt);
+out_noctx:
+ trace_svcrdma_no_rwctx_err(rdma, sges);
+ return NULL;
}
-static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
- struct svc_rdma_rw_ctxt *ctxt)
+static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
+ struct svc_rdma_rw_ctxt *ctxt,
+ struct llist_head *list)
{
sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE);
+ llist_add(&ctxt->rw_node, list);
+}
- spin_lock(&rdma->sc_rw_ctxt_lock);
- list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
- spin_unlock(&rdma->sc_rw_ctxt_lock);
+static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts);
}
/**
@@ -100,15 +106,44 @@ static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
{
struct svc_rdma_rw_ctxt *ctxt;
+ struct llist_node *node;
- while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) {
- list_del(&ctxt->rw_list);
+ while ((node = llist_del_first(&rdma->sc_rw_ctxts)) != NULL) {
+ ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
kfree(ctxt);
}
}
+/**
+ * svc_rdma_rw_ctx_init - Prepare a R/W context for I/O
+ * @rdma: controlling transport instance
+ * @ctxt: R/W context to prepare
+ * @offset: RDMA offset
+ * @handle: RDMA tag/handle
+ * @direction: I/O direction
+ *
+ * Returns on success, the number of WQEs that will be needed
+ * on the workqueue, or a negative errno.
+ */
+static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
+ struct svc_rdma_rw_ctxt *ctxt,
+ u64 offset, u32 handle,
+ enum dma_data_direction direction)
+{
+ int ret;
+
+ ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num,
+ ctxt->rw_sg_table.sgl, ctxt->rw_nents,
+ 0, offset, handle, direction);
+ if (unlikely(ret < 0)) {
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ trace_svcrdma_dma_map_rw_err(rdma, ctxt->rw_nents, ret);
+ }
+ return ret;
+}
+
/* A chunk context tracks all I/O for moving one Read or Write
- * chunk. This is a a set of rdma_rw's that handle data movement
+ * chunk. This is a set of rdma_rw's that handle data movement
* for all segments of one chunk.
*
* These are small, acquired with a single allocator call, and
@@ -116,37 +151,62 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
* demand, and not cached.
*/
struct svc_rdma_chunk_ctxt {
+ struct rpc_rdma_cid cc_cid;
struct ib_cqe cc_cqe;
struct svcxprt_rdma *cc_rdma;
struct list_head cc_rwctxts;
+ ktime_t cc_posttime;
int cc_sqecount;
+ enum ib_wc_status cc_status;
+ struct completion cc_done;
};
+static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
+ struct rpc_rdma_cid *cid)
+{
+ cid->ci_queue_id = rdma->sc_sq_cq->res.id;
+ cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
struct svc_rdma_chunk_ctxt *cc)
{
+ svc_rdma_cc_cid_init(rdma, &cc->cc_cid);
cc->cc_rdma = rdma;
- svc_xprt_get(&rdma->sc_xprt);
INIT_LIST_HEAD(&cc->cc_rwctxts);
cc->cc_sqecount = 0;
}
+/*
+ * The consumed rw_ctx's are cleaned and placed on a local llist so
+ * that only one atomic llist operation is needed to put them all
+ * back on the free list.
+ */
static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
enum dma_data_direction dir)
{
struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct llist_node *first, *last;
struct svc_rdma_rw_ctxt *ctxt;
+ LLIST_HEAD(free);
+ first = last = NULL;
while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
list_del(&ctxt->rw_list);
rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
rdma->sc_port_num, ctxt->rw_sg_table.sgl,
ctxt->rw_nents, dir);
- svc_rdma_put_rw_ctxt(rdma, ctxt);
+ __svc_rdma_put_rw_ctxt(rdma, ctxt, &free);
+
+ ctxt->rw_node.next = first;
+ first = &ctxt->rw_node;
+ if (!last)
+ last = first;
}
- svc_xprt_put(&rdma->sc_xprt);
+ if (first)
+ llist_add_batch(first, last, &rdma->sc_rw_ctxts);
}
/* State for sending a Write or Reply chunk.
@@ -154,14 +214,14 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
* - Stores arguments for the SGL constructor functions
*/
struct svc_rdma_write_info {
+ const struct svc_rdma_chunk *wi_chunk;
+
/* write state of this chunk */
unsigned int wi_seg_off;
unsigned int wi_seg_no;
- unsigned int wi_nsegs;
- __be32 *wi_segs;
/* SGL constructor arguments */
- struct xdr_buf *wi_xdr;
+ const struct xdr_buf *wi_xdr;
unsigned char *wi_base;
unsigned int wi_next_off;
@@ -169,7 +229,8 @@ struct svc_rdma_write_info {
};
static struct svc_rdma_write_info *
-svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
+svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_chunk *chunk)
{
struct svc_rdma_write_info *info;
@@ -177,10 +238,9 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
if (!info)
return info;
+ info->wi_chunk = chunk;
info->wi_seg_off = 0;
info->wi_seg_no = 0;
- info->wi_nsegs = be32_to_cpup(++chunk);
- info->wi_segs = ++chunk;
svc_rdma_cc_init(rdma, &info->wi_cc);
info->wi_cc.cc_cqe.done = svc_rdma_write_done;
return info;
@@ -208,13 +268,21 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
struct svc_rdma_write_info *info =
container_of(cc, struct svc_rdma_write_info, wi_cc);
- trace_svcrdma_wc_write(wc);
+ switch (wc->status) {
+ case IB_WC_SUCCESS:
+ trace_svcrdma_wc_write(wc, &cc->cc_cid);
+ break;
+ case IB_WC_WR_FLUSH_ERR:
+ trace_svcrdma_wc_write_flush(wc, &cc->cc_cid);
+ break;
+ default:
+ trace_svcrdma_wc_write_err(wc, &cc->cc_cid);
+ }
- atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
- wake_up(&rdma->sc_send_wait);
+ svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount);
if (unlikely(wc->status != IB_WC_SUCCESS))
- set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ svc_xprt_deferred_close(&rdma->sc_xprt);
svc_rdma_write_info_free(info);
}
@@ -222,11 +290,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
/* State for pulling a Read chunk.
*/
struct svc_rdma_read_info {
+ struct svc_rqst *ri_rqst;
struct svc_rdma_recv_ctxt *ri_readctxt;
- unsigned int ri_position;
unsigned int ri_pageno;
unsigned int ri_pageoff;
- unsigned int ri_chunklen;
+ unsigned int ri_totalbytes;
struct svc_rdma_chunk_ctxt ri_cc;
};
@@ -262,30 +330,25 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
struct ib_cqe *cqe = wc->wr_cqe;
struct svc_rdma_chunk_ctxt *cc =
container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
- struct svcxprt_rdma *rdma = cc->cc_rdma;
- struct svc_rdma_read_info *info =
- container_of(cc, struct svc_rdma_read_info, ri_cc);
-
- trace_svcrdma_wc_read(wc);
-
- atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
- wake_up(&rdma->sc_send_wait);
+ struct svc_rdma_read_info *info;
- if (unlikely(wc->status != IB_WC_SUCCESS)) {
- set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
- svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
- } else {
- spin_lock(&rdma->sc_rq_dto_lock);
- list_add_tail(&info->ri_readctxt->rc_list,
- &rdma->sc_read_complete_q);
- /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
- set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
- spin_unlock(&rdma->sc_rq_dto_lock);
-
- svc_xprt_enqueue(&rdma->sc_xprt);
+ switch (wc->status) {
+ case IB_WC_SUCCESS:
+ info = container_of(cc, struct svc_rdma_read_info, ri_cc);
+ trace_svcrdma_wc_read(wc, &cc->cc_cid, info->ri_totalbytes,
+ cc->cc_posttime);
+ break;
+ case IB_WC_WR_FLUSH_ERR:
+ trace_svcrdma_wc_read_flush(wc, &cc->cc_cid);
+ break;
+ default:
+ trace_svcrdma_wc_read_err(wc, &cc->cc_cid);
}
- svc_rdma_read_info_free(info);
+ svc_rdma_wake_send_waiters(cc->cc_rdma, cc->cc_sqecount);
+ cc->cc_status = wc->status;
+ complete(&cc->cc_done);
+ return;
}
/* This function sleeps when the transport's Send Queue is congested.
@@ -298,7 +361,6 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
{
struct svcxprt_rdma *rdma = cc->cc_rdma;
- struct svc_xprt *xprt = &rdma->sc_xprt;
struct ib_send_wr *first_wr;
const struct ib_send_wr *bad_wr;
struct list_head *tmp;
@@ -322,14 +384,14 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
do {
if (atomic_sub_return(cc->cc_sqecount,
&rdma->sc_sq_avail) > 0) {
+ cc->cc_posttime = ktime_get();
ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
- trace_svcrdma_post_rw(&cc->cc_cqe,
- cc->cc_sqecount, ret);
if (ret)
break;
return 0;
}
+ percpu_counter_inc(&svcrdma_stat_sq_starve);
trace_svcrdma_sq_full(rdma);
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
wait_event(rdma->sc_send_wait,
@@ -337,7 +399,8 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
trace_svcrdma_sq_retry(rdma);
} while (1);
- set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ trace_svcrdma_sq_post_err(rdma, ret);
+ svc_xprt_deferred_close(&rdma->sc_xprt);
/* If even one was posted, there will be a completion. */
if (bad_wr != first_wr)
@@ -369,7 +432,7 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
struct svc_rdma_rw_ctxt *ctxt)
{
unsigned int sge_no, sge_bytes, page_off, page_no;
- struct xdr_buf *xdr = info->wi_xdr;
+ const struct xdr_buf *xdr = info->wi_xdr;
struct scatterlist *sg;
struct page **page;
@@ -407,43 +470,37 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
{
struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
struct svcxprt_rdma *rdma = cc->cc_rdma;
+ const struct svc_rdma_segment *seg;
struct svc_rdma_rw_ctxt *ctxt;
- __be32 *seg;
int ret;
- seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
do {
unsigned int write_len;
- u32 seg_length, seg_handle;
- u64 seg_offset;
+ u64 offset;
- if (info->wi_seg_no >= info->wi_nsegs)
+ if (info->wi_seg_no >= info->wi_chunk->ch_segcount)
goto out_overflow;
- seg_handle = be32_to_cpup(seg);
- seg_length = be32_to_cpup(seg + 1);
- xdr_decode_hyper(seg + 2, &seg_offset);
- seg_offset += info->wi_seg_off;
-
- write_len = min(remaining, seg_length - info->wi_seg_off);
+ seg = &info->wi_chunk->ch_segments[info->wi_seg_no];
+ write_len = min(remaining, seg->rs_length - info->wi_seg_off);
+ if (!write_len)
+ goto out_overflow;
ctxt = svc_rdma_get_rw_ctxt(rdma,
(write_len >> PAGE_SHIFT) + 2);
if (!ctxt)
- goto out_noctx;
+ return -ENOMEM;
constructor(info, write_len, ctxt);
- ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp,
- rdma->sc_port_num, ctxt->rw_sg_table.sgl,
- ctxt->rw_nents, 0, seg_offset,
- seg_handle, DMA_TO_DEVICE);
+ offset = seg->rs_offset + info->wi_seg_off;
+ ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle,
+ DMA_TO_DEVICE);
if (ret < 0)
- goto out_initerr;
+ return -EIO;
+ percpu_counter_inc(&svcrdma_stat_write);
- trace_svcrdma_encode_wseg(seg_handle, write_len, seg_offset);
list_add(&ctxt->rw_list, &cc->cc_rwctxts);
cc->cc_sqecount += ret;
- if (write_len == seg_length - info->wi_seg_off) {
- seg += 4;
+ if (write_len == seg->rs_length - info->wi_seg_off) {
info->wi_seg_no++;
info->wi_seg_off = 0;
} else {
@@ -455,51 +512,96 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
return 0;
out_overflow:
- dprintk("svcrdma: inadequate space in Write chunk (%u)\n",
- info->wi_nsegs);
+ trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no,
+ info->wi_chunk->ch_segcount);
return -E2BIG;
-
-out_noctx:
- dprintk("svcrdma: no R/W ctxs available\n");
- return -ENOMEM;
-
-out_initerr:
- svc_rdma_put_rw_ctxt(rdma, ctxt);
- trace_svcrdma_dma_map_rwctx(rdma, ret);
- return -EIO;
}
-/* Send one of an xdr_buf's kvecs by itself. To send a Reply
- * chunk, the whole RPC Reply is written back to the client.
- * This function writes either the head or tail of the xdr_buf
- * containing the Reply.
+/**
+ * svc_rdma_iov_write - Construct RDMA Writes from an iov
+ * @info: pointer to write arguments
+ * @iov: kvec to write
+ *
+ * Returns:
+ * On success, returns zero
+ * %-E2BIG if the client-provided Write chunk is too small
+ * %-ENOMEM if a resource has been exhausted
+ * %-EIO if an rdma-rw error occurred
*/
-static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
- struct kvec *vec)
+static int svc_rdma_iov_write(struct svc_rdma_write_info *info,
+ const struct kvec *iov)
{
- info->wi_base = vec->iov_base;
+ info->wi_base = iov->iov_base;
return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
- vec->iov_len);
+ iov->iov_len);
}
-/* Send an xdr_buf's page list by itself. A Write chunk is
- * just the page list. a Reply chunk is the head, page list,
- * and tail. This function is shared between the two types
- * of chunk.
+/**
+ * svc_rdma_pages_write - Construct RDMA Writes from pages
+ * @info: pointer to write arguments
+ * @xdr: xdr_buf with pages to write
+ * @offset: offset into the content of @xdr
+ * @length: number of bytes to write
+ *
+ * Returns:
+ * On success, returns zero
+ * %-E2BIG if the client-provided Write chunk is too small
+ * %-ENOMEM if a resource has been exhausted
+ * %-EIO if an rdma-rw error occurred
*/
-static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
- struct xdr_buf *xdr)
+static int svc_rdma_pages_write(struct svc_rdma_write_info *info,
+ const struct xdr_buf *xdr,
+ unsigned int offset,
+ unsigned long length)
{
info->wi_xdr = xdr;
- info->wi_next_off = 0;
+ info->wi_next_off = offset - xdr->head[0].iov_len;
return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
- xdr->page_len);
+ length);
+}
+
+/**
+ * svc_rdma_xb_write - Construct RDMA Writes to write an xdr_buf
+ * @xdr: xdr_buf to write
+ * @data: pointer to write arguments
+ *
+ * Returns:
+ * On success, returns zero
+ * %-E2BIG if the client-provided Write chunk is too small
+ * %-ENOMEM if a resource has been exhausted
+ * %-EIO if an rdma-rw error occurred
+ */
+static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
+{
+ struct svc_rdma_write_info *info = data;
+ int ret;
+
+ if (xdr->head[0].iov_len) {
+ ret = svc_rdma_iov_write(info, &xdr->head[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (xdr->page_len) {
+ ret = svc_rdma_pages_write(info, xdr, xdr->head[0].iov_len,
+ xdr->page_len);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (xdr->tail[0].iov_len) {
+ ret = svc_rdma_iov_write(info, &xdr->tail[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ return xdr->len;
}
/**
* svc_rdma_send_write_chunk - Write all segments in a Write chunk
* @rdma: controlling RDMA transport
- * @wr_ch: Write chunk provided by client
+ * @chunk: Write chunk provided by the client
* @xdr: xdr_buf containing the data payload
*
* Returns a non-negative number of bytes the chunk consumed, or
@@ -509,29 +611,28 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
* %-ENOTCONN if posting failed (connection is lost),
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
*/
-int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
- struct xdr_buf *xdr)
+int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_chunk *chunk,
+ const struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
+ struct svc_rdma_chunk_ctxt *cc;
int ret;
- if (!xdr->page_len)
- return 0;
-
- info = svc_rdma_write_info_alloc(rdma, wr_ch);
+ info = svc_rdma_write_info_alloc(rdma, chunk);
if (!info)
return -ENOMEM;
+ cc = &info->wi_cc;
- ret = svc_rdma_send_xdr_pagelist(info, xdr);
- if (ret < 0)
+ ret = svc_rdma_xb_write(xdr, info);
+ if (ret != xdr->len)
goto out_err;
- ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
+ ret = svc_rdma_post_chunk_ctxt(cc);
if (ret < 0)
goto out_err;
-
- trace_svcrdma_encode_write(xdr->page_len);
- return xdr->page_len;
+ return xdr->len;
out_err:
svc_rdma_write_info_free(info);
@@ -541,8 +642,7 @@ out_err:
/**
* svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
* @rdma: controlling RDMA transport
- * @rp_ch: Reply chunk provided by client
- * @writelist: true if client provided a Write list
+ * @rctxt: Write and Reply chunks from client
* @xdr: xdr_buf containing an RPC Reply
*
* Returns a non-negative number of bytes the chunk consumed, or
@@ -552,65 +652,68 @@ out_err:
* %-ENOTCONN if posting failed (connection is lost),
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
*/
-int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
- bool writelist, struct xdr_buf *xdr)
+int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ const struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
- int consumed, ret;
+ struct svc_rdma_chunk_ctxt *cc;
+ struct svc_rdma_chunk *chunk;
+ int ret;
- info = svc_rdma_write_info_alloc(rdma, rp_ch);
+ if (pcl_is_empty(&rctxt->rc_reply_pcl))
+ return 0;
+
+ chunk = pcl_first_chunk(&rctxt->rc_reply_pcl);
+ info = svc_rdma_write_info_alloc(rdma, chunk);
if (!info)
return -ENOMEM;
+ cc = &info->wi_cc;
- ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
+ ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_write, info);
if (ret < 0)
goto out_err;
- consumed = xdr->head[0].iov_len;
-
- /* Send the page list in the Reply chunk only if the
- * client did not provide Write chunks.
- */
- if (!writelist && xdr->page_len) {
- ret = svc_rdma_send_xdr_pagelist(info, xdr);
- if (ret < 0)
- goto out_err;
- consumed += xdr->page_len;
- }
-
- if (xdr->tail[0].iov_len) {
- ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
- if (ret < 0)
- goto out_err;
- consumed += xdr->tail[0].iov_len;
- }
- ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount);
+ ret = svc_rdma_post_chunk_ctxt(cc);
if (ret < 0)
goto out_err;
- trace_svcrdma_encode_reply(consumed);
- return consumed;
+ return xdr->len;
out_err:
svc_rdma_write_info_free(info);
return ret;
}
+/**
+ * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment
+ * @info: context for ongoing I/O
+ * @segment: co-ordinates of remote memory to be read
+ *
+ * Returns:
+ * %0: the Read WR chain was constructed successfully
+ * %-EINVAL: there were not enough rq_pages to finish
+ * %-ENOMEM: allocating a local resources failed
+ * %-EIO: a DMA mapping error occurred
+ */
static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
- struct svc_rqst *rqstp,
- u32 rkey, u32 len, u64 offset)
+ const struct svc_rdma_segment *segment)
{
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
+ struct svc_rqst *rqstp = info->ri_rqst;
+ unsigned int sge_no, seg_len, len;
struct svc_rdma_rw_ctxt *ctxt;
- unsigned int sge_no, seg_len;
struct scatterlist *sg;
int ret;
+ len = segment->rs_length;
sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT;
ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no);
if (!ctxt)
- goto out_noctx;
+ return -ENOMEM;
ctxt->rw_nents = sge_no;
sg = ctxt->rw_sg_table.sgl;
@@ -618,8 +721,6 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
seg_len = min_t(unsigned int, len,
PAGE_SIZE - info->ri_pageoff);
- head->rc_arg.pages[info->ri_pageno] =
- rqstp->rq_pages[info->ri_pageno];
if (!info->ri_pageoff)
head->rc_page_count++;
@@ -640,102 +741,198 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
goto out_overrun;
}
- ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp,
- cc->cc_rdma->sc_port_num,
- ctxt->rw_sg_table.sgl, ctxt->rw_nents,
- 0, offset, rkey, DMA_FROM_DEVICE);
+ ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset,
+ segment->rs_handle, DMA_FROM_DEVICE);
if (ret < 0)
- goto out_initerr;
+ return -EIO;
+ percpu_counter_inc(&svcrdma_stat_read);
list_add(&ctxt->rw_list, &cc->cc_rwctxts);
cc->cc_sqecount += ret;
return 0;
-out_noctx:
- dprintk("svcrdma: no R/W ctxs available\n");
- return -ENOMEM;
-
out_overrun:
- dprintk("svcrdma: request overruns rq_pages\n");
+ trace_svcrdma_page_overrun_err(cc->cc_rdma, rqstp, info->ri_pageno);
return -EINVAL;
-
-out_initerr:
- trace_svcrdma_dma_map_rwctx(cc->cc_rdma, ret);
- svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt);
- return -EIO;
}
-/* Walk the segments in the Read chunk starting at @p and construct
- * RDMA Read operations to pull the chunk to the server.
+/**
+ * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk
+ * @info: context for ongoing I/O
+ * @chunk: Read chunk to pull
+ *
+ * Return values:
+ * %0: the Read WR chain was constructed successfully
+ * %-EINVAL: there were not enough resources to finish
+ * %-ENOMEM: allocating a local resources failed
+ * %-EIO: a DMA mapping error occurred
*/
-static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
- struct svc_rdma_read_info *info,
- __be32 *p)
+static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info,
+ const struct svc_rdma_chunk *chunk)
{
- unsigned int i;
+ const struct svc_rdma_segment *segment;
int ret;
ret = -EINVAL;
- info->ri_chunklen = 0;
- while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) {
- u32 rs_handle, rs_length;
- u64 rs_offset;
-
- rs_handle = be32_to_cpup(p++);
- rs_length = be32_to_cpup(p++);
- p = xdr_decode_hyper(p, &rs_offset);
-
- ret = svc_rdma_build_read_segment(info, rqstp,
- rs_handle, rs_length,
- rs_offset);
+ pcl_for_each_segment(segment, chunk) {
+ ret = svc_rdma_build_read_segment(info, segment);
if (ret < 0)
break;
+ info->ri_totalbytes += segment->rs_length;
+ }
+ return ret;
+}
+
+/**
+ * svc_rdma_copy_inline_range - Copy part of the inline content into pages
+ * @info: context for RDMA Reads
+ * @offset: offset into the Receive buffer of region to copy
+ * @remaining: length of region to copy
+ *
+ * Take a page at a time from rqstp->rq_pages and copy the inline
+ * content from the Receive buffer into that page. Update
+ * info->ri_pageno and info->ri_pageoff so that the next RDMA Read
+ * result will land contiguously with the copied content.
+ *
+ * Return values:
+ * %0: Inline content was successfully copied
+ * %-EINVAL: offset or length was incorrect
+ */
+static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
+ unsigned int offset,
+ unsigned int remaining)
+{
+ struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ unsigned char *dst, *src = head->rc_recv_buf;
+ struct svc_rqst *rqstp = info->ri_rqst;
+ unsigned int page_no, numpages;
- trace_svcrdma_encode_rseg(rs_handle, rs_length, rs_offset);
- info->ri_chunklen += rs_length;
+ numpages = PAGE_ALIGN(info->ri_pageoff + remaining) >> PAGE_SHIFT;
+ for (page_no = 0; page_no < numpages; page_no++) {
+ unsigned int page_len;
+
+ page_len = min_t(unsigned int, remaining,
+ PAGE_SIZE - info->ri_pageoff);
+
+ if (!info->ri_pageoff)
+ head->rc_page_count++;
+
+ dst = page_address(rqstp->rq_pages[info->ri_pageno]);
+ memcpy(dst + info->ri_pageno, src + offset, page_len);
+
+ info->ri_totalbytes += page_len;
+ info->ri_pageoff += page_len;
+ if (info->ri_pageoff == PAGE_SIZE) {
+ info->ri_pageno++;
+ info->ri_pageoff = 0;
+ }
+ remaining -= page_len;
+ offset += page_len;
}
- /* Pages under I/O have been copied to head->rc_pages.
- * Prevent their premature release by svc_xprt_release() .
- */
- for (i = 0; i < info->ri_readctxt->rc_page_count; i++)
- rqstp->rq_pages[i] = NULL;
+ return -EINVAL;
+}
- return ret;
+/**
+ * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks
+ * @info: context for RDMA Reads
+ *
+ * The chunk data lands in rqstp->rq_arg as a series of contiguous pages,
+ * like an incoming TCP call.
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info)
+{
+ struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
+ struct xdr_buf *buf = &info->ri_rqst->rq_arg;
+ struct svc_rdma_chunk *chunk, *next;
+ unsigned int start, length;
+ int ret;
+
+ start = 0;
+ chunk = pcl_first_chunk(pcl);
+ length = chunk->ch_position;
+ ret = svc_rdma_copy_inline_range(info, start, length);
+ if (ret < 0)
+ return ret;
+
+ pcl_for_each_chunk(chunk, pcl) {
+ ret = svc_rdma_build_read_chunk(info, chunk);
+ if (ret < 0)
+ return ret;
+
+ next = pcl_next_chunk(pcl, chunk);
+ if (!next)
+ break;
+
+ start += length;
+ length = next->ch_position - info->ri_totalbytes;
+ ret = svc_rdma_copy_inline_range(info, start, length);
+ if (ret < 0)
+ return ret;
+ }
+
+ start += length;
+ length = head->rc_byte_len - start;
+ ret = svc_rdma_copy_inline_range(info, start, length);
+ if (ret < 0)
+ return ret;
+
+ buf->len += info->ri_totalbytes;
+ buf->buflen += info->ri_totalbytes;
+
+ buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
+ buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+ buf->pages = &info->ri_rqst->rq_pages[1];
+ buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
+ return 0;
}
-/* Construct RDMA Reads to pull over a normal Read chunk. The chunk
- * data lands in the page list of head->rc_arg.pages.
+/**
+ * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks
+ * @info: context for RDMA Reads
+ *
+ * The chunk data lands in the page list of rqstp->rq_arg.pages.
*
- * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
+ * Currently NFSD does not look at the rqstp->rq_arg.tail[0] kvec.
* Therefore, XDR round-up of the Read chunk and trailing
* inline content must both be added at the end of the pagelist.
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
*/
-static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
- struct svc_rdma_read_info *info,
- __be32 *p)
+static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
{
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ struct xdr_buf *buf = &info->ri_rqst->rq_arg;
+ struct svc_rdma_chunk *chunk;
+ unsigned int length;
int ret;
- ret = svc_rdma_build_read_chunk(rqstp, info, p);
+ chunk = pcl_first_chunk(&head->rc_read_pcl);
+ ret = svc_rdma_build_read_chunk(info, chunk);
if (ret < 0)
goto out;
- trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position);
-
- head->rc_hdr_count = 0;
-
/* Split the Receive buffer between the head and tail
* buffers at Read chunk's position. XDR roundup of the
* chunk is not included in either the pagelist or in
* the tail.
*/
- head->rc_arg.tail[0].iov_base =
- head->rc_arg.head[0].iov_base + info->ri_position;
- head->rc_arg.tail[0].iov_len =
- head->rc_arg.head[0].iov_len - info->ri_position;
- head->rc_arg.head[0].iov_len = info->ri_position;
+ buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position;
+ buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position;
+ buf->head[0].iov_len = chunk->ch_position;
/* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2).
*
@@ -746,109 +943,221 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
* Currently these chunks always start at page offset 0,
* thus the rounded-up length never crosses a page boundary.
*/
- info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2;
-
- head->rc_arg.page_len = info->ri_chunklen;
- head->rc_arg.len += info->ri_chunklen;
- head->rc_arg.buflen += info->ri_chunklen;
+ buf->pages = &info->ri_rqst->rq_pages[0];
+ length = xdr_align_size(chunk->ch_length);
+ buf->page_len = length;
+ buf->len += length;
+ buf->buflen += length;
out:
return ret;
}
-/* Construct RDMA Reads to pull over a Position Zero Read chunk.
- * The start of the data lands in the first page just after
- * the Transport header, and the rest lands in the page list of
- * head->rc_arg.pages.
+/**
+ * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk
+ * @info: context for RDMA Reads
+ * @chunk: parsed Call chunk to pull
+ * @offset: offset of region to pull
+ * @length: length of region to pull
*
- * Assumptions:
- * - A PZRC has an XDR-aligned length (no implicit round-up).
- * - There can be no trailing inline content (IOW, we assume
- * a PZRC is never sent in an RDMA_MSG message, though it's
- * allowed by spec).
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: there were not enough resources to finish
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
*/
-static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp,
- struct svc_rdma_read_info *info,
- __be32 *p)
+static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info,
+ const struct svc_rdma_chunk *chunk,
+ unsigned int offset, unsigned int length)
+{
+ const struct svc_rdma_segment *segment;
+ int ret;
+
+ ret = -EINVAL;
+ pcl_for_each_segment(segment, chunk) {
+ struct svc_rdma_segment dummy;
+
+ if (offset > segment->rs_length) {
+ offset -= segment->rs_length;
+ continue;
+ }
+
+ dummy.rs_handle = segment->rs_handle;
+ dummy.rs_length = min_t(u32, length, segment->rs_length) - offset;
+ dummy.rs_offset = segment->rs_offset + offset;
+
+ ret = svc_rdma_build_read_segment(info, &dummy);
+ if (ret < 0)
+ break;
+
+ info->ri_totalbytes += dummy.rs_length;
+ length -= dummy.rs_length;
+ offset = 0;
+ }
+ return ret;
+}
+
+/**
+ * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message
+ * @info: context for RDMA Reads
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: there were not enough resources to finish
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
{
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ const struct svc_rdma_chunk *call_chunk =
+ pcl_first_chunk(&head->rc_call_pcl);
+ const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
+ struct svc_rdma_chunk *chunk, *next;
+ unsigned int start, length;
int ret;
- ret = svc_rdma_build_read_chunk(rqstp, info, p);
+ if (pcl_is_empty(pcl))
+ return svc_rdma_build_read_chunk(info, call_chunk);
+
+ start = 0;
+ chunk = pcl_first_chunk(pcl);
+ length = chunk->ch_position;
+ ret = svc_rdma_read_chunk_range(info, call_chunk, start, length);
if (ret < 0)
- goto out;
+ return ret;
- trace_svcrdma_encode_pzr(info->ri_chunklen);
+ pcl_for_each_chunk(chunk, pcl) {
+ ret = svc_rdma_build_read_chunk(info, chunk);
+ if (ret < 0)
+ return ret;
- head->rc_arg.len += info->ri_chunklen;
- head->rc_arg.buflen += info->ri_chunklen;
+ next = pcl_next_chunk(pcl, chunk);
+ if (!next)
+ break;
- head->rc_hdr_count = 1;
- head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]);
- head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE,
- info->ri_chunklen);
+ start += length;
+ length = next->ch_position - info->ri_totalbytes;
+ ret = svc_rdma_read_chunk_range(info, call_chunk,
+ start, length);
+ if (ret < 0)
+ return ret;
+ }
- head->rc_arg.page_len = info->ri_chunklen -
- head->rc_arg.head[0].iov_len;
+ start += length;
+ length = call_chunk->ch_length - start;
+ return svc_rdma_read_chunk_range(info, call_chunk, start, length);
+}
+
+/**
+ * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message
+ * @info: context for RDMA Reads
+ *
+ * The start of the data lands in the first page just after the
+ * Transport header, and the rest lands in rqstp->rq_arg.pages.
+ *
+ * Assumptions:
+ * - A PZRC is never sent in an RDMA_MSG message, though it's
+ * allowed by spec.
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
+{
+ struct xdr_buf *buf = &info->ri_rqst->rq_arg;
+ int ret;
+
+ ret = svc_rdma_read_call_chunk(info);
+ if (ret < 0)
+ goto out;
+
+ buf->len += info->ri_totalbytes;
+ buf->buflen += info->ri_totalbytes;
+
+ buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
+ buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+ buf->pages = &info->ri_rqst->rq_pages[1];
+ buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
out:
return ret;
}
/**
- * svc_rdma_recv_read_chunk - Pull a Read chunk from the client
+ * svc_rdma_process_read_list - Pull list of Read chunks from the client
* @rdma: controlling RDMA transport
* @rqstp: set of pages to use as Read sink buffers
* @head: pages under I/O collect here
- * @p: pointer to start of Read chunk
*
- * Returns:
- * %0 if all needed RDMA Reads were posted successfully,
- * %-EINVAL if client provided too many segments,
- * %-ENOMEM if rdma_rw context pool was exhausted,
- * %-ENOTCONN if posting failed (connection is lost),
- * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ * The RPC/RDMA protocol assumes that the upper layer's XDR decoders
+ * pull each Read chunk as they decode an incoming RPC message.
*
- * Assumptions:
- * - All Read segments in @p have the same Position value.
+ * On Linux, however, the server needs to have a fully-constructed RPC
+ * message in rqstp->rq_arg when there is a positive return code from
+ * ->xpo_recvfrom. So the Read list is safety-checked immediately when
+ * it is received, then here the whole Read list is pulled all at once.
+ * The ingress RPC message is fully reconstructed once all associated
+ * RDMA Reads have completed.
+ *
+ * Return values:
+ * %1: all needed RDMA Reads were posted successfully,
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
*/
-int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
- struct svc_rdma_recv_ctxt *head, __be32 *p)
+int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_recv_ctxt *head)
{
struct svc_rdma_read_info *info;
+ struct svc_rdma_chunk_ctxt *cc;
int ret;
- /* The request (with page list) is constructed in
- * head->rc_arg. Pages involved with RDMA Read I/O are
- * transferred there.
- */
- head->rc_arg.head[0] = rqstp->rq_arg.head[0];
- head->rc_arg.tail[0] = rqstp->rq_arg.tail[0];
- head->rc_arg.pages = head->rc_pages;
- head->rc_arg.page_base = 0;
- head->rc_arg.page_len = 0;
- head->rc_arg.len = rqstp->rq_arg.len;
- head->rc_arg.buflen = rqstp->rq_arg.buflen;
-
info = svc_rdma_read_info_alloc(rdma);
if (!info)
return -ENOMEM;
+ cc = &info->ri_cc;
+ info->ri_rqst = rqstp;
info->ri_readctxt = head;
info->ri_pageno = 0;
info->ri_pageoff = 0;
-
- info->ri_position = be32_to_cpup(p + 1);
- if (info->ri_position)
- ret = svc_rdma_build_normal_read_chunk(rqstp, info, p);
- else
- ret = svc_rdma_build_pz_read_chunk(rqstp, info, p);
+ info->ri_totalbytes = 0;
+
+ if (pcl_is_empty(&head->rc_call_pcl)) {
+ if (head->rc_read_pcl.cl_count == 1)
+ ret = svc_rdma_read_data_item(info);
+ else
+ ret = svc_rdma_read_multiple_chunks(info);
+ } else
+ ret = svc_rdma_read_special(info);
if (ret < 0)
goto out_err;
- ret = svc_rdma_post_chunk_ctxt(&info->ri_cc);
+ trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
+ init_completion(&cc->cc_done);
+ ret = svc_rdma_post_chunk_ctxt(cc);
if (ret < 0)
goto out_err;
- return 0;
+
+ ret = 1;
+ wait_for_completion(&cc->cc_done);
+ if (cc->cc_status != IB_WC_SUCCESS)
+ ret = -EIO;
+
+ /* rq_respages starts after the last arg page */
+ rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ /* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */
+ head->rc_page_count = 0;
out_err:
svc_rdma_read_info_free(info);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index f3f108090aa4..22a871e6fe4d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -106,21 +106,18 @@
#include <rdma/rdma_cm.h>
#include <linux/sunrpc/debug.h>
-#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
-#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-
static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc);
-static inline struct svc_rdma_send_ctxt *
-svc_rdma_next_send_ctxt(struct list_head *list)
+static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
+ struct rpc_rdma_cid *cid)
{
- return list_first_entry_or_null(list, struct svc_rdma_send_ctxt,
- sc_list);
+ cid->ci_queue_id = rdma->sc_sq_cq->res.id;
+ cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
}
static struct svc_rdma_send_ctxt *
@@ -145,12 +142,17 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
goto fail2;
+ svc_rdma_send_cid_init(rdma, &ctxt->sc_cid);
+
ctxt->sc_send_wr.next = NULL;
ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
+ init_completion(&ctxt->sc_done);
ctxt->sc_cqe.done = svc_rdma_wc_send;
ctxt->sc_xprt_buf = buffer;
+ xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
+ rdma->sc_max_req_size);
ctxt->sc_sges[0].addr = addr;
for (i = 0; i < rdma->sc_max_send_sges; i++)
@@ -173,9 +175,10 @@ fail0:
void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma)
{
struct svc_rdma_send_ctxt *ctxt;
+ struct llist_node *node;
- while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) {
- list_del(&ctxt->sc_list);
+ while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) {
+ ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node);
ib_dma_unmap_single(rdma->sc_pd->device,
ctxt->sc_sges[0].addr,
rdma->sc_max_req_size,
@@ -195,18 +198,22 @@ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma)
struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
{
struct svc_rdma_send_ctxt *ctxt;
+ struct llist_node *node;
spin_lock(&rdma->sc_send_lock);
- ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts);
- if (!ctxt)
+ node = llist_del_first(&rdma->sc_send_ctxts);
+ if (!node)
goto out_empty;
- list_del(&ctxt->sc_list);
+ ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node);
spin_unlock(&rdma->sc_send_lock);
out:
+ rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0);
+ xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf,
+ ctxt->sc_xprt_buf, NULL);
+
ctxt->sc_send_wr.num_sge = 0;
ctxt->sc_cur_sge_no = 0;
- ctxt->sc_page_count = 0;
return ctxt;
out_empty:
@@ -221,8 +228,6 @@ out_empty:
* svc_rdma_send_ctxt_put - Return send_ctxt to free list
* @rdma: controlling svcxprt_rdma
* @ctxt: object to return to the free list
- *
- * Pages left in sc_pages are DMA unmapped and released.
*/
void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *ctxt)
@@ -243,12 +248,21 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
ctxt->sc_sges[i].length);
}
- for (i = 0; i < ctxt->sc_page_count; ++i)
- put_page(ctxt->sc_pages[i]);
+ llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts);
+}
- spin_lock(&rdma->sc_send_lock);
- list_add(&ctxt->sc_list, &rdma->sc_send_ctxts);
- spin_unlock(&rdma->sc_send_lock);
+/**
+ * svc_rdma_wake_send_waiters - manage Send Queue accounting
+ * @rdma: controlling transport
+ * @avail: Number of additional SQEs that are now available
+ *
+ */
+void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail)
+{
+ atomic_add(avail, &rdma->sc_sq_avail);
+ smp_mb__after_atomic();
+ if (unlikely(waitqueue_active(&rdma->sc_send_wait)))
+ wake_up(&rdma->sc_send_wait);
}
/**
@@ -263,42 +277,51 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
struct svcxprt_rdma *rdma = cq->cq_context;
struct ib_cqe *cqe = wc->wr_cqe;
- struct svc_rdma_send_ctxt *ctxt;
+ struct svc_rdma_send_ctxt *ctxt =
+ container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
- trace_svcrdma_wc_send(wc);
+ svc_rdma_wake_send_waiters(rdma, 1);
+ complete(&ctxt->sc_done);
- atomic_inc(&rdma->sc_sq_avail);
- wake_up(&rdma->sc_send_wait);
+ if (unlikely(wc->status != IB_WC_SUCCESS))
+ goto flushed;
- ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
- svc_rdma_send_ctxt_put(rdma, ctxt);
+ trace_svcrdma_wc_send(wc, &ctxt->sc_cid);
+ return;
- if (unlikely(wc->status != IB_WC_SUCCESS)) {
- set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
- svc_xprt_enqueue(&rdma->sc_xprt);
- }
-
- svc_xprt_put(&rdma->sc_xprt);
+flushed:
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid);
+ else
+ trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid);
+ svc_xprt_deferred_close(&rdma->sc_xprt);
}
/**
* svc_rdma_send - Post a single Send WR
* @rdma: transport on which to post the WR
- * @wr: prepared Send WR to post
+ * @ctxt: send ctxt with a Send WR ready to post
*
- * Returns zero the Send WR was posted successfully. Otherwise, a
+ * Returns zero if the Send WR was posted successfully. Otherwise, a
* negative errno is returned.
*/
-int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
+int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
{
+ struct ib_send_wr *wr = &ctxt->sc_send_wr;
int ret;
- might_sleep();
+ reinit_completion(&ctxt->sc_done);
+
+ /* Sync the transport header buffer */
+ ib_dma_sync_single_for_device(rdma->sc_pd->device,
+ wr->sg_list[0].addr,
+ wr->sg_list[0].length,
+ DMA_TO_DEVICE);
/* If the SQ is full, wait until an SQ entry is available */
while (1) {
if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) {
- atomic_inc(&rdma_stat_sq_starve);
+ percpu_counter_inc(&svcrdma_stat_sq_starve);
trace_svcrdma_sq_full(rdma);
atomic_inc(&rdma->sc_sq_avail);
wait_event(rdma->sc_send_wait,
@@ -309,423 +332,471 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
continue;
}
- svc_xprt_get(&rdma->sc_xprt);
+ trace_svcrdma_post_send(ctxt);
ret = ib_post_send(rdma->sc_qp, wr, NULL);
- trace_svcrdma_post_send(wr, ret);
- if (ret) {
- set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
- svc_xprt_put(&rdma->sc_xprt);
- wake_up(&rdma->sc_send_wait);
- }
- break;
+ if (ret)
+ break;
+ return 0;
}
+
+ trace_svcrdma_sq_post_err(rdma, ret);
+ svc_xprt_deferred_close(&rdma->sc_xprt);
+ wake_up(&rdma->sc_send_wait);
return ret;
}
-static u32 xdr_padsize(u32 len)
+/**
+ * svc_rdma_encode_read_list - Encode RPC Reply's Read chunk list
+ * @sctxt: Send context for the RPC Reply
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Reply Read list
+ * %-EMSGSIZE on XDR buffer overflow
+ */
+static ssize_t svc_rdma_encode_read_list(struct svc_rdma_send_ctxt *sctxt)
{
- return (len & 3) ? (4 - (len & 3)) : 0;
+ /* RPC-over-RDMA version 1 replies never have a Read list. */
+ return xdr_stream_encode_item_absent(&sctxt->sc_stream);
}
-/* Returns length of transport header, in bytes.
+/**
+ * svc_rdma_encode_write_segment - Encode one Write segment
+ * @sctxt: Send context for the RPC Reply
+ * @chunk: Write chunk to push
+ * @remaining: remaining bytes of the payload left in the Write chunk
+ * @segno: which segment in the chunk
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Write segment, and updates @remaining
+ * %-EMSGSIZE on XDR buffer overflow
*/
-static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
+static ssize_t svc_rdma_encode_write_segment(struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_chunk *chunk,
+ u32 *remaining, unsigned int segno)
{
- unsigned int nsegs;
+ const struct svc_rdma_segment *segment = &chunk->ch_segments[segno];
+ const size_t len = rpcrdma_segment_maxsz * sizeof(__be32);
+ u32 length;
__be32 *p;
- p = rdma_resp;
-
- /* RPC-over-RDMA V1 replies never have a Read list. */
- p += rpcrdma_fixed_maxsz + 1;
-
- /* Skip Write list. */
- while (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- /* Skip Reply chunk. */
- if (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- return (unsigned long)p - (unsigned long)rdma_resp;
+ p = xdr_reserve_space(&sctxt->sc_stream, len);
+ if (!p)
+ return -EMSGSIZE;
+
+ length = min_t(u32, *remaining, segment->rs_length);
+ *remaining -= length;
+ xdr_encode_rdma_segment(p, segment->rs_handle, length,
+ segment->rs_offset);
+ trace_svcrdma_encode_wseg(sctxt, segno, segment->rs_handle, length,
+ segment->rs_offset);
+ return len;
}
-/* One Write chunk is copied from Call transport header to Reply
- * transport header. Each segment's length field is updated to
- * reflect number of bytes consumed in the segment.
+/**
+ * svc_rdma_encode_write_chunk - Encode one Write chunk
+ * @sctxt: Send context for the RPC Reply
+ * @chunk: Write chunk to push
*
- * Returns number of segments in this chunk.
+ * Copy a Write chunk from the Call transport header to the
+ * Reply transport header. Update each segment's length field
+ * to reflect the number of bytes written in that segment.
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Write chunk
+ * %-EMSGSIZE on XDR buffer overflow
*/
-static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
- unsigned int remaining)
+static ssize_t svc_rdma_encode_write_chunk(struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_chunk *chunk)
{
- unsigned int i, nsegs;
- u32 seg_len;
-
- /* Write list discriminator */
- *dst++ = *src++;
-
- /* number of segments in this chunk */
- nsegs = be32_to_cpup(src);
- *dst++ = *src++;
-
- for (i = nsegs; i; i--) {
- /* segment's RDMA handle */
- *dst++ = *src++;
-
- /* bytes returned in this segment */
- seg_len = be32_to_cpu(*src);
- if (remaining >= seg_len) {
- /* entire segment was consumed */
- *dst = *src;
- remaining -= seg_len;
- } else {
- /* segment only partly filled */
- *dst = cpu_to_be32(remaining);
- remaining = 0;
- }
- dst++; src++;
+ u32 remaining = chunk->ch_payload_length;
+ unsigned int segno;
+ ssize_t len, ret;
- /* segment's RDMA offset */
- *dst++ = *src++;
- *dst++ = *src++;
- }
+ len = 0;
+ ret = xdr_stream_encode_item_present(&sctxt->sc_stream);
+ if (ret < 0)
+ return ret;
+ len += ret;
- return nsegs;
-}
+ ret = xdr_stream_encode_u32(&sctxt->sc_stream, chunk->ch_segcount);
+ if (ret < 0)
+ return ret;
+ len += ret;
-/* The client provided a Write list in the Call message. Fill in
- * the segments in the first Write chunk in the Reply's transport
- * header with the number of bytes consumed in each segment.
- * Remaining chunks are returned unused.
- *
- * Assumptions:
- * - Client has provided only one Write chunk
- */
-static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
- unsigned int consumed)
-{
- unsigned int nsegs;
- __be32 *p, *q;
-
- /* RPC-over-RDMA V1 replies never have a Read list. */
- p = rdma_resp + rpcrdma_fixed_maxsz + 1;
-
- q = wr_ch;
- while (*q != xdr_zero) {
- nsegs = xdr_encode_write_chunk(p, q, consumed);
- q += 2 + nsegs * rpcrdma_segment_maxsz;
- p += 2 + nsegs * rpcrdma_segment_maxsz;
- consumed = 0;
+ for (segno = 0; segno < chunk->ch_segcount; segno++) {
+ ret = svc_rdma_encode_write_segment(sctxt, chunk, &remaining, segno);
+ if (ret < 0)
+ return ret;
+ len += ret;
}
- /* Terminate Write list */
- *p++ = xdr_zero;
-
- /* Reply chunk discriminator; may be replaced later */
- *p = xdr_zero;
+ return len;
}
-/* The client provided a Reply chunk in the Call message. Fill in
- * the segments in the Reply chunk in the Reply message with the
- * number of bytes consumed in each segment.
+/**
+ * svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list
+ * @rctxt: Reply context with information about the RPC Call
+ * @sctxt: Send context for the RPC Reply
*
- * Assumptions:
- * - Reply can always fit in the provided Reply chunk
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Reply's Write list
+ * %-EMSGSIZE on XDR buffer overflow
*/
-static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
- unsigned int consumed)
+static ssize_t svc_rdma_encode_write_list(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_send_ctxt *sctxt)
{
- __be32 *p;
+ struct svc_rdma_chunk *chunk;
+ ssize_t len, ret;
- /* Find the Reply chunk in the Reply's xprt header.
- * RPC-over-RDMA V1 replies never have a Read list.
- */
- p = rdma_resp + rpcrdma_fixed_maxsz + 1;
+ len = 0;
+ pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) {
+ ret = svc_rdma_encode_write_chunk(sctxt, chunk);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ }
- /* Skip past Write list */
- while (*p++ != xdr_zero)
- p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+ /* Terminate the Write list */
+ ret = xdr_stream_encode_item_absent(&sctxt->sc_stream);
+ if (ret < 0)
+ return ret;
- xdr_encode_write_chunk(p, rp_ch, consumed);
+ return len + ret;
}
-/* Parse the RPC Call's transport header.
+/**
+ * svc_rdma_encode_reply_chunk - Encode RPC Reply's Reply chunk
+ * @rctxt: Reply context with information about the RPC Call
+ * @sctxt: Send context for the RPC Reply
+ * @length: size in bytes of the payload in the Reply chunk
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Reply's Reply chunk
+ * %-EMSGSIZE on XDR buffer overflow
+ * %-E2BIG if the RPC message is larger than the Reply chunk
*/
-static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
- __be32 **write, __be32 **reply)
+static ssize_t
+svc_rdma_encode_reply_chunk(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_send_ctxt *sctxt,
+ unsigned int length)
{
- __be32 *p;
-
- p = rdma_argp + rpcrdma_fixed_maxsz;
+ struct svc_rdma_chunk *chunk;
- /* Read list */
- while (*p++ != xdr_zero)
- p += 5;
+ if (pcl_is_empty(&rctxt->rc_reply_pcl))
+ return xdr_stream_encode_item_absent(&sctxt->sc_stream);
- /* Write list */
- if (*p != xdr_zero) {
- *write = p;
- while (*p++ != xdr_zero)
- p += 1 + be32_to_cpu(*p) * 4;
- } else {
- *write = NULL;
- p++;
- }
+ chunk = pcl_first_chunk(&rctxt->rc_reply_pcl);
+ if (length > chunk->ch_length)
+ return -E2BIG;
- /* Reply chunk */
- if (*p != xdr_zero)
- *reply = p;
- else
- *reply = NULL;
+ chunk->ch_payload_length = length;
+ return svc_rdma_encode_write_chunk(sctxt, chunk);
}
-static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- struct page *page,
- unsigned long offset,
- unsigned int len)
+struct svc_rdma_map_data {
+ struct svcxprt_rdma *md_rdma;
+ struct svc_rdma_send_ctxt *md_ctxt;
+};
+
+/**
+ * svc_rdma_page_dma_map - DMA map one page
+ * @data: pointer to arguments
+ * @page: struct page to DMA map
+ * @offset: offset into the page
+ * @len: number of bytes to map
+ *
+ * Returns:
+ * %0 if DMA mapping was successful
+ * %-EIO if the page cannot be DMA mapped
+ */
+static int svc_rdma_page_dma_map(void *data, struct page *page,
+ unsigned long offset, unsigned int len)
{
+ struct svc_rdma_map_data *args = data;
+ struct svcxprt_rdma *rdma = args->md_rdma;
+ struct svc_rdma_send_ctxt *ctxt = args->md_ctxt;
struct ib_device *dev = rdma->sc_cm_id->device;
dma_addr_t dma_addr;
+ ++ctxt->sc_cur_sge_no;
+
dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
- trace_svcrdma_dma_map_page(rdma, dma_addr, len);
if (ib_dma_mapping_error(dev, dma_addr))
goto out_maperr;
+ trace_svcrdma_dma_map_page(rdma, dma_addr, len);
ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr;
ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len;
ctxt->sc_send_wr.num_sge++;
return 0;
out_maperr:
+ trace_svcrdma_dma_map_err(rdma, dma_addr, len);
return -EIO;
}
-/* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
+/**
+ * svc_rdma_iov_dma_map - DMA map an iovec
+ * @data: pointer to arguments
+ * @iov: kvec to DMA map
+ *
+ * ib_dma_map_page() is used here because svc_rdma_dma_unmap()
* handles DMA-unmap and it uses ib_dma_unmap_page() exclusively.
+ *
+ * Returns:
+ * %0 if DMA mapping was successful
+ * %-EIO if the iovec cannot be DMA mapped
*/
-static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- unsigned char *base,
- unsigned int len)
+static int svc_rdma_iov_dma_map(void *data, const struct kvec *iov)
{
- return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base),
- offset_in_page(base), len);
+ if (!iov->iov_len)
+ return 0;
+ return svc_rdma_page_dma_map(data, virt_to_page(iov->iov_base),
+ offset_in_page(iov->iov_base),
+ iov->iov_len);
}
/**
- * svc_rdma_sync_reply_hdr - DMA sync the transport header buffer
- * @rdma: controlling transport
- * @ctxt: send_ctxt for the Send WR
- * @len: length of transport header
+ * svc_rdma_xb_dma_map - DMA map all segments of an xdr_buf
+ * @xdr: xdr_buf containing portion of an RPC message to transmit
+ * @data: pointer to arguments
+ *
+ * Returns:
+ * %0 if DMA mapping was successful
+ * %-EIO if DMA mapping failed
*
+ * On failure, any DMA mappings that have been already done must be
+ * unmapped by the caller.
*/
-void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- unsigned int len)
+static int svc_rdma_xb_dma_map(const struct xdr_buf *xdr, void *data)
{
- ctxt->sc_sges[0].length = len;
- ctxt->sc_send_wr.num_sge++;
- ib_dma_sync_single_for_device(rdma->sc_pd->device,
- ctxt->sc_sges[0].addr, len,
- DMA_TO_DEVICE);
-}
+ unsigned int len, remaining;
+ unsigned long pageoff;
+ struct page **ppages;
+ int ret;
-/* If the xdr_buf has more elements than the device can
- * transmit in a single RDMA Send, then the reply will
- * have to be copied into a bounce buffer.
- */
-static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
- struct xdr_buf *xdr,
- __be32 *wr_lst)
-{
- int elements;
-
- /* xdr->head */
- elements = 1;
-
- /* xdr->pages */
- if (!wr_lst) {
- unsigned int remaining;
- unsigned long pageoff;
-
- pageoff = xdr->page_base & ~PAGE_MASK;
- remaining = xdr->page_len;
- while (remaining) {
- ++elements;
- remaining -= min_t(u32, PAGE_SIZE - pageoff,
- remaining);
- pageoff = 0;
- }
+ ret = svc_rdma_iov_dma_map(data, &xdr->head[0]);
+ if (ret < 0)
+ return ret;
+
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ pageoff = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ len = min_t(u32, PAGE_SIZE - pageoff, remaining);
+
+ ret = svc_rdma_page_dma_map(data, *ppages++, pageoff, len);
+ if (ret < 0)
+ return ret;
+
+ remaining -= len;
+ pageoff = 0;
}
- /* xdr->tail */
- if (xdr->tail[0].iov_len)
- ++elements;
+ ret = svc_rdma_iov_dma_map(data, &xdr->tail[0]);
+ if (ret < 0)
+ return ret;
- /* assume 1 SGE is needed for the transport header */
- return elements >= rdma->sc_max_send_sges;
+ return xdr->len;
}
-/* The device is not capable of sending the reply directly.
- * Assemble the elements of @xdr into the transport header
- * buffer.
+struct svc_rdma_pullup_data {
+ u8 *pd_dest;
+ unsigned int pd_length;
+ unsigned int pd_num_sges;
+};
+
+/**
+ * svc_rdma_xb_count_sges - Count how many SGEs will be needed
+ * @xdr: xdr_buf containing portion of an RPC message to transmit
+ * @data: pointer to arguments
+ *
+ * Returns:
+ * Number of SGEs needed to Send the contents of @xdr inline
*/
-static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- struct xdr_buf *xdr, __be32 *wr_lst)
+static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr,
+ void *data)
{
- unsigned char *dst, *tailbase;
- unsigned int taillen;
-
- dst = ctxt->sc_xprt_buf;
- dst += ctxt->sc_sges[0].length;
+ struct svc_rdma_pullup_data *args = data;
+ unsigned int remaining;
+ unsigned long offset;
- memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
- dst += xdr->head[0].iov_len;
+ if (xdr->head[0].iov_len)
+ ++args->pd_num_sges;
- tailbase = xdr->tail[0].iov_base;
- taillen = xdr->tail[0].iov_len;
- if (wr_lst) {
- u32 xdrpad;
-
- xdrpad = xdr_padsize(xdr->page_len);
- if (taillen && xdrpad) {
- tailbase += xdrpad;
- taillen -= xdrpad;
- }
- } else {
- unsigned int len, remaining;
- unsigned long pageoff;
- struct page **ppages;
-
- ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
- pageoff = xdr->page_base & ~PAGE_MASK;
- remaining = xdr->page_len;
- while (remaining) {
- len = min_t(u32, PAGE_SIZE - pageoff, remaining);
-
- memcpy(dst, page_address(*ppages), len);
- remaining -= len;
- dst += len;
- pageoff = 0;
- }
+ offset = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ ++args->pd_num_sges;
+ remaining -= min_t(u32, PAGE_SIZE - offset, remaining);
+ offset = 0;
}
- if (taillen)
- memcpy(dst, tailbase, taillen);
-
- ctxt->sc_sges[0].length += xdr->len;
- ib_dma_sync_single_for_device(rdma->sc_pd->device,
- ctxt->sc_sges[0].addr,
- ctxt->sc_sges[0].length,
- DMA_TO_DEVICE);
+ if (xdr->tail[0].iov_len)
+ ++args->pd_num_sges;
+ args->pd_length += xdr->len;
return 0;
}
-/* svc_rdma_map_reply_msg - Map the buffer holding RPC message
+/**
+ * svc_rdma_pull_up_needed - Determine whether to use pull-up
* @rdma: controlling transport
- * @ctxt: send_ctxt for the Send WR
- * @xdr: prepared xdr_buf containing RPC message
- * @wr_lst: pointer to Call header's Write list, or NULL
- *
- * Load the xdr_buf into the ctxt's sge array, and DMA map each
- * element as it is added.
+ * @sctxt: send_ctxt for the Send WR
+ * @rctxt: Write and Reply chunks provided by client
+ * @xdr: xdr_buf containing RPC message to transmit
*
- * Returns zero on success, or a negative errno on failure.
+ * Returns:
+ * %true if pull-up must be used
+ * %false otherwise
*/
-int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- struct xdr_buf *xdr, __be32 *wr_lst)
+static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma,
+ const struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ const struct xdr_buf *xdr)
{
- unsigned int len, remaining;
- unsigned long page_off;
- struct page **ppages;
- unsigned char *base;
- u32 xdr_pad;
+ /* Resources needed for the transport header */
+ struct svc_rdma_pullup_data args = {
+ .pd_length = sctxt->sc_hdrbuf.len,
+ .pd_num_sges = 1,
+ };
int ret;
- if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst))
- return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst);
-
- ++ctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_buf(rdma, ctxt,
- xdr->head[0].iov_base,
- xdr->head[0].iov_len);
+ ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_count_sges, &args);
if (ret < 0)
- return ret;
+ return false;
- /* If a Write chunk is present, the xdr_buf's page list
- * is not included inline. However the Upper Layer may
- * have added XDR padding in the tail buffer, and that
- * should not be included inline.
- */
- if (wr_lst) {
- base = xdr->tail[0].iov_base;
- len = xdr->tail[0].iov_len;
- xdr_pad = xdr_padsize(xdr->page_len);
-
- if (len && xdr_pad) {
- base += xdr_pad;
- len -= xdr_pad;
- }
+ if (args.pd_length < RPCRDMA_PULLUP_THRESH)
+ return true;
+ return args.pd_num_sges >= rdma->sc_max_send_sges;
+}
- goto tail;
+/**
+ * svc_rdma_xb_linearize - Copy region of xdr_buf to flat buffer
+ * @xdr: xdr_buf containing portion of an RPC message to copy
+ * @data: pointer to arguments
+ *
+ * Returns:
+ * Always zero.
+ */
+static int svc_rdma_xb_linearize(const struct xdr_buf *xdr,
+ void *data)
+{
+ struct svc_rdma_pullup_data *args = data;
+ unsigned int len, remaining;
+ unsigned long pageoff;
+ struct page **ppages;
+
+ if (xdr->head[0].iov_len) {
+ memcpy(args->pd_dest, xdr->head[0].iov_base, xdr->head[0].iov_len);
+ args->pd_dest += xdr->head[0].iov_len;
}
ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
- page_off = xdr->page_base & ~PAGE_MASK;
+ pageoff = offset_in_page(xdr->page_base);
remaining = xdr->page_len;
while (remaining) {
- len = min_t(u32, PAGE_SIZE - page_off, remaining);
-
- ++ctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++,
- page_off, len);
- if (ret < 0)
- return ret;
-
+ len = min_t(u32, PAGE_SIZE - pageoff, remaining);
+ memcpy(args->pd_dest, page_address(*ppages) + pageoff, len);
remaining -= len;
- page_off = 0;
+ args->pd_dest += len;
+ pageoff = 0;
+ ppages++;
}
- base = xdr->tail[0].iov_base;
- len = xdr->tail[0].iov_len;
-tail:
- if (len) {
- ++ctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len);
- if (ret < 0)
- return ret;
+ if (xdr->tail[0].iov_len) {
+ memcpy(args->pd_dest, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
+ args->pd_dest += xdr->tail[0].iov_len;
}
+ args->pd_length += xdr->len;
return 0;
}
-/* The svc_rqst and all resources it owns are released as soon as
- * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
- * so they are released by the Send completion handler.
+/**
+ * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer
+ * @rdma: controlling transport
+ * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared
+ * @rctxt: Write and Reply chunks provided by client
+ * @xdr: prepared xdr_buf containing RPC message
+ *
+ * The device is not capable of sending the reply directly.
+ * Assemble the elements of @xdr into the transport header buffer.
+ *
+ * Assumptions:
+ * pull_up_needed has determined that @xdr will fit in the buffer.
+ *
+ * Returns:
+ * %0 if pull-up was successful
+ * %-EMSGSIZE if a buffer manipulation problem occurred
*/
-static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
- struct svc_rdma_send_ctxt *ctxt)
+static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ const struct xdr_buf *xdr)
{
- int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
+ struct svc_rdma_pullup_data args = {
+ .pd_dest = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len,
+ };
+ int ret;
- ctxt->sc_page_count += pages;
- for (i = 0; i < pages; i++) {
- ctxt->sc_pages[i] = rqstp->rq_respages[i];
- rqstp->rq_respages[i] = NULL;
- }
+ ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_linearize, &args);
+ if (ret < 0)
+ return ret;
+
+ sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len + args.pd_length;
+ trace_svcrdma_send_pullup(sctxt, args.pd_length);
+ return 0;
+}
+
+/* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message
+ * @rdma: controlling transport
+ * @sctxt: send_ctxt for the Send WR
+ * @rctxt: Write and Reply chunks provided by client
+ * @xdr: prepared xdr_buf containing RPC message
+ *
+ * Returns:
+ * %0 if DMA mapping was successful.
+ * %-EMSGSIZE if a buffer manipulation problem occurred
+ * %-EIO if DMA mapping failed
+ *
+ * The Send WR's num_sge field is set in all cases.
+ */
+int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ const struct xdr_buf *xdr)
+{
+ struct svc_rdma_map_data args = {
+ .md_rdma = rdma,
+ .md_ctxt = sctxt,
+ };
- /* Prevent svc_xprt_release from releasing pages in rq_pages */
- rqstp->rq_next_page = rqstp->rq_respages;
+ /* Set up the (persistently-mapped) transport header SGE. */
+ sctxt->sc_send_wr.num_sge = 1;
+ sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
+
+ /* If there is a Reply chunk, nothing follows the transport
+ * header, and we're done here.
+ */
+ if (!pcl_is_empty(&rctxt->rc_reply_pcl))
+ return 0;
+
+ /* For pull-up, svc_rdma_send() will sync the transport header.
+ * No additional DMA mapping is necessary.
+ */
+ if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr))
+ return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr);
+
+ return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_dma_map, &args);
}
/* Prepare the portion of the RPC Reply that will be transmitted
@@ -748,20 +819,14 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
*/
static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *sctxt,
- struct svc_rdma_recv_ctxt *rctxt,
- struct svc_rqst *rqstp,
- __be32 *wr_lst, __be32 *rp_ch)
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rqst *rqstp)
{
int ret;
- if (!rp_ch) {
- ret = svc_rdma_map_reply_msg(rdma, sctxt,
- &rqstp->rq_res, wr_lst);
- if (ret < 0)
- return ret;
- }
-
- svc_rdma_save_io_pages(rqstp, sctxt);
+ ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqstp->rq_res);
+ if (ret < 0)
+ return ret;
if (rctxt->rc_inv_rkey) {
sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
@@ -769,42 +834,84 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
} else {
sctxt->sc_send_wr.opcode = IB_WR_SEND;
}
- dprintk("svcrdma: posting Send WR with %u sge(s)\n",
- sctxt->sc_send_wr.num_sge);
- return svc_rdma_send(rdma, &sctxt->sc_send_wr);
+
+ ret = svc_rdma_send(rdma, sctxt);
+ if (ret < 0)
+ return ret;
+
+ ret = wait_for_completion_killable(&sctxt->sc_done);
+ svc_rdma_send_ctxt_put(rdma, sctxt);
+ return ret;
}
-/* Given the client-provided Write and Reply chunks, the server was not
- * able to form a complete reply. Return an RDMA_ERROR message so the
- * client can retire this RPC transaction. As above, the Send completion
- * routine releases payload pages that were part of a previous RDMA Write.
+/**
+ * svc_rdma_send_error_msg - Send an RPC/RDMA v1 error response
+ * @rdma: controlling transport context
+ * @sctxt: Send context for the response
+ * @rctxt: Receive context for incoming bad message
+ * @status: negative errno indicating error that occurred
*
- * Remote Invalidation is skipped for simplicity.
+ * Given the client-provided Read, Write, and Reply chunks, the
+ * server was not able to parse the Call or form a complete Reply.
+ * Return an RDMA_ERROR message so the client can retire the RPC
+ * transaction.
+ *
+ * The caller does not have to release @sctxt. It is released by
+ * Send completion, or by this function on error.
*/
-static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- struct svc_rqst *rqstp)
+void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *sctxt,
+ struct svc_rdma_recv_ctxt *rctxt,
+ int status)
{
+ __be32 *rdma_argp = rctxt->rc_recv_buf;
__be32 *p;
- int ret;
- p = ctxt->sc_xprt_buf;
- trace_svcrdma_err_chunk(*p);
- p += 3;
- *p++ = rdma_error;
- *p = err_chunk;
- svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_ERR);
+ rpcrdma_set_xdrlen(&sctxt->sc_hdrbuf, 0);
+ xdr_init_encode(&sctxt->sc_stream, &sctxt->sc_hdrbuf,
+ sctxt->sc_xprt_buf, NULL);
- svc_rdma_save_io_pages(rqstp, ctxt);
+ p = xdr_reserve_space(&sctxt->sc_stream,
+ rpcrdma_fixed_maxsz * sizeof(*p));
+ if (!p)
+ goto put_ctxt;
- ctxt->sc_send_wr.opcode = IB_WR_SEND;
- ret = svc_rdma_send(rdma, &ctxt->sc_send_wr);
- if (ret) {
- svc_rdma_send_ctxt_put(rdma, ctxt);
- return ret;
+ *p++ = *rdma_argp;
+ *p++ = *(rdma_argp + 1);
+ *p++ = rdma->sc_fc_credits;
+ *p = rdma_error;
+
+ switch (status) {
+ case -EPROTONOSUPPORT:
+ p = xdr_reserve_space(&sctxt->sc_stream, 3 * sizeof(*p));
+ if (!p)
+ goto put_ctxt;
+
+ *p++ = err_vers;
+ *p++ = rpcrdma_version;
+ *p = rpcrdma_version;
+ trace_svcrdma_err_vers(*rdma_argp);
+ break;
+ default:
+ p = xdr_reserve_space(&sctxt->sc_stream, sizeof(*p));
+ if (!p)
+ goto put_ctxt;
+
+ *p = err_chunk;
+ trace_svcrdma_err_chunk(*rdma_argp);
}
- return 0;
+ /* Remote Invalidation is skipped for simplicity. */
+ sctxt->sc_send_wr.num_sge = 1;
+ sctxt->sc_send_wr.opcode = IB_WR_SEND;
+ sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
+ if (svc_rdma_send(rdma, sctxt))
+ goto put_ctxt;
+
+ wait_for_completion_killable(&sctxt->sc_done);
+
+put_ctxt:
+ svc_rdma_send_ctxt_put(rdma, sctxt);
}
/**
@@ -825,78 +932,113 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
struct svcxprt_rdma *rdma =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
- __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
- struct xdr_buf *xdr = &rqstp->rq_res;
+ __be32 *rdma_argp = rctxt->rc_recv_buf;
struct svc_rdma_send_ctxt *sctxt;
+ unsigned int rc_size;
+ __be32 *p;
int ret;
- rdma_argp = rctxt->rc_recv_buf;
- svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);
+ ret = -ENOTCONN;
+ if (svc_xprt_is_dead(xprt))
+ goto drop_connection;
- /* Create the RDMA response header. xprt->xpt_mutex,
- * acquired in svc_send(), serializes RPC replies. The
- * code path below that inserts the credit grant value
- * into each transport header runs only inside this
- * critical section.
- */
ret = -ENOMEM;
sctxt = svc_rdma_send_ctxt_get(rdma);
if (!sctxt)
- goto err0;
- rdma_resp = sctxt->sc_xprt_buf;
+ goto drop_connection;
+
+ ret = -EMSGSIZE;
+ p = xdr_reserve_space(&sctxt->sc_stream,
+ rpcrdma_fixed_maxsz * sizeof(*p));
+ if (!p)
+ goto put_ctxt;
+
+ ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
+ if (ret < 0)
+ goto reply_chunk;
+ rc_size = ret;
- p = rdma_resp;
*p++ = *rdma_argp;
*p++ = *(rdma_argp + 1);
*p++ = rdma->sc_fc_credits;
- *p++ = rp_ch ? rdma_nomsg : rdma_msg;
-
- /* Start with empty chunks */
- *p++ = xdr_zero;
- *p++ = xdr_zero;
- *p = xdr_zero;
+ *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? rdma_msg : rdma_nomsg;
- if (wr_lst) {
- /* XXX: Presume the client sent only one Write chunk */
- ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
- if (ret < 0)
- goto err2;
- svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
- }
- if (rp_ch) {
- ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
- if (ret < 0)
- goto err2;
- svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
- }
+ ret = svc_rdma_encode_read_list(sctxt);
+ if (ret < 0)
+ goto put_ctxt;
+ ret = svc_rdma_encode_write_list(rctxt, sctxt);
+ if (ret < 0)
+ goto put_ctxt;
+ ret = svc_rdma_encode_reply_chunk(rctxt, sctxt, rc_size);
+ if (ret < 0)
+ goto put_ctxt;
- svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp));
- ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp,
- wr_lst, rp_ch);
+ ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
if (ret < 0)
- goto err1;
- ret = 0;
+ goto put_ctxt;
-out:
- rqstp->rq_xprt_ctxt = NULL;
- svc_rdma_recv_ctxt_put(rdma, rctxt);
- return ret;
+ /* Prevent svc_xprt_release() from releasing the page backing
+ * rq_res.head[0].iov_base. It's no longer being accessed by
+ * the I/O device. */
+ rqstp->rq_respages++;
+ return 0;
- err2:
+reply_chunk:
if (ret != -E2BIG && ret != -EINVAL)
- goto err1;
+ goto put_ctxt;
- ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp);
- if (ret < 0)
- goto err1;
- ret = 0;
- goto out;
+ svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret);
+ return 0;
- err1:
+put_ctxt:
svc_rdma_send_ctxt_put(rdma, sctxt);
- err0:
- trace_svcrdma_send_failed(rqstp, ret);
- set_bit(XPT_CLOSE, &xprt->xpt_flags);
- ret = -ENOTCONN;
- goto out;
+drop_connection:
+ trace_svcrdma_send_err(rqstp, ret);
+ svc_xprt_deferred_close(&rdma->sc_xprt);
+ return -ENOTCONN;
+}
+
+/**
+ * svc_rdma_result_payload - special processing for a result payload
+ * @rqstp: svc_rqst to operate on
+ * @offset: payload's byte offset in @xdr
+ * @length: size of payload, in bytes
+ *
+ * Return values:
+ * %0 if successful or nothing needed to be done
+ * %-EMSGSIZE on XDR buffer overflow
+ * %-E2BIG if the payload was larger than the Write chunk
+ * %-EINVAL if client provided too many segments
+ * %-ENOMEM if rdma_rw context pool was exhausted
+ * %-ENOTCONN if posting failed (connection is lost)
+ * %-EIO if rdma_rw initialization failed (DMA mapping, etc)
+ */
+int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
+{
+ struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
+ struct svc_rdma_chunk *chunk;
+ struct svcxprt_rdma *rdma;
+ struct xdr_buf subbuf;
+ int ret;
+
+ chunk = rctxt->rc_cur_result_payload;
+ if (!length || !chunk)
+ return 0;
+ rctxt->rc_cur_result_payload =
+ pcl_next_chunk(&rctxt->rc_write_pcl, chunk);
+ if (length > chunk->ch_length)
+ return -E2BIG;
+
+ chunk->ch_position = offset;
+ chunk->ch_payload_length = length;
+
+ if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length))
+ return -EMSGSIZE;
+
+ rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
+ ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf);
+ if (ret < 0)
+ return ret;
+ return 0;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 145a3615c319..199fa012f18a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -55,7 +55,6 @@
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/debug.h>
-#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/sunrpc/svc_rdma.h>
@@ -71,7 +70,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
struct sockaddr *sa, int salen,
int flags);
static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
-static void svc_rdma_release_rqst(struct svc_rqst *);
static void svc_rdma_detach(struct svc_xprt *xprt);
static void svc_rdma_free(struct svc_xprt *xprt);
static int svc_rdma_has_wspace(struct svc_xprt *xprt);
@@ -82,6 +80,7 @@ static const struct svc_xprt_ops svc_rdma_ops = {
.xpo_create = svc_rdma_create,
.xpo_recvfrom = svc_rdma_recvfrom,
.xpo_sendto = svc_rdma_sendto,
+ .xpo_result_payload = svc_rdma_result_payload,
.xpo_release_rqst = svc_rdma_release_rqst,
.xpo_detach = svc_rdma_detach,
.xpo_free = svc_rdma_free,
@@ -120,8 +119,7 @@ static void qp_event_handler(struct ib_event *event, void *context)
case IB_EVENT_QP_ACCESS_ERR:
case IB_EVENT_DEVICE_FATAL:
default:
- set_bit(XPT_CLOSE, &xprt->xpt_flags);
- svc_xprt_enqueue(xprt);
+ svc_xprt_deferred_close(xprt);
break;
}
}
@@ -138,10 +136,9 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
- INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
- INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts);
+ init_llist_head(&cma_xprt->sc_send_ctxts);
init_llist_head(&cma_xprt->sc_recv_ctxts);
- INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
+ init_llist_head(&cma_xprt->sc_rw_ctxts);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
@@ -211,7 +208,12 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id,
newxprt->sc_ord = param->initiator_depth;
sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
- svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+ newxprt->sc_xprt.xpt_remotelen = svc_addr_len(sa);
+ memcpy(&newxprt->sc_xprt.xpt_remote, sa,
+ newxprt->sc_xprt.xpt_remotelen);
+ snprintf(newxprt->sc_xprt.xpt_remotebuf,
+ sizeof(newxprt->sc_xprt.xpt_remotebuf) - 1, "%pISc", sa);
+
/* The remote port is arbitrary and not under the control of the
* client ULP. Set it to a fixed value so that the DRC continues
* to be effective after a reconnect.
@@ -233,72 +235,58 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id,
svc_xprt_enqueue(&listen_xprt->sc_xprt);
}
-/*
- * Handles events generated on the listening endpoint. These events will be
- * either be incoming connect requests or adapter removal events.
+/**
+ * svc_rdma_listen_handler - Handle CM events generated on a listening endpoint
+ * @cma_id: the server's listener rdma_cm_id
+ * @event: details of the event
+ *
+ * Return values:
+ * %0: Do not destroy @cma_id
+ * %1: Destroy @cma_id (never returned here)
+ *
+ * NB: There is never a DEVICE_REMOVAL event for INADDR_ANY listeners.
*/
-static int rdma_listen_handler(struct rdma_cm_id *cma_id,
- struct rdma_cm_event *event)
+static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id,
+ struct rdma_cm_event *event)
{
- struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr;
-
- trace_svcrdma_cm_event(event, sap);
-
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST:
- dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
- "event = %s (%d)\n", cma_id, cma_id->context,
- rdma_event_msg(event->event), event->event);
handle_connect_req(cma_id, &event->param.conn);
break;
default:
- /* NB: No device removal upcall for INADDR_ANY listeners */
- dprintk("svcrdma: Unexpected event on listening endpoint %p, "
- "event = %s (%d)\n", cma_id,
- rdma_event_msg(event->event), event->event);
break;
}
-
return 0;
}
-static int rdma_cma_handler(struct rdma_cm_id *cma_id,
- struct rdma_cm_event *event)
+/**
+ * svc_rdma_cma_handler - Handle CM events on client connections
+ * @cma_id: the server's listener rdma_cm_id
+ * @event: details of the event
+ *
+ * Return values:
+ * %0: Do not destroy @cma_id
+ * %1: Destroy @cma_id (never returned here)
+ */
+static int svc_rdma_cma_handler(struct rdma_cm_id *cma_id,
+ struct rdma_cm_event *event)
{
- struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.dst_addr;
struct svcxprt_rdma *rdma = cma_id->context;
struct svc_xprt *xprt = &rdma->sc_xprt;
- trace_svcrdma_cm_event(event, sap);
-
switch (event->event) {
case RDMA_CM_EVENT_ESTABLISHED:
- /* Accept complete */
- svc_xprt_get(xprt);
- dprintk("svcrdma: Connection completed on DTO xprt=%p, "
- "cm_id=%p\n", xprt, cma_id);
clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
+
+ /* Handle any requests that were received while
+ * CONN_PENDING was set. */
svc_xprt_enqueue(xprt);
break;
case RDMA_CM_EVENT_DISCONNECTED:
- dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
- xprt, cma_id);
- set_bit(XPT_CLOSE, &xprt->xpt_flags);
- svc_xprt_enqueue(xprt);
- svc_xprt_put(xprt);
- break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
- dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
- "event = %s (%d)\n", cma_id, xprt,
- rdma_event_msg(event->event), event->event);
- set_bit(XPT_CLOSE, &xprt->xpt_flags);
- svc_xprt_enqueue(xprt);
- svc_xprt_put(xprt);
+ svc_xprt_deferred_close(xprt);
break;
default:
- dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
- "event = %s (%d)\n", cma_id,
- rdma_event_msg(event->event), event->event);
break;
}
return 0;
@@ -316,22 +304,18 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
struct svcxprt_rdma *cma_xprt;
int ret;
- dprintk("svcrdma: Creating RDMA listener\n");
- if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) {
- dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
+ if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6)
return ERR_PTR(-EAFNOSUPPORT);
- }
cma_xprt = svc_rdma_create_xprt(serv, net);
if (!cma_xprt)
return ERR_PTR(-ENOMEM);
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener");
- listen_id = rdma_create_id(net, rdma_listen_handler, cma_xprt,
+ listen_id = rdma_create_id(net, svc_rdma_listen_handler, cma_xprt,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(listen_id)) {
ret = PTR_ERR(listen_id);
- dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
goto err0;
}
@@ -340,23 +324,17 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
*/
#if IS_ENABLED(CONFIG_IPV6)
ret = rdma_set_afonly(listen_id, 1);
- if (ret) {
- dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret);
+ if (ret)
goto err1;
- }
#endif
ret = rdma_bind_addr(listen_id, sa);
- if (ret) {
- dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
+ if (ret)
goto err1;
- }
cma_xprt->sc_cm_id = listen_id;
ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
- if (ret) {
- dprintk("svcrdma: rdma_listen failed = %d\n", ret);
+ if (ret)
goto err1;
- }
/*
* We need to use the address from the cm_id in case the
@@ -412,9 +390,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
if (!newxprt)
return NULL;
- dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
- newxprt, newxprt->sc_cm_id);
-
dev = newxprt->sc_cm_id->device;
newxprt->sc_port_num = newxprt->sc_cm_id->port_num;
@@ -429,11 +404,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_req_size = svcrdma_max_req_size;
newxprt->sc_max_requests = svcrdma_max_requests;
newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
- rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests;
+ newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH;
+ rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests +
+ newxprt->sc_recv_batch;
if (rq_depth > dev->attrs.max_qp_wr) {
pr_warn("svcrdma: reducing receive depth to %d\n",
dev->attrs.max_qp_wr);
rq_depth = dev->attrs.max_qp_wr;
+ newxprt->sc_recv_batch = 1;
newxprt->sc_max_requests = rq_depth - 2;
newxprt->sc_max_bc_requests = 2;
}
@@ -450,21 +428,17 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_pd = ib_alloc_pd(dev, 0);
if (IS_ERR(newxprt->sc_pd)) {
- dprintk("svcrdma: error creating PD for connect request\n");
+ trace_svcrdma_pd_err(newxprt, PTR_ERR(newxprt->sc_pd));
goto errout;
}
newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
IB_POLL_WORKQUEUE);
- if (IS_ERR(newxprt->sc_sq_cq)) {
- dprintk("svcrdma: error creating SQ CQ for connect request\n");
+ if (IS_ERR(newxprt->sc_sq_cq))
goto errout;
- }
newxprt->sc_rq_cq =
ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE);
- if (IS_ERR(newxprt->sc_rq_cq)) {
- dprintk("svcrdma: error creating RQ CQ for connect request\n");
+ if (IS_ERR(newxprt->sc_rq_cq))
goto errout;
- }
memset(&qp_attr, 0, sizeof qp_attr);
qp_attr.event_handler = qp_event_handler;
@@ -488,7 +462,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
if (ret) {
- dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
+ trace_svcrdma_qp_err(newxprt, ret);
goto errout;
}
newxprt->sc_qp = newxprt->sc_cm_id->qp;
@@ -496,15 +470,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
newxprt->sc_snd_w_inv = false;
if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) &&
- !rdma_ib_or_roce(dev, newxprt->sc_port_num))
+ !rdma_ib_or_roce(dev, newxprt->sc_port_num)) {
+ trace_svcrdma_fabric_err(newxprt, -EINVAL);
goto errout;
+ }
if (!svc_rdma_post_recvs(newxprt))
goto errout;
- /* Swap out the handler */
- newxprt->sc_cm_id->event_handler = rdma_cma_handler;
-
/* Construct RDMA-CM private message */
pmsg.cp_magic = rpcrdma_cmp_magic;
pmsg.cp_version = RPCRDMA_CMP_VERSION;
@@ -519,15 +492,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
conn_param.initiator_depth = min_t(int, newxprt->sc_ord,
dev->attrs.max_qp_init_rd_atom);
if (!conn_param.initiator_depth) {
- dprintk("svcrdma: invalid ORD setting\n");
ret = -EINVAL;
+ trace_svcrdma_initdepth_err(newxprt, ret);
goto errout;
}
conn_param.private_data = &pmsg;
conn_param.private_data_len = sizeof(pmsg);
+ rdma_lock_handler(newxprt->sc_cm_id);
+ newxprt->sc_cm_id->event_handler = svc_rdma_cma_handler;
ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
- if (ret)
+ rdma_unlock_handler(newxprt->sc_cm_id);
+ if (ret) {
+ trace_svcrdma_accept_err(newxprt, ret);
goto errout;
+ }
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
dprintk("svcrdma: new connection %p accepted:\n", newxprt);
@@ -542,12 +520,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dprintk(" ord : %d\n", conn_param.initiator_depth);
#endif
- trace_svcrdma_xprt_accept(&newxprt->sc_xprt);
return &newxprt->sc_xprt;
errout:
- dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
- trace_svcrdma_xprt_fail(&newxprt->sc_xprt);
/* Take a reference in case the DTO handler runs */
svc_xprt_get(&newxprt->sc_xprt);
if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
@@ -558,28 +533,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
return NULL;
}
-static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
-{
-}
-
-/*
- * When connected, an svc_xprt has at least two references:
- *
- * - A reference held by the cm_id between the ESTABLISHED and
- * DISCONNECTED events. If the remote peer disconnected first, this
- * reference could be gone.
- *
- * - A reference held by the svc_recv code that called this function
- * as part of close processing.
- *
- * At a minimum one references should still be held.
- */
static void svc_rdma_detach(struct svc_xprt *xprt)
{
struct svcxprt_rdma *rdma =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
- /* Disconnect and flush posted WQE */
rdma_disconnect(rdma->sc_cm_id);
}
@@ -587,21 +545,13 @@ static void __svc_rdma_free(struct work_struct *work)
{
struct svcxprt_rdma *rdma =
container_of(work, struct svcxprt_rdma, sc_work);
- struct svc_xprt *xprt = &rdma->sc_xprt;
-
- trace_svcrdma_xprt_free(xprt);
+ /* This blocks until the Completion Queues are empty */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
ib_drain_qp(rdma->sc_qp);
svc_rdma_flush_recv_queues(rdma);
- /* Final put of backchannel client transport */
- if (xprt->xpt_bc_xprt) {
- xprt_put(xprt->xpt_bc_xprt);
- xprt->xpt_bc_xprt = NULL;
- }
-
svc_rdma_destroy_rw_ctxts(rdma);
svc_rdma_send_ctxts_destroy(rdma);
svc_rdma_recv_ctxts_destroy(rdma);
@@ -652,7 +602,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
static void svc_rdma_secure_port(struct svc_rqst *rqstp)
{
- set_bit(RQ_SECURE, &rqstp->rq_flags);
+ __set_bit(RQ_SECURE, &rqstp->rq_flags);
}
static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 3cfeba68ee9a..10bb2b929c6d 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -60,19 +60,16 @@
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_TRANS
-#endif
-
/*
* tunables
*/
-unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
+static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR;
int xprt_rdma_pad_optimize;
+static struct xprt_class xprt_rdma;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -238,26 +235,35 @@ xprt_rdma_connect_worker(struct work_struct *work)
struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
rx_connect_worker.work);
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ unsigned int pflags = current->flags;
int rc;
- rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ if (atomic_read(&xprt->swapper))
+ current->flags |= PF_MEMALLOC;
+ rc = rpcrdma_xprt_connect(r_xprt);
xprt_clear_connecting(xprt);
- if (r_xprt->rx_ep.rep_connected > 0) {
+ if (!rc) {
+ xprt->connect_cookie++;
xprt->stat.connect_count++;
xprt->stat.connect_time += (long)jiffies -
xprt->stat.connect_start;
xprt_set_connected(xprt);
rc = -EAGAIN;
- }
+ } else
+ rpcrdma_xprt_disconnect(r_xprt);
+ xprt_unlock_connect(xprt, r_xprt);
xprt_wake_pending_tasks(xprt, rc);
+ current_restore_flags(pflags, PF_MEMALLOC);
}
/**
* xprt_rdma_inject_disconnect - inject a connection fault
* @xprt: transport context
*
- * If @xprt is connected, disconnect it to simulate spurious connection
- * loss.
+ * If @xprt is connected, disconnect it to simulate spurious
+ * connection loss. Caller must hold @xprt's send lock to
+ * ensure that data structures and hardware resources are
+ * stable during the rdma_disconnect() call.
*/
static void
xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
@@ -265,7 +271,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
trace_xprtrdma_op_inject_dsc(r_xprt);
- rdma_disconnect(r_xprt->rx_ia.ri_id);
+ rdma_disconnect(r_xprt->rx_ep->re_id);
}
/**
@@ -280,13 +286,10 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- trace_xprtrdma_op_destroy(r_xprt);
-
cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
- rpcrdma_ep_destroy(r_xprt);
+ rpcrdma_xprt_disconnect(r_xprt);
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
- rpcrdma_ia_close(&r_xprt->rx_ia);
xprt_rdma_free_addresses(xprt);
xprt_free(xprt);
@@ -316,10 +319,15 @@ xprt_setup_rdma(struct xprt_create *args)
if (args->addrlen > sizeof(xprt->addr))
return ERR_PTR(-EBADF);
+ if (!try_module_get(THIS_MODULE))
+ return ERR_PTR(-EIO);
+
xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0,
xprt_rdma_slot_table_entries);
- if (!xprt)
+ if (!xprt) {
+ module_put(THIS_MODULE);
return ERR_PTR(-ENOMEM);
+ }
xprt->timeout = &xprt_rdma_default_timeout;
xprt->connect_timeout = xprt->timeout->to_initval;
@@ -339,6 +347,7 @@ xprt_setup_rdma(struct xprt_create *args)
/* Ensure xprt->addr holds valid server TCP (not RDMA)
* address, for any side protocols which peek at it */
xprt->prot = IPPROTO_TCP;
+ xprt->xprt_class = &xprt_rdma;
xprt->addrlen = args->addrlen;
memcpy(&xprt->addr, sap, xprt->addrlen);
@@ -347,43 +356,20 @@ xprt_setup_rdma(struct xprt_create *args)
xprt_rdma_format_addresses(xprt, sap);
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt);
- if (rc)
- goto out1;
-
- rc = rpcrdma_ep_create(new_xprt);
- if (rc)
- goto out2;
-
rc = rpcrdma_buffer_create(new_xprt);
- if (rc)
- goto out3;
-
- if (!try_module_get(THIS_MODULE))
- goto out4;
+ if (rc) {
+ xprt_rdma_free_addresses(xprt);
+ xprt_free(xprt);
+ module_put(THIS_MODULE);
+ return ERR_PTR(rc);
+ }
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
+
xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
- dprintk("RPC: %s: %s:%s\n", __func__,
- xprt->address_strings[RPC_DISPLAY_ADDR],
- xprt->address_strings[RPC_DISPLAY_PORT]);
- trace_xprtrdma_create(new_xprt);
return xprt;
-
-out4:
- rpcrdma_buffer_destroy(&new_xprt->rx_buf);
- rc = -ENODEV;
-out3:
- rpcrdma_ep_destroy(new_xprt);
-out2:
- rpcrdma_ia_close(&new_xprt->rx_ia);
-out1:
- trace_xprtrdma_op_destroy(new_xprt);
- xprt_rdma_free_addresses(xprt);
- xprt_free(xprt);
- return ERR_PTR(rc);
}
/**
@@ -398,26 +384,9 @@ out1:
void xprt_rdma_close(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- might_sleep();
- trace_xprtrdma_op_close(r_xprt);
+ rpcrdma_xprt_disconnect(r_xprt);
- /* Prevent marshaling and sending of new requests */
- xprt_clear_connected(xprt);
-
- if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) {
- rpcrdma_ia_remove(ia);
- goto out;
- }
-
- if (ep->rep_connected == -ENODEV)
- return;
- rpcrdma_ep_disconnect(ep, ia);
-
-out:
xprt->reestablish_timeout = 0;
++xprt->connect_cookie;
xprt_disconnect_done(xprt);
@@ -445,9 +414,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
snprintf(buf, sizeof(buf), "%4hx", port);
xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
-
- trace_xprtrdma_op_setport(container_of(xprt, struct rpcrdma_xprt,
- rx_xprt));
}
/**
@@ -517,16 +483,18 @@ static void
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned long delay;
+ WARN_ON_ONCE(!xprt_lock_connect(xprt, task, r_xprt));
+
delay = 0;
- if (r_xprt->rx_ep.rep_connected != 0) {
+ if (ep && ep->re_connect_status != 0) {
delay = xprt_reconnect_delay(xprt);
xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
}
trace_xprtrdma_op_connect(r_xprt, delay);
- queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker,
- delay);
+ queue_delayed_work(system_long_wq, &r_xprt->rx_connect_worker, delay);
}
/**
@@ -552,9 +520,8 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
return;
out_sleep:
- set_bit(XPRT_CONGESTED, &xprt->state);
- rpc_sleep_on(&xprt->backlog, task, NULL);
- task->tk_status = -EAGAIN;
+ task->tk_status = -ENOMEM;
+ xprt_add_backlog(xprt, task);
}
/**
@@ -569,10 +536,11 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst)
struct rpcrdma_xprt *r_xprt =
container_of(xprt, struct rpcrdma_xprt, rx_xprt);
- memset(rqst, 0, sizeof(*rqst));
- rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
- if (unlikely(!rpc_wake_up_next(&xprt->backlog)))
- clear_bit(XPRT_CONGESTED, &xprt->state);
+ rpcrdma_reply_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
+ if (!xprt_wake_up_backlog(xprt, rqst)) {
+ memset(rqst, 0, sizeof(*rqst));
+ rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
+ }
}
static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt,
@@ -602,11 +570,7 @@ xprt_rdma_allocate(struct rpc_task *task)
struct rpc_rqst *rqst = task->tk_rqstp;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- gfp_t flags;
-
- flags = RPCRDMA_DEF_GFP;
- if (RPC_IS_SWAPPER(task))
- flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
+ gfp_t flags = rpc_task_gfp_mask();
if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize,
flags))
@@ -617,11 +581,9 @@ xprt_rdma_allocate(struct rpc_task *task)
rqst->rq_buffer = rdmab_data(req->rl_sendbuf);
rqst->rq_rbuffer = rdmab_data(req->rl_recvbuf);
- trace_xprtrdma_op_allocate(task, req);
return 0;
out_fail:
- trace_xprtrdma_op_allocate(task, NULL);
return -ENOMEM;
}
@@ -635,13 +597,12 @@ static void
xprt_rdma_free(struct rpc_task *task)
{
struct rpc_rqst *rqst = task->tk_rqstp;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- trace_xprtrdma_op_free(task, req);
-
- if (!list_empty(&req->rl_registered))
- frwr_unmap_sync(r_xprt, req);
+ if (unlikely(!list_empty(&req->rl_registered))) {
+ trace_xprtrdma_mrs_zap(task);
+ frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req);
+ }
/* XXX: If the RPC is completing because of a signal and
* not because a reply was received, we ought to ensure
@@ -694,7 +655,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst)
goto drop_connection;
rqst->rq_xtime = ktime_get();
- if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ if (frwr_send(r_xprt, req))
goto drop_connection;
rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
@@ -806,6 +767,7 @@ static struct xprt_class xprt_rdma = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_RDMA,
.setup = xprt_setup_rdma,
+ .netid = { "rdma", "rdma6", "" },
};
void xprt_rdma_cleanup(void)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 353f61ac8d51..44b87e4274b4 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -63,17 +63,6 @@
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
-/*
- * Globals/Macros
- */
-
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_TRANS
-#endif
-
-/*
- * internal functions
- */
static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
@@ -84,9 +73,10 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_ep_get(struct rpcrdma_ep *ep);
+static int rpcrdma_ep_put(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
-rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
- gfp_t flags);
+rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction);
static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb);
static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
@@ -96,72 +86,86 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
*/
static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct rdma_cm_id *id = ep->re_id;
+
+ /* Wait for rpcrdma_post_recvs() to leave its critical
+ * section.
+ */
+ if (atomic_inc_return(&ep->re_receiving) > 1)
+ wait_for_completion(&ep->re_done);
/* Flush Receives, then wait for deferred Reply work
* to complete.
*/
- ib_drain_rq(ia->ri_id->qp);
+ ib_drain_rq(id->qp);
/* Deferred Reply processing might have scheduled
* local invalidations.
*/
- ib_drain_sq(ia->ri_id->qp);
+ ib_drain_sq(id->qp);
+
+ rpcrdma_ep_put(ep);
+}
+
+/* Ensure xprt_force_disconnect() is invoked exactly once when a
+ * connection is closed or lost. (The important thing is it needs
+ * to be invoked "at least" once).
+ */
+void rpcrdma_force_disconnect(struct rpcrdma_ep *ep)
+{
+ if (atomic_add_unless(&ep->re_force_disconnect, 1, 1))
+ xprt_force_disconnect(ep->re_xprt);
}
/**
- * rpcrdma_qp_event_handler - Handle one QP event (error notification)
- * @event: details of the event
- * @context: ep that owns QP where event occurred
+ * rpcrdma_flush_disconnect - Disconnect on flushed completion
+ * @r_xprt: transport to disconnect
+ * @wc: work completion entry
*
- * Called from the RDMA provider (device driver) possibly in an interrupt
- * context.
+ * Must be called in process context.
*/
-static void
-rpcrdma_qp_event_handler(struct ib_event *event, void *context)
+void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc)
{
- struct rpcrdma_ep *ep = context;
- struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
- rx_ep);
-
- trace_xprtrdma_qp_event(r_xprt, event);
+ if (wc->status != IB_WC_SUCCESS)
+ rpcrdma_force_disconnect(r_xprt->rx_ep);
}
/**
* rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
* @cq: completion queue
- * @wc: completed WR
+ * @wc: WCE for a completed Send WR
*
*/
-static void
-rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_sendctx *sc =
container_of(cqe, struct rpcrdma_sendctx, sc_cqe);
+ struct rpcrdma_xprt *r_xprt = cq->cq_context;
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_send(sc, wc);
- rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc);
+ trace_xprtrdma_wc_send(wc, &sc->sc_cid);
+ rpcrdma_sendctx_put_locked(r_xprt, sc);
+ rpcrdma_flush_disconnect(r_xprt, wc);
}
/**
* rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed Receive WR
*
*/
-static void
-rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
rr_cqe);
- struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+ struct rpcrdma_xprt *r_xprt = cq->cq_context;
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_receive(wc);
- --r_xprt->rx_ep.rep_receive_count;
+ trace_xprtrdma_wc_receive(wc, &rep->rr_cid);
+ --r_xprt->rx_ep->re_receive_count;
if (wc->status != IB_WC_SUCCESS)
goto out_flushed;
@@ -178,35 +182,33 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
return;
out_flushed:
- rpcrdma_rep_destroy(rep);
+ rpcrdma_flush_disconnect(r_xprt, wc);
+ rpcrdma_rep_put(&r_xprt->rx_buf, rep);
}
-static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
+static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
struct rdma_conn_param *param)
{
const struct rpcrdma_connect_private *pmsg = param->private_data;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
unsigned int rsize, wsize;
/* Default settings for RPC-over-RDMA Version One */
- r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
if (pmsg &&
pmsg->cp_magic == rpcrdma_cmp_magic &&
pmsg->cp_version == RPCRDMA_CMP_VERSION) {
- r_xprt->rx_ia.ri_implicit_roundup = true;
rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
}
- if (rsize < ep->rep_inline_recv)
- ep->rep_inline_recv = rsize;
- if (wsize < ep->rep_inline_send)
- ep->rep_inline_send = wsize;
+ if (rsize < ep->re_inline_recv)
+ ep->re_inline_recv = rsize;
+ if (wsize < ep->re_inline_send)
+ ep->re_inline_send = wsize;
- rpcrdma_set_max_header_sizes(r_xprt);
+ rpcrdma_set_max_header_sizes(ep);
}
/**
@@ -220,116 +222,100 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
static int
rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
- struct rpcrdma_xprt *r_xprt = id->context;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
+ struct rpcrdma_ep *ep = id->context;
might_sleep();
- trace_xprtrdma_cm_event(r_xprt, event);
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
case RDMA_CM_EVENT_ROUTE_RESOLVED:
- ia->ri_async_rc = 0;
- complete(&ia->ri_done);
+ ep->re_async_rc = 0;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_ADDR_ERROR:
- ia->ri_async_rc = -EPROTO;
- complete(&ia->ri_done);
+ ep->re_async_rc = -EPROTO;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_ROUTE_ERROR:
- ia->ri_async_rc = -ENETUNREACH;
- complete(&ia->ri_done);
+ ep->re_async_rc = -ENETUNREACH;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- pr_info("rpcrdma: removing device %s for %s:%s\n",
- ia->ri_id->device->name,
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
-#endif
- init_completion(&ia->ri_remove_done);
- set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
- ep->rep_connected = -ENODEV;
- xprt_force_disconnect(xprt);
- wait_for_completion(&ia->ri_remove_done);
-
- ia->ri_id = NULL;
- /* Return 1 to ensure the core destroys the id. */
- return 1;
+ pr_info("rpcrdma: removing device %s for %pISpc\n",
+ ep->re_id->device->name, sap);
+ fallthrough;
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ ep->re_connect_status = -ENODEV;
+ goto disconnected;
case RDMA_CM_EVENT_ESTABLISHED:
- ++xprt->connect_cookie;
- ep->rep_connected = 1;
- rpcrdma_update_cm_private(r_xprt, &event->param.conn);
- trace_xprtrdma_inline_thresh(r_xprt);
- wake_up_all(&ep->rep_connect_wait);
+ rpcrdma_ep_get(ep);
+ ep->re_connect_status = 1;
+ rpcrdma_update_cm_private(ep, &event->param.conn);
+ trace_xprtrdma_inline_thresh(ep);
+ wake_up_all(&ep->re_connect_wait);
break;
case RDMA_CM_EVENT_CONNECT_ERROR:
- ep->rep_connected = -ENOTCONN;
- goto disconnected;
+ ep->re_connect_status = -ENOTCONN;
+ goto wake_connect_worker;
case RDMA_CM_EVENT_UNREACHABLE:
- ep->rep_connected = -ENETUNREACH;
- goto disconnected;
+ ep->re_connect_status = -ENETUNREACH;
+ goto wake_connect_worker;
case RDMA_CM_EVENT_REJECTED:
- dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
- rdma_reject_msg(id, event->status));
- ep->rep_connected = -ECONNREFUSED;
+ ep->re_connect_status = -ECONNREFUSED;
if (event->status == IB_CM_REJ_STALE_CONN)
- ep->rep_connected = -EAGAIN;
- goto disconnected;
+ ep->re_connect_status = -ENOTCONN;
+wake_connect_worker:
+ wake_up_all(&ep->re_connect_wait);
+ return 0;
case RDMA_CM_EVENT_DISCONNECTED:
- ep->rep_connected = -ECONNABORTED;
+ ep->re_connect_status = -ECONNABORTED;
disconnected:
- xprt_force_disconnect(xprt);
- wake_up_all(&ep->rep_connect_wait);
- break;
+ rpcrdma_force_disconnect(ep);
+ return rpcrdma_ep_put(ep);
default:
break;
}
- dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__,
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
- ia->ri_id->device->name, rdma_event_msg(event->event));
return 0;
}
-static struct rdma_cm_id *
-rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
+static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_ep *ep)
{
unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
struct rdma_cm_id *id;
int rc;
- init_completion(&ia->ri_done);
+ init_completion(&ep->re_done);
- id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
- xprt, RDMA_PS_TCP, IB_QPT_RC);
+ id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(id))
return id;
- ia->ri_async_rc = -ETIMEDOUT;
- rc = rdma_resolve_addr(id, NULL,
- (struct sockaddr *)&xprt->rx_xprt.addr,
+ ep->re_async_rc = -ETIMEDOUT;
+ rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr,
RDMA_RESOLVE_TIMEOUT);
if (rc)
goto out;
- rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
if (rc < 0)
goto out;
- rc = ia->ri_async_rc;
+ rc = ep->re_async_rc;
if (rc)
goto out;
- ia->ri_async_rc = -ETIMEDOUT;
+ ep->re_async_rc = -ETIMEDOUT;
rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
if (rc)
goto out;
- rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
if (rc < 0)
goto out;
- rc = ia->ri_async_rc;
+ rc = ep->re_async_rc;
if (rc)
goto out;
@@ -340,423 +326,248 @@ out:
return ERR_PTR(rc);
}
-/*
- * Exported functions.
- */
-
-/**
- * rpcrdma_ia_open - Open and initialize an Interface Adapter.
- * @xprt: transport with IA to (re)initialize
- *
- * Returns 0 on success, negative errno if an appropriate
- * Interface Adapter could not be found and opened.
- */
-int
-rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
+static void rpcrdma_ep_destroy(struct kref *kref)
{
- struct rpcrdma_ia *ia = &xprt->rx_ia;
- int rc;
+ struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
- ia->ri_id = rpcrdma_create_id(xprt, ia);
- if (IS_ERR(ia->ri_id)) {
- rc = PTR_ERR(ia->ri_id);
- goto out_err;
+ if (ep->re_id->qp) {
+ rdma_destroy_qp(ep->re_id);
+ ep->re_id->qp = NULL;
}
- ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0);
- if (IS_ERR(ia->ri_pd)) {
- rc = PTR_ERR(ia->ri_pd);
- pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
- goto out_err;
- }
+ if (ep->re_attr.recv_cq)
+ ib_free_cq(ep->re_attr.recv_cq);
+ ep->re_attr.recv_cq = NULL;
+ if (ep->re_attr.send_cq)
+ ib_free_cq(ep->re_attr.send_cq);
+ ep->re_attr.send_cq = NULL;
- return 0;
+ if (ep->re_pd)
+ ib_dealloc_pd(ep->re_pd);
+ ep->re_pd = NULL;
-out_err:
- rpcrdma_ia_close(ia);
- return rc;
+ kfree(ep);
+ module_put(THIS_MODULE);
}
-/**
- * rpcrdma_ia_remove - Handle device driver unload
- * @ia: interface adapter being removed
- *
- * Divest transport H/W resources associated with this adapter,
- * but allow it to be restored later.
- *
- * Caller must hold the transport send lock.
- */
-void
-rpcrdma_ia_remove(struct rpcrdma_ia *ia)
+static noinline void rpcrdma_ep_get(struct rpcrdma_ep *ep)
{
- struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
- rx_ia);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
- /* This is similar to rpcrdma_ep_destroy, but:
- * - Don't cancel the connect worker.
- * - Don't call rpcrdma_ep_disconnect, which waits
- * for another conn upcall, which will deadlock.
- * - rdma_disconnect is unneeded, the underlying
- * connection is already gone.
- */
- if (ia->ri_id->qp) {
- rpcrdma_xprt_drain(r_xprt);
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
- ib_free_cq(ep->rep_attr.recv_cq);
- ep->rep_attr.recv_cq = NULL;
- ib_free_cq(ep->rep_attr.send_cq);
- ep->rep_attr.send_cq = NULL;
-
- /* The ULP is responsible for ensuring all DMA
- * mappings and MRs are gone.
- */
- rpcrdma_reps_unmap(r_xprt);
- rpcrdma_reqs_reset(r_xprt);
- rpcrdma_mrs_destroy(r_xprt);
- rpcrdma_sendctxs_destroy(r_xprt);
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
-
- /* Allow waiters to continue */
- complete(&ia->ri_remove_done);
-
- trace_xprtrdma_remove(r_xprt);
+ kref_get(&ep->re_kref);
}
-/**
- * rpcrdma_ia_close - Clean up/close an IA.
- * @ia: interface adapter to close
- *
+/* Returns:
+ * %0 if @ep still has a positive kref count, or
+ * %1 if @ep was destroyed successfully.
*/
-void
-rpcrdma_ia_close(struct rpcrdma_ia *ia)
+static noinline int rpcrdma_ep_put(struct rpcrdma_ep *ep)
{
- if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
- if (ia->ri_id->qp)
- rdma_destroy_qp(ia->ri_id);
- rdma_destroy_id(ia->ri_id);
- }
- ia->ri_id = NULL;
-
- /* If the pd is still busy, xprtrdma missed freeing a resource */
- if (ia->ri_pd && !IS_ERR(ia->ri_pd))
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
+ return kref_put(&ep->re_kref, rpcrdma_ep_destroy);
}
-/**
- * rpcrdma_ep_create - Create unconnected endpoint
- * @r_xprt: transport to instantiate
- *
- * Returns zero on success, or a negative errno.
- */
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
+static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
- struct ib_cq *sendcq, *recvcq;
+ struct rpcrdma_connect_private *pmsg;
+ struct ib_device *device;
+ struct rdma_cm_id *id;
+ struct rpcrdma_ep *ep;
int rc;
- ep->rep_max_requests = r_xprt->rx_xprt.max_reqs;
- ep->rep_inline_send = xprt_rdma_max_inline_write;
- ep->rep_inline_recv = xprt_rdma_max_inline_read;
+ ep = kzalloc(sizeof(*ep), XPRTRDMA_GFP_FLAGS);
+ if (!ep)
+ return -ENOTCONN;
+ ep->re_xprt = &r_xprt->rx_xprt;
+ kref_init(&ep->re_kref);
- rc = frwr_query_device(r_xprt, ia->ri_id->device);
- if (rc)
- return rc;
- r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests);
-
- ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
- ep->rep_attr.qp_context = ep;
- ep->rep_attr.srq = NULL;
- ep->rep_attr.cap.max_inline_data = 0;
- ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
- ep->rep_attr.qp_type = IB_QPT_RC;
- ep->rep_attr.port_num = ~0;
-
- dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
- "iovs: send %d recv %d\n",
- __func__,
- ep->rep_attr.cap.max_send_wr,
- ep->rep_attr.cap.max_recv_wr,
- ep->rep_attr.cap.max_send_sge,
- ep->rep_attr.cap.max_recv_sge);
-
- ep->rep_send_batch = ep->rep_max_requests >> 3;
- ep->rep_send_count = ep->rep_send_batch;
- init_waitqueue_head(&ep->rep_connect_wait);
- ep->rep_receive_count = 0;
-
- sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt,
- ep->rep_attr.cap.max_send_wr + 1,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(sendcq)) {
- rc = PTR_ERR(sendcq);
- goto out1;
+ id = rpcrdma_create_id(r_xprt, ep);
+ if (IS_ERR(id)) {
+ kfree(ep);
+ return PTR_ERR(id);
}
+ __module_get(THIS_MODULE);
+ device = id->device;
+ ep->re_id = id;
+ reinit_completion(&ep->re_done);
+
+ ep->re_max_requests = r_xprt->rx_xprt.max_reqs;
+ ep->re_inline_send = xprt_rdma_max_inline_write;
+ ep->re_inline_recv = xprt_rdma_max_inline_read;
+ rc = frwr_query_device(ep, device);
+ if (rc)
+ goto out_destroy;
- recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
- ep->rep_attr.cap.max_recv_wr + 1,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(recvcq)) {
- rc = PTR_ERR(recvcq);
- goto out2;
+ r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests);
+
+ ep->re_attr.srq = NULL;
+ ep->re_attr.cap.max_inline_data = 0;
+ ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ ep->re_attr.qp_type = IB_QPT_RC;
+ ep->re_attr.port_num = ~0;
+
+ ep->re_send_batch = ep->re_max_requests >> 3;
+ ep->re_send_count = ep->re_send_batch;
+ init_waitqueue_head(&ep->re_connect_wait);
+
+ ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt,
+ ep->re_attr.cap.max_send_wr,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(ep->re_attr.send_cq)) {
+ rc = PTR_ERR(ep->re_attr.send_cq);
+ ep->re_attr.send_cq = NULL;
+ goto out_destroy;
}
- ep->rep_attr.send_cq = sendcq;
- ep->rep_attr.recv_cq = recvcq;
+ ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt,
+ ep->re_attr.cap.max_recv_wr,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(ep->re_attr.recv_cq)) {
+ rc = PTR_ERR(ep->re_attr.recv_cq);
+ ep->re_attr.recv_cq = NULL;
+ goto out_destroy;
+ }
+ ep->re_receive_count = 0;
/* Initialize cma parameters */
- memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
+ memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma));
/* Prepare RDMA-CM private message */
+ pmsg = &ep->re_cm_private;
pmsg->cp_magic = rpcrdma_cmp_magic;
pmsg->cp_version = RPCRDMA_CMP_VERSION;
pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK;
- pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send);
- pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv);
- ep->rep_remote_cma.private_data = pmsg;
- ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
+ pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send);
+ pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv);
+ ep->re_remote_cma.private_data = pmsg;
+ ep->re_remote_cma.private_data_len = sizeof(*pmsg);
/* Client offers RDMA Read but does not initiate */
- ep->rep_remote_cma.initiator_depth = 0;
- ep->rep_remote_cma.responder_resources =
- min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom);
+ ep->re_remote_cma.initiator_depth = 0;
+ ep->re_remote_cma.responder_resources =
+ min_t(int, U8_MAX, device->attrs.max_qp_rd_atom);
/* Limit transport retries so client can detect server
* GID changes quickly. RPC layer handles re-establishing
* transport connection and retransmission.
*/
- ep->rep_remote_cma.retry_count = 6;
+ ep->re_remote_cma.retry_count = 6;
/* RPC-over-RDMA handles its own flow control. In addition,
* make all RNR NAKs visible so we know that RPC-over-RDMA
* flow control is working correctly (no NAKs should be seen).
*/
- ep->rep_remote_cma.flow_control = 0;
- ep->rep_remote_cma.rnr_retry_count = 0;
-
- return 0;
-
-out2:
- ib_free_cq(sendcq);
-out1:
- return rc;
-}
-
-/**
- * rpcrdma_ep_destroy - Disconnect and destroy endpoint.
- * @r_xprt: transport instance to shut down
- *
- */
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- if (ia->ri_id && ia->ri_id->qp) {
- rpcrdma_ep_disconnect(ep, ia);
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
-
- if (ep->rep_attr.recv_cq)
- ib_free_cq(ep->rep_attr.recv_cq);
- if (ep->rep_attr.send_cq)
- ib_free_cq(ep->rep_attr.send_cq);
-}
-
-/* Re-establish a connection after a device removal event.
- * Unlike a normal reconnection, a fresh PD and a new set
- * of MRs and buffers is needed.
- */
-static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
- struct ib_qp_init_attr *qp_init_attr)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- int rc, err;
-
- trace_xprtrdma_reinsert(r_xprt);
-
- rc = -EHOSTUNREACH;
- if (rpcrdma_ia_open(r_xprt))
- goto out1;
-
- rc = -ENOMEM;
- err = rpcrdma_ep_create(r_xprt);
- if (err) {
- pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
- goto out2;
- }
- memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr));
-
- rc = -ENETUNREACH;
- err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr);
- if (err) {
- pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
- goto out3;
- }
- return 0;
-
-out3:
- rpcrdma_ep_destroy(r_xprt);
-out2:
- rpcrdma_ia_close(ia);
-out1:
- return rc;
-}
-
-static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt,
- struct ib_qp_init_attr *qp_init_attr)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rdma_cm_id *id, *old;
- int err, rc;
-
- rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia);
+ ep->re_remote_cma.flow_control = 0;
+ ep->re_remote_cma.rnr_retry_count = 0;
- rc = -EHOSTUNREACH;
- id = rpcrdma_create_id(r_xprt, ia);
- if (IS_ERR(id))
- goto out;
-
- /* As long as the new ID points to the same device as the
- * old ID, we can reuse the transport's existing PD and all
- * previously allocated MRs. Also, the same device means
- * the transport's previous DMA mappings are still valid.
- *
- * This is a sanity check only. There should be no way these
- * point to two different devices here.
- */
- old = id;
- rc = -ENETUNREACH;
- if (ia->ri_id->device != id->device) {
- pr_err("rpcrdma: can't reconnect on different device!\n");
+ ep->re_pd = ib_alloc_pd(device, 0);
+ if (IS_ERR(ep->re_pd)) {
+ rc = PTR_ERR(ep->re_pd);
+ ep->re_pd = NULL;
goto out_destroy;
}
- err = rdma_create_qp(id, ia->ri_pd, qp_init_attr);
- if (err)
+ rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr);
+ if (rc)
goto out_destroy;
- /* Atomically replace the transport's ID and QP. */
- rc = 0;
- old = ia->ri_id;
- ia->ri_id = id;
- rdma_destroy_qp(old);
+ r_xprt->rx_ep = ep;
+ return 0;
out_destroy:
- rdma_destroy_id(old);
-out:
+ rpcrdma_ep_put(ep);
+ rdma_destroy_id(id);
return rc;
}
-/*
- * Connect unconnected endpoint.
+/**
+ * rpcrdma_xprt_connect - Connect an unconnected transport
+ * @r_xprt: controlling transport instance
+ *
+ * Returns 0 on success or a negative errno.
*/
-int
-rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
- rx_ia);
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct ib_qp_init_attr qp_init_attr;
+ struct rpcrdma_ep *ep;
int rc;
-retry:
- memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr));
- switch (ep->rep_connected) {
- case 0:
- rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr);
- if (rc) {
- rc = -ENETUNREACH;
- goto out_noupdate;
- }
- break;
- case -ENODEV:
- rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr);
- if (rc)
- goto out_noupdate;
- break;
- default:
- rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr);
- if (rc)
- goto out;
- }
+ rc = rpcrdma_ep_create(r_xprt);
+ if (rc)
+ return rc;
+ ep = r_xprt->rx_ep;
- ep->rep_connected = 0;
xprt_clear_connected(xprt);
-
rpcrdma_reset_cwnd(r_xprt);
- rpcrdma_post_recvs(r_xprt, true);
- rc = rpcrdma_sendctxs_create(r_xprt);
- if (rc)
- goto out;
+ /* Bump the ep's reference count while there are
+ * outstanding Receives.
+ */
+ rpcrdma_ep_get(ep);
+ rpcrdma_post_recvs(r_xprt, 1, true);
- rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+ rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
if (rc)
goto out;
if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
- wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
- if (ep->rep_connected <= 0) {
- if (ep->rep_connected == -EAGAIN)
- goto retry;
- rc = ep->rep_connected;
+ wait_event_interruptible(ep->re_connect_wait,
+ ep->re_connect_status != 0);
+ if (ep->re_connect_status <= 0) {
+ rc = ep->re_connect_status;
+ goto out;
+ }
+
+ rc = rpcrdma_sendctxs_create(r_xprt);
+ if (rc) {
+ rc = -ENOTCONN;
goto out;
}
rc = rpcrdma_reqs_setup(r_xprt);
if (rc) {
- rpcrdma_ep_disconnect(ep, ia);
+ rc = -ENOTCONN;
goto out;
}
rpcrdma_mrs_create(r_xprt);
+ frwr_wp_create(r_xprt);
out:
- if (rc)
- ep->rep_connected = rc;
-
-out_noupdate:
trace_xprtrdma_connect(r_xprt, rc);
return rc;
}
/**
- * rpcrdma_ep_disconnect - Disconnect underlying transport
- * @ep: endpoint to disconnect
- * @ia: associated interface adapter
+ * rpcrdma_xprt_disconnect - Disconnect underlying transport
+ * @r_xprt: controlling transport instance
*
* Caller serializes. Either the transport send lock is held,
* or we're being called to destroy the transport.
+ *
+ * On return, @r_xprt is completely divested of all hardware
+ * resources and prepared for the next ->connect operation.
*/
-void
-rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
- rx_ep);
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct rdma_cm_id *id;
int rc;
- /* returns without wait if ID is not connected */
- rc = rdma_disconnect(ia->ri_id);
- if (!rc)
- wait_event_interruptible(ep->rep_connect_wait,
- ep->rep_connected != 1);
- else
- ep->rep_connected = rc;
+ if (!ep)
+ return;
+
+ id = ep->re_id;
+ rc = rdma_disconnect(id);
trace_xprtrdma_disconnect(r_xprt, rc);
rpcrdma_xprt_drain(r_xprt);
+ rpcrdma_reps_unmap(r_xprt);
rpcrdma_reqs_reset(r_xprt);
rpcrdma_mrs_destroy(r_xprt);
rpcrdma_sendctxs_destroy(r_xprt);
+
+ if (rpcrdma_ep_put(ep))
+ rdma_destroy_id(id);
+
+ r_xprt->rx_ep = NULL;
}
/* Fixed-size circular FIFO queue. This implementation is wait-free and
@@ -793,12 +604,15 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
{
struct rpcrdma_sendctx *sc;
- sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge),
- GFP_KERNEL);
+ sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge),
+ XPRTRDMA_GFP_FLAGS);
if (!sc)
return NULL;
sc->sc_cqe.done = rpcrdma_wc_send;
+ sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id;
+ sc->sc_cid.ci_completion_id =
+ atomic_inc_return(&ep->re_completion_ids);
return sc;
}
@@ -813,14 +627,14 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
* the ->send_request call to fail temporarily before too many
* Sends are posted.
*/
- i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS;
- buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
+ i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS;
+ buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), XPRTRDMA_GFP_FLAGS);
if (!buf->rb_sc_ctxs)
return -ENOMEM;
buf->rb_sc_last = i - 1;
for (i = 0; i <= buf->rb_sc_last; i++) {
- sc = rpcrdma_sendctx_create(&r_xprt->rx_ep);
+ sc = rpcrdma_sendctx_create(r_xprt->rx_ep);
if (!sc)
return -ENOMEM;
@@ -924,25 +738,26 @@ static void
rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct ib_device *device = ep->re_id->device;
unsigned int count;
- for (count = 0; count < ia->ri_max_rdma_segs; count++) {
+ /* Try to allocate enough to perform one full-sized I/O */
+ for (count = 0; count < ep->re_max_rdma_segs; count++) {
struct rpcrdma_mr *mr;
int rc;
- mr = kzalloc(sizeof(*mr), GFP_NOFS);
+ mr = kzalloc_node(sizeof(*mr), XPRTRDMA_GFP_FLAGS,
+ ibdev_to_node(device));
if (!mr)
break;
- rc = frwr_init_mr(ia, mr);
+ rc = frwr_mr_init(r_xprt, mr);
if (rc) {
kfree(mr);
break;
}
- mr->mr_xprt = r_xprt;
-
spin_lock(&buf->rb_lock);
rpcrdma_mr_push(mr, &buf->rb_mrs);
list_add(&mr->mr_all, &buf->rb_all_mrs);
@@ -973,43 +788,38 @@ rpcrdma_mr_refresh_worker(struct work_struct *work)
void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
- /* If there is no underlying device, it's no use to
- * wake the refresh worker.
+ /* If there is no underlying connection, it's no use
+ * to wake the refresh worker.
*/
- if (ep->rep_connected != -ENODEV) {
- /* The work is scheduled on a WQ_MEM_RECLAIM
- * workqueue in order to prevent MR allocation
- * from recursing into NFS during direct reclaim.
- */
- queue_work(xprtiod_workqueue, &buf->rb_refresh_worker);
- }
+ if (ep->re_connect_status != 1)
+ return;
+ queue_work(system_highpri_wq, &buf->rb_refresh_worker);
}
/**
* rpcrdma_req_create - Allocate an rpcrdma_req object
* @r_xprt: controlling r_xprt
* @size: initial size, in bytes, of send and receive buffers
- * @flags: GFP flags passed to memory allocators
*
* Returns an allocated and fully initialized rpcrdma_req or NULL.
*/
-struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
- gfp_t flags)
+struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt,
+ size_t size)
{
struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
struct rpcrdma_req *req;
- req = kzalloc(sizeof(*req), flags);
+ req = kzalloc(sizeof(*req), XPRTRDMA_GFP_FLAGS);
if (req == NULL)
goto out1;
- req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags);
+ req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE);
if (!req->rl_sendbuf)
goto out2;
- req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags);
+ req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE);
if (!req->rl_recvbuf)
goto out3;
@@ -1042,10 +852,10 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Compute maximum header buffer size in bytes */
maxhdrsize = rpcrdma_fixed_maxsz + 3 +
- r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz;
+ r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz;
maxhdrsize *= sizeof(__be32);
rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
- DMA_TO_DEVICE, GFP_KERNEL);
+ DMA_TO_DEVICE);
if (!rb)
goto out;
@@ -1091,6 +901,8 @@ static void rpcrdma_req_reset(struct rpcrdma_req *req)
rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
+
+ frwr_reset(req);
}
/* ASSUMPTION: the rb_allreqs list is stable for the duration,
@@ -1107,27 +919,28 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
rpcrdma_req_reset(req);
}
-/* No locking needed here. This function is called only by the
- * Receive completion handler.
- */
static noinline
struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
bool temp)
{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_rep *rep;
- rep = kzalloc(sizeof(*rep), GFP_KERNEL);
+ rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS);
if (rep == NULL)
goto out;
- rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv,
- DMA_FROM_DEVICE, GFP_KERNEL);
+ rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv,
+ DMA_FROM_DEVICE);
if (!rep->rr_rdmabuf)
goto out_free;
if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
goto out_free_regbuf;
+ rep->rr_cid.ci_completion_id =
+ atomic_inc_return(&r_xprt->rx_ep->re_completion_ids);
+
xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf),
rdmab_length(rep->rr_rdmabuf));
rep->rr_cqe.done = rpcrdma_wc_receive;
@@ -1137,7 +950,10 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
rep->rr_recv_wr.num_sge = 1;
rep->rr_temp = temp;
- list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps);
+
+ spin_lock(&buf->rb_lock);
+ list_add(&rep->rr_all, &buf->rb_all_reps);
+ spin_unlock(&buf->rb_lock);
return rep;
out_free_regbuf:
@@ -1148,16 +964,23 @@ out:
return NULL;
}
-/* No locking needed here. This function is invoked only by the
- * Receive completion handler, or during transport shutdown.
- */
-static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
+static void rpcrdma_rep_free(struct rpcrdma_rep *rep)
{
- list_del(&rep->rr_all);
rpcrdma_regbuf_free(rep->rr_rdmabuf);
kfree(rep);
}
+static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
+{
+ struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf;
+
+ spin_lock(&buf->rb_lock);
+ list_del(&rep->rr_all);
+ spin_unlock(&buf->rb_lock);
+
+ rpcrdma_rep_free(rep);
+}
+
static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
{
struct llist_node *node;
@@ -1169,12 +992,21 @@ static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
return llist_entry(node, struct rpcrdma_rep, rr_node);
}
-static void rpcrdma_rep_put(struct rpcrdma_buffer *buf,
- struct rpcrdma_rep *rep)
+/**
+ * rpcrdma_rep_put - Release rpcrdma_rep back to free list
+ * @buf: buffer pool
+ * @rep: rep to release
+ *
+ */
+void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep)
{
llist_add(&rep->rr_node, &buf->rb_free_reps);
}
+/* Caller must ensure the QP is quiescent (RQ is drained) before
+ * invoking this function, to guarantee rb_all_reps is not
+ * changing.
+ */
static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
@@ -1182,7 +1014,7 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
list_for_each_entry(rep, &buf->rb_all_reps, rr_all) {
rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
- rep->rr_temp = true;
+ rep->rr_temp = true; /* Mark this rep for destruction */
}
}
@@ -1190,8 +1022,18 @@ static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_rep *rep;
- while ((rep = rpcrdma_rep_get_locked(buf)) != NULL)
- rpcrdma_rep_destroy(rep);
+ spin_lock(&buf->rb_lock);
+ while ((rep = list_first_entry_or_null(&buf->rb_all_reps,
+ struct rpcrdma_rep,
+ rr_all)) != NULL) {
+ list_del(&rep->rr_all);
+ spin_unlock(&buf->rb_lock);
+
+ rpcrdma_rep_free(rep);
+
+ spin_lock(&buf->rb_lock);
+ }
+ spin_unlock(&buf->rb_lock);
}
/**
@@ -1219,8 +1061,8 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
for (i = 0; i < r_xprt->rx_xprt.max_reqs; i++) {
struct rpcrdma_req *req;
- req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2,
- GFP_KERNEL);
+ req = rpcrdma_req_create(r_xprt,
+ RPCRDMA_V1_DEF_INLINE_SIZE * 2);
if (!req)
goto out;
list_add(&req->rl_list, &buf->rb_send_bufs);
@@ -1254,7 +1096,7 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req)
list_del(&mr->mr_all);
spin_unlock(&buf->rb_lock);
- frwr_release_mr(mr);
+ frwr_mr_release(mr);
}
rpcrdma_regbuf_free(req->rl_recvbuf);
@@ -1285,7 +1127,7 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt)
list_del(&mr->mr_all);
spin_unlock(&buf->rb_lock);
- frwr_release_mr(mr);
+ frwr_mr_release(mr);
spin_lock(&buf->rb_lock);
}
@@ -1335,22 +1177,17 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
}
/**
- * rpcrdma_mr_put - DMA unmap an MR and release it
- * @mr: MR to release
+ * rpcrdma_reply_put - Put reply buffers back into pool
+ * @buffers: buffer pool
+ * @req: object to return
*
*/
-void rpcrdma_mr_put(struct rpcrdma_mr *mr)
+void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
{
- struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-
- if (mr->mr_dir != DMA_NONE) {
- trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
- mr->mr_sg, mr->mr_nents, mr->mr_dir);
- mr->mr_dir = DMA_NONE;
+ if (req->rl_reply) {
+ rpcrdma_rep_put(buffers, req->rl_reply);
+ req->rl_reply = NULL;
}
-
- rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
}
/**
@@ -1381,26 +1218,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
*/
void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
{
- if (req->rl_reply)
- rpcrdma_rep_put(buffers, req->rl_reply);
- req->rl_reply = NULL;
+ rpcrdma_reply_put(buffers, req);
spin_lock(&buffers->rb_lock);
list_add(&req->rl_list, &buffers->rb_send_bufs);
spin_unlock(&buffers->rb_lock);
}
-/**
- * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list
- * @rep: rep to release
- *
- * Used after error conditions.
- */
-void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
-{
- rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep);
-}
-
/* Returns a pointer to a rpcrdma_regbuf object, or NULL.
*
* xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
@@ -1408,15 +1232,14 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
* or Replies they may be registered externally via frwr_map.
*/
static struct rpcrdma_regbuf *
-rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
- gfp_t flags)
+rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction)
{
struct rpcrdma_regbuf *rb;
- rb = kmalloc(sizeof(*rb), flags);
+ rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS);
if (!rb)
return NULL;
- rb->rg_data = kmalloc(size, flags);
+ rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS);
if (!rb->rg_data) {
kfree(rb);
return NULL;
@@ -1463,7 +1286,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags)
bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_regbuf *rb)
{
- struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+ struct ib_device *device = r_xprt->rx_ep->re_id->device;
if (rb->rg_direction == DMA_NONE)
return false;
@@ -1476,7 +1299,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
}
rb->rg_device = device;
- rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey;
+ rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey;
return true;
}
@@ -1502,61 +1325,32 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
}
/**
- * rpcrdma_ep_post - Post WRs to a transport's Send Queue
- * @ia: transport's device information
- * @ep: transport's RDMA endpoint information
- * @req: rpcrdma_req containing the Send WR to post
- *
- * Returns 0 if the post was successful, otherwise -ENOTCONN
- * is returned.
- */
-int
-rpcrdma_ep_post(struct rpcrdma_ia *ia,
- struct rpcrdma_ep *ep,
- struct rpcrdma_req *req)
-{
- struct ib_send_wr *send_wr = &req->rl_wr;
- int rc;
-
- if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) {
- send_wr->send_flags |= IB_SEND_SIGNALED;
- ep->rep_send_count = ep->rep_send_batch;
- } else {
- send_wr->send_flags &= ~IB_SEND_SIGNALED;
- --ep->rep_send_count;
- }
-
- rc = frwr_send(ia, req);
- trace_xprtrdma_post_send(req, rc);
- if (rc)
- return -ENOTCONN;
- return 0;
-}
-
-/**
* rpcrdma_post_recvs - Refill the Receive Queue
* @r_xprt: controlling transport instance
- * @temp: mark Receive buffers to be deleted after use
+ * @needed: current credit grant
+ * @temp: mark Receive buffers to be deleted after one use
*
*/
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_recv_wr *wr, *bad_wr;
struct rpcrdma_rep *rep;
- int needed, count, rc;
+ int count, rc;
rc = 0;
count = 0;
- needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
- if (likely(ep->rep_receive_count > needed))
+ if (likely(ep->re_receive_count > needed))
goto out;
- needed -= ep->rep_receive_count;
+ needed -= ep->re_receive_count;
if (!temp)
needed += RPCRDMA_MAX_RECV_BATCH;
+ if (atomic_inc_return(&ep->re_receiving) > 1)
+ goto out;
+
/* fast path: all needed reps can be found on the free list */
wr = NULL;
while (needed) {
@@ -1570,6 +1364,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
if (!rep)
break;
+ rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id;
trace_xprtrdma_post_recv(rep);
rep->rr_recv_wr.next = wr;
wr = &rep->rr_recv_wr;
@@ -1579,20 +1374,24 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
if (!wr)
goto out;
- rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
+ rc = ib_post_recv(ep->re_id->qp, wr,
(const struct ib_recv_wr **)&bad_wr);
-out:
- trace_xprtrdma_post_recvs(r_xprt, count, rc);
if (rc) {
+ trace_xprtrdma_post_recvs_err(r_xprt, rc);
for (wr = bad_wr; wr;) {
struct rpcrdma_rep *rep;
rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr);
wr = wr->next;
- rpcrdma_recv_buffer_put(rep);
+ rpcrdma_rep_put(buf, rep);
--count;
}
}
- ep->rep_receive_count += count;
+ if (atomic_dec_return(&ep->re_receiving) > 0)
+ complete(&ep->re_done);
+
+out:
+ trace_xprtrdma_post_recvs(r_xprt, count);
+ ep->re_receive_count += count;
return;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 37d5080c250b..5e5ff6784ef5 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -53,6 +53,7 @@
#include <rdma/ib_verbs.h> /* RDMA verbs api */
#include <linux/sunrpc/clnt.h> /* rpc_xprt */
+#include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */
#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
@@ -65,48 +66,45 @@
#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
/*
- * Interface Adapter -- one per transport instance
+ * RDMA Endpoint -- connection endpoint details
*/
-struct rpcrdma_ia {
- struct rdma_cm_id *ri_id;
- struct ib_pd *ri_pd;
- int ri_async_rc;
- unsigned int ri_max_rdma_segs;
- unsigned int ri_max_frwr_depth;
- bool ri_implicit_roundup;
- enum ib_mr_type ri_mrtype;
- unsigned long ri_flags;
- struct completion ri_done;
- struct completion ri_remove_done;
-};
-
-enum {
- RPCRDMA_IAF_REMOVING = 0,
-};
-
-/*
- * RDMA Endpoint -- one per transport instance
- */
-
+struct rpcrdma_mr;
struct rpcrdma_ep {
- unsigned int rep_send_count;
- unsigned int rep_send_batch;
- unsigned int rep_max_inline_send;
- unsigned int rep_max_inline_recv;
- int rep_connected;
- struct ib_qp_init_attr rep_attr;
- wait_queue_head_t rep_connect_wait;
- struct rpcrdma_connect_private rep_cm_private;
- struct rdma_conn_param rep_remote_cma;
- unsigned int rep_max_requests; /* depends on device */
- unsigned int rep_inline_send; /* negotiated */
- unsigned int rep_inline_recv; /* negotiated */
- int rep_receive_count;
+ struct kref re_kref;
+ struct rdma_cm_id *re_id;
+ struct ib_pd *re_pd;
+ unsigned int re_max_rdma_segs;
+ unsigned int re_max_fr_depth;
+ struct rpcrdma_mr *re_write_pad_mr;
+ enum ib_mr_type re_mrtype;
+ struct completion re_done;
+ unsigned int re_send_count;
+ unsigned int re_send_batch;
+ unsigned int re_max_inline_send;
+ unsigned int re_max_inline_recv;
+ int re_async_rc;
+ int re_connect_status;
+ atomic_t re_receiving;
+ atomic_t re_force_disconnect;
+ struct ib_qp_init_attr re_attr;
+ wait_queue_head_t re_connect_wait;
+ struct rpc_xprt *re_xprt;
+ struct rpcrdma_connect_private
+ re_cm_private;
+ struct rdma_conn_param re_remote_cma;
+ int re_receive_count;
+ unsigned int re_max_requests; /* depends on device */
+ unsigned int re_inline_send; /* negotiated */
+ unsigned int re_inline_recv; /* negotiated */
+
+ atomic_t re_completion_ids;
+
+ char re_write_pad[XDR_UNIT];
};
-/* Pre-allocate extra Work Requests for handling backward receives
- * and sends. This is a fixed value because the Work Queues are
- * allocated when the forward channel is set up, long before the
+/* Pre-allocate extra Work Requests for handling reverse-direction
+ * Receives and Sends. This is a fixed value because the Work Queues
+ * are allocated when the forward channel is set up, long before the
* backchannel is provisioned. This value is two times
* NFS4_DEF_CB_SLOT_TABLE_SIZE.
*/
@@ -151,7 +149,11 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb)
return rb->rg_data;
}
-#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
+/* Do not use emergency memory reserves, and fail quickly if memory
+ * cannot be allocated easily. These flags may be used wherever there
+ * is robust logic to handle a failure to allocate.
+ */
+#define XPRTRDMA_GFP_FLAGS (__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN)
/* To ensure a transport can always make forward progress,
* the number of RDMA segments allowed in header chunk lists
@@ -189,6 +191,8 @@ enum {
struct rpcrdma_rep {
struct ib_cqe rr_cqe;
+ struct rpc_rdma_cid rr_cid;
+
__be32 rr_xid;
__be32 rr_vers;
__be32 rr_proc;
@@ -220,6 +224,7 @@ enum {
struct rpcrdma_req;
struct rpcrdma_sendctx {
struct ib_cqe sc_cqe;
+ struct rpc_rdma_cid sc_cid;
struct rpcrdma_req *sc_req;
unsigned int sc_unmap_count;
struct ib_sge sc_sges[];
@@ -231,29 +236,28 @@ struct rpcrdma_sendctx {
* An external memory region is any buffer or page that is registered
* on the fly (ie, not pre-registered).
*/
-struct rpcrdma_frwr {
- struct ib_mr *fr_mr;
- struct ib_cqe fr_cqe;
- struct completion fr_linv_done;
- union {
- struct ib_reg_wr fr_regwr;
- struct ib_send_wr fr_invwr;
- };
-};
-
struct rpcrdma_req;
struct rpcrdma_mr {
struct list_head mr_list;
struct rpcrdma_req *mr_req;
+
+ struct ib_mr *mr_ibmr;
+ struct ib_device *mr_device;
struct scatterlist *mr_sg;
int mr_nents;
enum dma_data_direction mr_dir;
- struct rpcrdma_frwr frwr;
+ struct ib_cqe mr_cqe;
+ struct completion mr_linv_done;
+ union {
+ struct ib_reg_wr mr_regwr;
+ struct ib_send_wr mr_invwr;
+ };
struct rpcrdma_xprt *mr_xprt;
u32 mr_handle;
u32 mr_length;
u64 mr_offset;
struct list_head mr_all;
+ struct rpc_rdma_cid mr_cid;
};
/*
@@ -284,10 +288,11 @@ enum {
RPCRDMA_MAX_IOV_SEGS,
};
-struct rpcrdma_mr_seg { /* chunk descriptors */
- u32 mr_len; /* length of chunk or segment */
- struct page *mr_page; /* owning page, if any */
- char *mr_offset; /* kva if no page, else offset */
+/* Arguments for DMA mapping and registration */
+struct rpcrdma_mr_seg {
+ u32 mr_len; /* length of segment */
+ struct page *mr_page; /* underlying struct page */
+ u64 mr_offset; /* IN: page offset, OUT: iova */
};
/* The Send SGE array is provisioned to send a maximum size
@@ -422,8 +427,7 @@ struct rpcrdma_stats {
*/
struct rpcrdma_xprt {
struct rpc_xprt rx_xprt;
- struct rpcrdma_ia rx_ia;
- struct rpcrdma_ep rx_ep;
+ struct rpcrdma_ep *rx_ep;
struct rpcrdma_buffer rx_buf;
struct delayed_work rx_connect_worker;
struct rpc_timeout rx_timeout;
@@ -455,29 +459,20 @@ extern int xprt_rdma_pad_optimize;
extern unsigned int xprt_rdma_memreg_strategy;
/*
- * Interface Adapter calls - xprtrdma/verbs.c
- */
-int rpcrdma_ia_open(struct rpcrdma_xprt *xprt);
-void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
-void rpcrdma_ia_close(struct rpcrdma_ia *);
-
-/*
* Endpoint calls - xprtrdma/verbs.c
*/
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_force_disconnect(struct rpcrdma_ep *ep);
+void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc);
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
- struct rpcrdma_req *);
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp);
/*
* Buffer calls - xprtrdma/verbs.c
*/
-struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
- gfp_t flags);
+struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt,
+ size_t size);
int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void rpcrdma_req_destroy(struct rpcrdma_req *req);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
@@ -485,13 +480,13 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_mr_put(struct rpcrdma_mr *mr);
void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers,
struct rpcrdma_req *req);
-void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep);
+void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req);
bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size,
gfp_t flags);
@@ -536,18 +531,18 @@ rpcrdma_data_dir(bool writing)
/* Memory registration calls xprtrdma/frwr_ops.c
*/
void frwr_reset(struct rpcrdma_req *req);
-int frwr_query_device(struct rpcrdma_xprt *r_xprt,
- const struct ib_device *device);
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
-void frwr_release_mr(struct rpcrdma_mr *mr);
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
+void frwr_mr_release(struct rpcrdma_mr *mr);
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr *mr);
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+int frwr_wp_create(struct rpcrdma_xprt *r_xprt);
/*
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
@@ -569,9 +564,10 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
enum rpcrdma_chunktype rtype);
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc);
int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep);
void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt);
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
+void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep);
void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index d86c664ea6af..915b9902f673 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -54,9 +54,11 @@
#include <trace/events/sunrpc.h>
+#include "socklib.h"
#include "sunrpc.h"
static void xs_close(struct rpc_xprt *xprt);
+static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock);
static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
struct socket *sock);
@@ -90,6 +92,11 @@ static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT;
static struct ctl_table_header *sunrpc_table_header;
+static struct xprt_class xs_local_transport;
+static struct xprt_class xs_udp_transport;
+static struct xprt_class xs_tcp_transport;
+static struct xprt_class xs_bc_tcp_transport;
+
/*
* FIXME: changing the UDP slot table size should also resize the UDP
* socket buffers for existing UDP transports
@@ -254,7 +261,7 @@ static void xs_format_common_peer_addresses(struct rpc_xprt *xprt)
switch (sap->sa_family) {
case AF_LOCAL:
sun = xs_addr_un(xprt);
- strlcpy(buf, sun->sun_path, sizeof(buf));
+ strscpy(buf, sun->sun_path, sizeof(buf));
xprt->address_strings[RPC_DISPLAY_ADDR] =
kstrdup(buf, GFP_KERNEL);
break;
@@ -421,9 +428,9 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
offset += want;
}
- want = xs_alloc_sparse_pages(buf,
- min_t(size_t, count - offset, buf->page_len),
- GFP_KERNEL);
+ want = xs_alloc_sparse_pages(
+ buf, min_t(size_t, count - offset, buf->page_len),
+ GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
if (seek < want) {
ret = xs_read_bvec(sock, msg, flags, buf->bvec,
xdr_buf_pagecount(buf),
@@ -432,7 +439,8 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
if (ret <= 0)
goto sock_err;
xs_flush_bvec(buf->bvec, ret, seek + buf->page_base);
- offset += ret - buf->page_base;
+ ret -= buf->page_base;
+ offset += ret;
if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
goto out;
if (ret != want)
@@ -495,8 +503,8 @@ xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg,
int flags, struct rpc_rqst *req)
{
struct xdr_buf *buf = &req->rq_private_buf;
- size_t want, uninitialized_var(read);
- ssize_t uninitialized_var(ret);
+ size_t want, read;
+ ssize_t ret;
xs_read_header(transport, buf);
@@ -556,6 +564,10 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
struct rpc_rqst *req;
ssize_t ret;
+ /* Is this transport associated with the backchannel? */
+ if (!xprt->bc_serv)
+ return -ESHUTDOWN;
+
/* Look up and lock the request corresponding to the given XID */
req = xprt_lookup_bc_request(xprt, transport->recv.xid);
if (!req) {
@@ -749,141 +761,19 @@ xs_stream_start_connect(struct sock_xprt *transport)
#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
-static int xs_sendmsg(struct socket *sock, struct msghdr *msg, size_t seek)
-{
- if (seek)
- iov_iter_advance(&msg->msg_iter, seek);
- return sock_sendmsg(sock, msg);
-}
-
-static int xs_send_kvec(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t seek)
-{
- iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len);
- return xs_sendmsg(sock, msg, seek);
-}
-
-static int xs_send_pagedata(struct socket *sock, struct msghdr *msg, struct xdr_buf *xdr, size_t base)
-{
- int err;
-
- err = xdr_alloc_bvec(xdr, GFP_KERNEL);
- if (err < 0)
- return err;
-
- iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec,
- xdr_buf_pagecount(xdr),
- xdr->page_len + xdr->page_base);
- return xs_sendmsg(sock, msg, base + xdr->page_base);
-}
-
-#define xs_record_marker_len() sizeof(rpc_fraghdr)
-
-/* Common case:
- * - stream transport
- * - sending from byte 0 of the message
- * - the message is wholly contained in @xdr's head iovec
- */
-static int xs_send_rm_and_kvec(struct socket *sock, struct msghdr *msg,
- rpc_fraghdr marker, struct kvec *vec, size_t base)
-{
- struct kvec iov[2] = {
- [0] = {
- .iov_base = &marker,
- .iov_len = sizeof(marker)
- },
- [1] = *vec,
- };
- size_t len = iov[0].iov_len + iov[1].iov_len;
-
- iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, len);
- return xs_sendmsg(sock, msg, base);
-}
-
-/**
- * xs_sendpages - write pages directly to a socket
- * @sock: socket to send on
- * @addr: UDP only -- address of destination
- * @addrlen: UDP only -- length of destination address
- * @xdr: buffer containing this request
- * @base: starting position in the buffer
- * @rm: stream record marker field
- * @sent_p: return the total number of bytes successfully queued for sending
- *
- */
-static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, rpc_fraghdr rm, int *sent_p)
-{
- struct msghdr msg = {
- .msg_name = addr,
- .msg_namelen = addrlen,
- .msg_flags = XS_SENDMSG_FLAGS | MSG_MORE,
- };
- unsigned int rmsize = rm ? sizeof(rm) : 0;
- unsigned int remainder = rmsize + xdr->len - base;
- unsigned int want;
- int err = 0;
-
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- want = xdr->head[0].iov_len + rmsize;
- if (base < want) {
- unsigned int len = want - base;
- remainder -= len;
- if (remainder == 0)
- msg.msg_flags &= ~MSG_MORE;
- if (rmsize)
- err = xs_send_rm_and_kvec(sock, &msg, rm,
- &xdr->head[0], base);
- else
- err = xs_send_kvec(sock, &msg, &xdr->head[0], base);
- if (remainder == 0 || err != len)
- goto out;
- *sent_p += err;
- base = 0;
- } else
- base -= want;
-
- if (base < xdr->page_len) {
- unsigned int len = xdr->page_len - base;
- remainder -= len;
- if (remainder == 0)
- msg.msg_flags &= ~MSG_MORE;
- err = xs_send_pagedata(sock, &msg, xdr, base);
- if (remainder == 0 || err != len)
- goto out;
- *sent_p += err;
- base = 0;
- } else
- base -= xdr->page_len;
-
- if (base >= xdr->tail[0].iov_len)
- return 0;
- msg.msg_flags &= ~MSG_MORE;
- err = xs_send_kvec(sock, &msg, &xdr->tail[0], base);
-out:
- if (err > 0) {
- *sent_p += err;
- err = 0;
- }
- return err;
-}
-
/**
* xs_nospace - handle transmit was incomplete
* @req: pointer to RPC request
+ * @transport: pointer to struct sock_xprt
*
*/
-static int xs_nospace(struct rpc_rqst *req)
+static int xs_nospace(struct rpc_rqst *req, struct sock_xprt *transport)
{
- struct rpc_xprt *xprt = req->rq_xprt;
- struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ struct rpc_xprt *xprt = &transport->xprt;
struct sock *sk = transport->inet;
int ret = -EAGAIN;
- dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
- req->rq_task->tk_pid,
- req->rq_slen - transport->xmit.offset,
- req->rq_slen);
+ trace_rpc_socket_nospace(req, transport);
/* Protect against races with write_space */
spin_lock(&xprt->transport_lock);
@@ -891,32 +781,50 @@ static int xs_nospace(struct rpc_rqst *req)
/* Don't race with disconnect */
if (xprt_connected(xprt)) {
/* wait for more buffer space */
+ set_bit(XPRT_SOCK_NOSPACE, &transport->sock_state);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
xprt_wait_for_buffer_space(xprt);
} else
ret = -ENOTCONN;
spin_unlock(&xprt->transport_lock);
+ return ret;
+}
- /* Race breaker in case memory is freed before above code is called */
- if (ret == -EAGAIN) {
- struct socket_wq *wq;
+static int xs_sock_nospace(struct rpc_rqst *req)
+{
+ struct sock_xprt *transport =
+ container_of(req->rq_xprt, struct sock_xprt, xprt);
+ struct sock *sk = transport->inet;
+ int ret = -EAGAIN;
- rcu_read_lock();
- wq = rcu_dereference(sk->sk_wq);
- set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
- rcu_read_unlock();
+ lock_sock(sk);
+ if (!sock_writeable(sk))
+ ret = xs_nospace(req, transport);
+ release_sock(sk);
+ return ret;
+}
- sk->sk_write_space(sk);
- }
+static int xs_stream_nospace(struct rpc_rqst *req, bool vm_wait)
+{
+ struct sock_xprt *transport =
+ container_of(req->rq_xprt, struct sock_xprt, xprt);
+ struct sock *sk = transport->inet;
+ int ret = -EAGAIN;
+
+ if (vm_wait)
+ return -ENOBUFS;
+ lock_sock(sk);
+ if (!sk_stream_memory_free(sk))
+ ret = xs_nospace(req, transport);
+ release_sock(sk);
return ret;
}
-static void
-xs_stream_prepare_request(struct rpc_rqst *req)
+static int xs_stream_prepare_request(struct rpc_rqst *req, struct xdr_buf *buf)
{
- xdr_free_bvec(&req->rq_rcv_buf);
- req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_KERNEL);
+ return xdr_alloc_bvec(buf, rpc_task_gfp_mask());
}
/*
@@ -949,7 +857,7 @@ xs_stream_record_marker(struct xdr_buf *xdr)
* EAGAIN: The socket was blocked, please call again later to
* complete the request
* ENOTCONN: Caller needs to invoke connect logic then call again
- * other: Some other error occured, the request was not sent
+ * other: Some other error occurred, the request was not sent
*/
static int xs_local_send_request(struct rpc_rqst *req)
{
@@ -959,27 +867,30 @@ static int xs_local_send_request(struct rpc_rqst *req)
struct xdr_buf *xdr = &req->rq_snd_buf;
rpc_fraghdr rm = xs_stream_record_marker(xdr);
unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen;
+ struct msghdr msg = {
+ .msg_flags = XS_SENDMSG_FLAGS,
+ };
+ bool vm_wait;
+ unsigned int sent;
int status;
- int sent = 0;
/* Close the stream if the previous transmission was incomplete */
if (xs_send_request_was_aborted(transport, req)) {
- xs_close(xprt);
+ xprt_force_disconnect(xprt);
return -ENOTCONN;
}
xs_pktdump("packet data:",
req->rq_svec->iov_base, req->rq_svec->iov_len);
+ vm_wait = sk_stream_is_writeable(transport->inet) ? true : false;
+
req->rq_xtime = ktime_get();
- status = xs_sendpages(transport->sock, NULL, 0, xdr,
- transport->xmit.offset, rm, &sent);
+ status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
+ transport->xmit.offset, rm, &sent);
dprintk("RPC: %s(%u) = %d\n",
__func__, xdr->len - transport->xmit.offset, status);
- if (status == -EAGAIN && sock_writeable(transport->inet))
- status = -ENOBUFS;
-
if (likely(sent > 0) || status == 0) {
transport->xmit.offset += sent;
req->rq_bytes_sent = transport->xmit.offset;
@@ -989,20 +900,19 @@ static int xs_local_send_request(struct rpc_rqst *req)
return 0;
}
status = -EAGAIN;
+ vm_wait = false;
}
switch (status) {
- case -ENOBUFS:
- break;
case -EAGAIN:
- status = xs_nospace(req);
+ status = xs_stream_nospace(req, vm_wait);
break;
default:
dprintk("RPC: sendmsg returned unrecognized error %d\n",
-status);
- /* fall through */
+ fallthrough;
case -EPIPE:
- xs_close(xprt);
+ xprt_force_disconnect(xprt);
status = -ENOTCONN;
}
@@ -1025,7 +935,12 @@ static int xs_udp_send_request(struct rpc_rqst *req)
struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf;
- int sent = 0;
+ struct msghdr msg = {
+ .msg_name = xs_addr(xprt),
+ .msg_namelen = xprt->addrlen,
+ .msg_flags = XS_SENDMSG_FLAGS,
+ };
+ unsigned int sent;
int status;
xs_pktdump("packet data:",
@@ -1038,9 +953,11 @@ static int xs_udp_send_request(struct rpc_rqst *req)
if (!xprt_request_get_cong(xprt, req))
return -EBADSLT;
+ status = xdr_alloc_bvec(xdr, rpc_task_gfp_mask());
+ if (status < 0)
+ return status;
req->rq_xtime = ktime_get();
- status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
- xdr, 0, 0, &sent);
+ status = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, 0, &sent);
dprintk("RPC: xs_udp_send_request(%u) = %d\n",
xdr->len, status);
@@ -1067,7 +984,7 @@ process_status:
/* Should we call xs_close() here? */
break;
case -EAGAIN:
- status = xs_nospace(req);
+ status = xs_sock_nospace(req);
break;
case -ENETUNREACH:
case -ENOBUFS:
@@ -1106,9 +1023,12 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
struct xdr_buf *xdr = &req->rq_snd_buf;
rpc_fraghdr rm = xs_stream_record_marker(xdr);
unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen;
- bool vm_wait = false;
+ struct msghdr msg = {
+ .msg_flags = XS_SENDMSG_FLAGS,
+ };
+ bool vm_wait;
+ unsigned int sent;
int status;
- int sent;
/* Close the stream if the previous transmission was incomplete */
if (xs_send_request_was_aborted(transport, req)) {
@@ -1116,6 +1036,8 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
kernel_sock_shutdown(transport->sock, SHUT_RDWR);
return -ENOTCONN;
}
+ if (!transport->inet)
+ return -ENOTCONN;
xs_pktdump("packet data:",
req->rq_svec->iov_base,
@@ -1124,14 +1046,19 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state))
xs_tcp_set_socket_timeouts(xprt, transport->sock);
+ xs_set_srcport(transport, transport->sock);
+
/* Continue transmitting the packet/record. We must be careful
* to cope with writespace callbacks arriving _after_ we have
* called sendmsg(). */
req->rq_xtime = ktime_get();
- while (1) {
- sent = 0;
- status = xs_sendpages(transport->sock, NULL, 0, xdr,
- transport->xmit.offset, rm, &sent);
+ tcp_sock_set_cork(transport->inet, true);
+
+ vm_wait = sk_stream_is_writeable(transport->inet) ? true : false;
+
+ do {
+ status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
+ transport->xmit.offset, rm, &sent);
dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
xdr->len - transport->xmit.offset, status);
@@ -1143,36 +1070,17 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
if (likely(req->rq_bytes_sent >= msglen)) {
req->rq_xmit_bytes_sent += transport->xmit.offset;
transport->xmit.offset = 0;
+ if (atomic_long_read(&xprt->xmit_queuelen) == 1)
+ tcp_sock_set_cork(transport->inet, false);
return 0;
}
WARN_ON_ONCE(sent == 0 && status == 0);
- if (status == -EAGAIN ) {
- /*
- * Return EAGAIN if we're sure we're hitting the
- * socket send buffer limits.
- */
- if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
- break;
- /*
- * Did we hit a memory allocation failure?
- */
- if (sent == 0) {
- status = -ENOBUFS;
- if (vm_wait)
- break;
- /* Retry, knowing now that we're below the
- * socket send buffer limit
- */
- vm_wait = true;
- }
- continue;
- }
- if (status < 0)
- break;
- vm_wait = false;
- }
+ if (sent > 0)
+ vm_wait = false;
+
+ } while (status == 0);
switch (status) {
case -ENOTSOCK:
@@ -1180,7 +1088,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
/* Should we call xs_close() here? */
break;
case -EAGAIN:
- status = xs_nospace(req);
+ status = xs_stream_nospace(req, vm_wait);
break;
case -ECONNRESET:
case -ECONNREFUSED:
@@ -1221,6 +1129,7 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state);
clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state);
clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state);
+ clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state);
}
static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr)
@@ -1231,6 +1140,7 @@ static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr)
static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
{
+ xprt->connect_cookie++;
smp_mb__before_atomic();
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
clear_bit(XPRT_CLOSING, &xprt->state);
@@ -1250,14 +1160,13 @@ static void xs_error_report(struct sock *sk)
struct sock_xprt *transport;
struct rpc_xprt *xprt;
- read_lock_bh(&sk->sk_callback_lock);
if (!(xprt = xprt_from_sock(sk)))
- goto out;
+ return;
transport = container_of(xprt, struct sock_xprt, xprt);
transport->xprt_err = -sk->sk_err;
if (transport->xprt_err == 0)
- goto out;
+ return;
dprintk("RPC: xs_error_report client %p, error=%d...\n",
xprt, -transport->xprt_err);
trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err);
@@ -1265,8 +1174,6 @@ static void xs_error_report(struct sock *sk)
/* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */
smp_mb__before_atomic();
xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR);
- out:
- read_unlock_bh(&sk->sk_callback_lock);
}
static void xs_reset_transport(struct sock_xprt *transport)
@@ -1278,6 +1185,16 @@ static void xs_reset_transport(struct sock_xprt *transport)
if (sk == NULL)
return;
+ /*
+ * Make sure we're calling this in a context from which it is safe
+ * to call __fput_sync(). In practice that means rpciod and the
+ * system workqueue.
+ */
+ if (!(current->flags & PF_WQ_WORKER)) {
+ WARN_ON_ONCE(1);
+ set_bit(XPRT_CLOSE_WAIT, &xprt->state);
+ return;
+ }
if (atomic_read(&transport->xprt.swapper))
sk_clear_memalloc(sk);
@@ -1285,7 +1202,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
kernel_sock_shutdown(sock, SHUT_RDWR);
mutex_lock(&transport->recv_mutex);
- write_lock_bh(&sk->sk_callback_lock);
+ lock_sock(sk);
transport->inet = NULL;
transport->sock = NULL;
transport->file = NULL;
@@ -1294,14 +1211,14 @@ static void xs_reset_transport(struct sock_xprt *transport)
xs_restore_old_callbacks(transport, sk);
xprt_clear_connected(xprt);
- write_unlock_bh(&sk->sk_callback_lock);
xs_sock_reset_connection_flags(xprt);
/* Reset stream record info */
xs_stream_reset_connect(transport);
+ release_sock(sk);
mutex_unlock(&transport->recv_mutex);
trace_rpc_socket_close(xprt, sock);
- fput(filp);
+ __fput_sync(filp);
xprt_disconnect_done(xprt);
}
@@ -1430,7 +1347,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
if (sk == NULL)
goto out;
for (;;) {
- skb = skb_recv_udp(sk, 0, 1, &err);
+ skb = skb_recv_udp(sk, MSG_DONTWAIT, &err);
if (skb == NULL)
break;
xs_udp_data_read_skb(&transport->xprt, sk, skb);
@@ -1453,7 +1370,7 @@ static void xs_udp_data_receive_workfn(struct work_struct *work)
}
/**
- * xs_data_ready - "data ready" callback for UDP sockets
+ * xs_data_ready - "data ready" callback for sockets
* @sk: socket with data to read
*
*/
@@ -1461,12 +1378,13 @@ static void xs_data_ready(struct sock *sk)
{
struct rpc_xprt *xprt;
- read_lock_bh(&sk->sk_callback_lock);
- dprintk("RPC: xs_data_ready...\n");
xprt = xprt_from_sock(sk);
if (xprt != NULL) {
struct sock_xprt *transport = container_of(xprt,
struct sock_xprt, xprt);
+
+ trace_xs_data_ready(xprt);
+
transport->old_data_ready(sk);
/* Any data means we had a useful conversation, so
* then we don't need to delay the next reconnect
@@ -1476,7 +1394,6 @@ static void xs_data_ready(struct sock *sk)
if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
queue_work(xprtiod_workqueue, &transport->recv_worker);
}
- read_unlock_bh(&sk->sk_callback_lock);
}
/*
@@ -1496,6 +1413,26 @@ static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
/**
+ * xs_local_state_change - callback to handle AF_LOCAL socket state changes
+ * @sk: socket whose state has changed
+ *
+ */
+static void xs_local_state_change(struct sock *sk)
+{
+ struct rpc_xprt *xprt;
+ struct sock_xprt *transport;
+
+ if (!(xprt = xprt_from_sock(sk)))
+ return;
+ transport = container_of(xprt, struct sock_xprt, xprt);
+ if (sk->sk_shutdown & SHUTDOWN_MASK) {
+ clear_bit(XPRT_CONNECTED, &xprt->state);
+ /* Trigger the socket release */
+ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
+ }
+}
+
+/**
* xs_tcp_state_change - callback to handle TCP socket state changes
* @sk: socket whose state has changed
*
@@ -1505,9 +1442,8 @@ static void xs_tcp_state_change(struct sock *sk)
struct rpc_xprt *xprt;
struct sock_xprt *transport;
- read_lock_bh(&sk->sk_callback_lock);
if (!(xprt = xprt_from_sock(sk)))
- goto out;
+ return;
dprintk("RPC: xs_tcp_state_change client %p...\n", xprt);
dprintk("RPC: state %x conn %d dead %d zapped %d sk_shutdown %d\n",
sk->sk_state, xprt_connected(xprt),
@@ -1545,7 +1481,7 @@ static void xs_tcp_state_change(struct sock *sk)
xprt->connect_cookie++;
clear_bit(XPRT_CONNECTED, &xprt->state);
xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
- /* fall through */
+ fallthrough;
case TCP_CLOSING:
/*
* If the server closed down the connection, make sure that
@@ -1568,13 +1504,10 @@ static void xs_tcp_state_change(struct sock *sk)
/* Trigger the socket release */
xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
}
- out:
- read_unlock_bh(&sk->sk_callback_lock);
}
static void xs_write_space(struct sock *sk)
{
- struct socket_wq *wq;
struct sock_xprt *transport;
struct rpc_xprt *xprt;
@@ -1585,15 +1518,10 @@ static void xs_write_space(struct sock *sk)
if (unlikely(!(xprt = xprt_from_sock(sk))))
return;
transport = container_of(xprt, struct sock_xprt, xprt);
- rcu_read_lock();
- wq = rcu_dereference(sk->sk_wq);
- if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
- goto out;
-
+ if (!test_and_clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state))
+ return;
xs_run_error_worker(transport, XPRT_SOCK_WAKE_WRITE);
sk->sk_write_pending--;
-out:
- rcu_read_unlock();
}
/**
@@ -1608,13 +1536,9 @@ out:
*/
static void xs_udp_write_space(struct sock *sk)
{
- read_lock_bh(&sk->sk_callback_lock);
-
/* from net/core/sock.c:sock_def_write_space */
if (sock_writeable(sk))
xs_write_space(sk);
-
- read_unlock_bh(&sk->sk_callback_lock);
}
/**
@@ -1629,13 +1553,9 @@ static void xs_udp_write_space(struct sock *sk)
*/
static void xs_tcp_write_space(struct sock *sk)
{
- read_lock_bh(&sk->sk_callback_lock);
-
/* from net/core/stream.c:sk_stream_write_space */
if (sk_stream_is_writeable(sk))
xs_write_space(sk);
-
- read_unlock_bh(&sk->sk_callback_lock);
}
static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt)
@@ -1699,25 +1619,10 @@ static int xs_get_random_port(void)
if (max < min)
return -EADDRINUSE;
range = max - min + 1;
- rand = (unsigned short) prandom_u32() % range;
+ rand = prandom_u32_max(range);
return rand + min;
}
-/**
- * xs_set_reuseaddr_port - set the socket's port and address reuse options
- * @sock: socket
- *
- * Note that this function has to be called on all sockets that share the
- * same port, and it must be called before binding.
- */
-static void xs_sock_set_reuseport(struct socket *sock)
-{
- int opt = 1;
-
- kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT,
- (char *)&opt, sizeof(opt));
-}
-
static unsigned short xs_sock_getport(struct socket *sock)
{
struct sockaddr_storage buf;
@@ -1765,6 +1670,36 @@ static int xs_get_srcport(struct sock_xprt *transport)
return port;
}
+static unsigned short xs_sock_srcport(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt);
+ unsigned short ret = 0;
+ mutex_lock(&sock->recv_mutex);
+ if (sock->sock)
+ ret = xs_sock_getport(sock->sock);
+ mutex_unlock(&sock->recv_mutex);
+ return ret;
+}
+
+static int xs_sock_srcaddr(struct rpc_xprt *xprt, char *buf, size_t buflen)
+{
+ struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt);
+ union {
+ struct sockaddr sa;
+ struct sockaddr_storage st;
+ } saddr;
+ int ret = -ENOTCONN;
+
+ mutex_lock(&sock->recv_mutex);
+ if (sock->sock) {
+ ret = kernel_getsockname(sock->sock, &saddr.sa);
+ if (ret >= 0)
+ ret = snprintf(buf, buflen, "%pISc", &saddr.sa);
+ }
+ mutex_unlock(&sock->recv_mutex);
+ return ret;
+}
+
static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port)
{
if (transport->srcport != 0)
@@ -1791,7 +1726,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
* This ensures that we can continue to establish TCP
* connections even when all local ephemeral ports are already
* a part of some TCP connection. This makes no difference
- * for UDP sockets, but also doens't harm them.
+ * for UDP sockets, but also doesn't harm them.
*
* If we're asking for any reserved port (i.e. port == 0 &&
* transport->xprt.resvport == 1) xs_get_srcport above will
@@ -1806,7 +1741,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
err = kernel_bind(sock, (struct sockaddr *)&myaddr,
transport->xprt.addrlen);
if (err == 0) {
- transport->srcport = port;
+ if (transport->xprt.reuseport)
+ transport->srcport = port;
break;
}
last = port;
@@ -1839,15 +1775,15 @@ static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port)
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct lock_class_key xs_key[2];
-static struct lock_class_key xs_slock_key[2];
+static struct lock_class_key xs_key[3];
+static struct lock_class_key xs_slock_key[3];
static inline void xs_reclassify_socketu(struct socket *sock)
{
struct sock *sk = sock->sk;
sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC",
- &xs_slock_key[1], "sk_lock-AF_LOCAL-RPC", &xs_key[1]);
+ &xs_slock_key[0], "sk_lock-AF_LOCAL-RPC", &xs_key[0]);
}
static inline void xs_reclassify_socket4(struct socket *sock)
@@ -1855,7 +1791,7 @@ static inline void xs_reclassify_socket4(struct socket *sock)
struct sock *sk = sock->sk;
sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC",
- &xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]);
+ &xs_slock_key[1], "sk_lock-AF_INET-RPC", &xs_key[1]);
}
static inline void xs_reclassify_socket6(struct socket *sock)
@@ -1863,7 +1799,7 @@ static inline void xs_reclassify_socket6(struct socket *sock)
struct sock *sk = sock->sk;
sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC",
- &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]);
+ &xs_slock_key[2], "sk_lock-AF_INET6-RPC", &xs_key[2]);
}
static inline void xs_reclassify_socket(int family, struct socket *sock)
@@ -1910,7 +1846,7 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt,
xs_reclassify_socket(family, sock);
if (reuseport)
- xs_sock_set_reuseport(sock);
+ sock_set_reuseport(sock->sk);
err = xs_bind(transport, sock);
if (err) {
@@ -1937,14 +1873,14 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
if (!transport->inet) {
struct sock *sk = sock->sk;
- write_lock_bh(&sk->sk_callback_lock);
+ lock_sock(sk);
xs_save_old_callbacks(transport, sk);
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
- sock_set_flag(sk, SOCK_FASYNC);
+ sk->sk_state_change = xs_local_state_change;
sk->sk_error_report = xs_error_report;
xprt_clear_connected(xprt);
@@ -1953,7 +1889,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
transport->sock = sock;
transport->inet = sk;
- write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
}
xs_stream_start_connect(transport);
@@ -1970,7 +1906,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
struct rpc_xprt *xprt = &transport->xprt;
struct file *filp;
struct socket *sock;
- int status = -EIO;
+ int status;
status = __sock_create(xprt->xprt_net, AF_LOCAL,
SOCK_STREAM, 0, &sock, 1);
@@ -2001,6 +1937,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
xprt->stat.connect_time += (long)jiffies -
xprt->stat.connect_start;
xprt_set_connected(xprt);
+ break;
case -ENOBUFS:
break;
case -ENOENT:
@@ -2028,7 +1965,10 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
int ret;
- if (RPC_IS_ASYNC(task)) {
+ if (transport->file)
+ goto force_disconnect;
+
+ if (RPC_IS_ASYNC(task)) {
/*
* We want the AF_LOCAL connect to be resolved in the
* filesystem namespace of the process making the rpc
@@ -2038,20 +1978,25 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
* we'll need to figure out how to pass a namespace to
* connect.
*/
- task->tk_rpc_status = -ENOTCONN;
- rpc_exit(task, -ENOTCONN);
- return;
+ rpc_task_set_rpc_status(task, -ENOTCONN);
+ goto out_wake;
}
ret = xs_local_setup_socket(transport);
if (ret && !RPC_IS_SOFTCONN(task))
msleep_interruptible(15000);
+ return;
+force_disconnect:
+ xprt_force_disconnect(xprt);
+out_wake:
+ xprt_clear_connecting(xprt);
+ xprt_wake_pending_tasks(xprt, -ENOTCONN);
}
#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
/*
- * Note that this should be called with XPRT_LOCKED held (or when we otherwise
- * know that we have exclusive access to the socket), to guard against
- * races with xs_reset_transport.
+ * Note that this should be called with XPRT_LOCKED held, or recv_mutex
+ * held, or when we otherwise know that we have exclusive access to the
+ * socket, to guard against races with xs_reset_transport.
*/
static void xs_set_memalloc(struct rpc_xprt *xprt)
{
@@ -2080,13 +2025,11 @@ xs_enable_swap(struct rpc_xprt *xprt)
{
struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
- if (atomic_inc_return(&xprt->swapper) != 1)
- return 0;
- if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
- return -ERESTARTSYS;
- if (xs->inet)
+ mutex_lock(&xs->recv_mutex);
+ if (atomic_inc_return(&xprt->swapper) == 1 &&
+ xs->inet)
sk_set_memalloc(xs->inet);
- xprt_release_xprt(xprt, NULL);
+ mutex_unlock(&xs->recv_mutex);
return 0;
}
@@ -2102,13 +2045,11 @@ xs_disable_swap(struct rpc_xprt *xprt)
{
struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
- if (!atomic_dec_and_test(&xprt->swapper))
- return;
- if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
- return;
- if (xs->inet)
+ mutex_lock(&xs->recv_mutex);
+ if (atomic_dec_and_test(&xprt->swapper) &&
+ xs->inet)
sk_clear_memalloc(xs->inet);
- xprt_release_xprt(xprt, NULL);
+ mutex_unlock(&xs->recv_mutex);
}
#else
static void xs_set_memalloc(struct rpc_xprt *xprt)
@@ -2134,14 +2075,13 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
if (!transport->inet) {
struct sock *sk = sock->sk;
- write_lock_bh(&sk->sk_callback_lock);
+ lock_sock(sk);
xs_save_old_callbacks(transport, sk);
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
- sock_set_flag(sk, SOCK_FASYNC);
xprt_set_connected(xprt);
@@ -2151,7 +2091,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
xs_set_memalloc(xprt);
- write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
}
xs_udp_do_set_buffer_size(xprt);
@@ -2165,7 +2105,10 @@ static void xs_udp_setup_socket(struct work_struct *work)
struct rpc_xprt *xprt = &transport->xprt;
struct socket *sock;
int status = -EIO;
+ unsigned int pflags = current->flags;
+ if (atomic_read(&xprt->swapper))
+ current->flags |= PF_MEMALLOC;
sock = xs_create_sock(xprt, transport,
xs_addr(xprt)->sa_family, SOCK_DGRAM,
IPPROTO_UDP, false);
@@ -2185,6 +2128,7 @@ out:
xprt_clear_connecting(xprt);
xprt_unlock_connect(xprt, transport);
xprt_wake_pending_tasks(xprt, status);
+ current_restore_flags(pflags, PF_MEMALLOC);
}
/**
@@ -2202,13 +2146,20 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
if (sock == NULL)
return;
+ if (!xprt->reuseport) {
+ xs_close(xprt);
+ return;
+ }
switch (skst) {
- default:
+ case TCP_FIN_WAIT1:
+ case TCP_FIN_WAIT2:
+ break;
+ case TCP_ESTABLISHED:
+ case TCP_CLOSE_WAIT:
kernel_sock_shutdown(sock, SHUT_RDWR);
trace_rpc_socket_shutdown(xprt, sock);
break;
- case TCP_CLOSE:
- case TCP_TIME_WAIT:
+ default:
xs_reset_transport(transport);
}
}
@@ -2219,7 +2170,6 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
unsigned int keepidle;
unsigned int keepcnt;
- unsigned int opt_on = 1;
unsigned int timeo;
spin_lock(&xprt->transport_lock);
@@ -2231,18 +2181,13 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
spin_unlock(&xprt->transport_lock);
/* TCP Keepalive options */
- kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
- (char *)&opt_on, sizeof(opt_on));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
- (char *)&keepidle, sizeof(keepidle));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
- (char *)&keepidle, sizeof(keepidle));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
- (char *)&keepcnt, sizeof(keepcnt));
+ sock_set_keepalive(sock->sk);
+ tcp_sock_set_keepidle(sock->sk, keepidle);
+ tcp_sock_set_keepintvl(sock->sk, keepidle);
+ tcp_sock_set_keepcnt(sock->sk, keepcnt);
/* TCP user timeout (see RFC5482) */
- kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
- (char *)&timeo, sizeof(timeo));
+ tcp_sock_set_user_timeout(sock->sk, timeo);
}
static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
@@ -2276,11 +2221,9 @@ static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
{
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
- int ret = -ENOTCONN;
if (!transport->inet) {
struct sock *sk = sock->sk;
- unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC;
/* Avoid temporary address, they are bad for long-lived
* connections such as NFS mounts.
@@ -2289,12 +2232,15 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
* knowledge about the normal duration of connections,
* MAY override this as appropriate.
*/
- kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES,
- (char *)&addr_pref, sizeof(addr_pref));
+ if (xs_addr(xprt)->sa_family == PF_INET6) {
+ ip6_sock_set_addr_preferences(sk,
+ IPV6_PREFER_SRC_PUBLIC);
+ }
xs_tcp_set_socket_timeouts(xprt, sock);
+ tcp_sock_set_nodelay(sk);
- write_lock_bh(&sk->sk_callback_lock);
+ lock_sock(sk);
xs_save_old_callbacks(transport, sk);
@@ -2302,12 +2248,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_data_ready = xs_data_ready;
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
- sock_set_flag(sk, SOCK_FASYNC);
sk->sk_error_report = xs_error_report;
/* socket options */
sock_reset_flag(sk, SOCK_LINGER);
- tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
xprt_clear_connected(xprt);
@@ -2315,11 +2259,11 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
transport->sock = sock;
transport->inet = sk;
- write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
}
if (!xprt_bound(xprt))
- goto out;
+ return -ENOTCONN;
xs_set_memalloc(xprt);
@@ -2327,22 +2271,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
/* Tell the socket layer to start connecting... */
set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);
- ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
- switch (ret) {
- case 0:
- xs_set_srcport(transport, sock);
- /* fall through */
- case -EINPROGRESS:
- /* SYN_SENT! */
- if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
- xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
- break;
- case -EADDRNOTAVAIL:
- /* Source port number is unavailable. Try a new one! */
- transport->srcport = 0;
- }
-out:
- return ret;
+ return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
}
/**
@@ -2357,14 +2286,22 @@ static void xs_tcp_setup_socket(struct work_struct *work)
container_of(work, struct sock_xprt, connect_worker.work);
struct socket *sock = transport->sock;
struct rpc_xprt *xprt = &transport->xprt;
- int status = -EIO;
+ int status;
+ unsigned int pflags = current->flags;
- if (!sock) {
- sock = xs_create_sock(xprt, transport,
- xs_addr(xprt)->sa_family, SOCK_STREAM,
- IPPROTO_TCP, true);
+ if (atomic_read(&xprt->swapper))
+ current->flags |= PF_MEMALLOC;
+
+ if (xprt_connected(xprt))
+ goto out;
+ if (test_and_clear_bit(XPRT_SOCK_CONNECT_SENT,
+ &transport->sock_state) ||
+ !sock) {
+ xs_reset_transport(transport);
+ sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, true);
if (IS_ERR(sock)) {
- status = PTR_ERR(sock);
+ xprt_wake_pending_tasks(xprt, PTR_ERR(sock));
goto out;
}
}
@@ -2381,21 +2318,20 @@ static void xs_tcp_setup_socket(struct work_struct *work)
xprt, -status, xprt_connected(xprt),
sock->sk->sk_state);
switch (status) {
- default:
- printk("%s: connect returned unhandled error %d\n",
- __func__, status);
- /* fall through */
- case -EADDRNOTAVAIL:
- /* We're probably in TIME_WAIT. Get rid of existing socket,
- * and retry
- */
- xs_tcp_force_close(xprt);
- break;
case 0:
case -EINPROGRESS:
+ /* SYN_SENT! */
+ set_bit(XPRT_SOCK_CONNECT_SENT, &transport->sock_state);
+ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+ xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+ fallthrough;
case -EALREADY:
- xprt_unlock_connect(xprt, transport);
- return;
+ goto out_unlock;
+ case -EADDRNOTAVAIL:
+ /* Source port number is unavailable. Try a new one! */
+ transport->srcport = 0;
+ status = -EAGAIN;
+ break;
case -EINVAL:
/* Happens, for instance, if the user specified a link
* local IPv6 address without a scope-id.
@@ -2407,20 +2343,23 @@ static void xs_tcp_setup_socket(struct work_struct *work)
case -EHOSTUNREACH:
case -EADDRINUSE:
case -ENOBUFS:
- /*
- * xs_tcp_force_close() wakes tasks with -EIO.
- * We need to wake them first to ensure the
- * correct error code.
- */
- xprt_wake_pending_tasks(xprt, status);
- xs_tcp_force_close(xprt);
- goto out;
+ break;
+ default:
+ printk("%s: connect returned unhandled error %d\n",
+ __func__, status);
+ status = -EAGAIN;
}
- status = -EAGAIN;
+
+ /* xs_tcp_force_close() wakes tasks with a fixed error code.
+ * We need to wake them first to ensure the correct error code.
+ */
+ xprt_wake_pending_tasks(xprt, status);
+ xs_tcp_force_close(xprt);
out:
xprt_clear_connecting(xprt);
+out_unlock:
xprt_unlock_connect(xprt, transport);
- xprt_wake_pending_tasks(xprt, status);
+ current_restore_flags(pflags, PF_MEMALLOC);
}
/**
@@ -2446,11 +2385,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
if (transport->sock != NULL) {
dprintk("RPC: xs_connect delayed xprt %p for %lu "
- "seconds\n",
- xprt, xprt->reestablish_timeout / HZ);
-
- /* Start by resetting any existing state */
- xs_reset_transport(transport);
+ "seconds\n", xprt, xprt->reestablish_timeout / HZ);
delay = xprt_reconnect_delay(xprt);
xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO);
@@ -2511,7 +2446,7 @@ static void xs_error_handle(struct work_struct *work)
}
/**
- * xs_local_print_stats - display AF_LOCAL socket-specifc stats
+ * xs_local_print_stats - display AF_LOCAL socket-specific stats
* @xprt: rpc_xprt struct containing statistics
* @seq: output file
*
@@ -2540,7 +2475,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
}
/**
- * xs_udp_print_stats - display UDP socket-specifc stats
+ * xs_udp_print_stats - display UDP socket-specific stats
* @xprt: rpc_xprt struct containing statistics
* @seq: output file
*
@@ -2564,7 +2499,7 @@ static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
}
/**
- * xs_tcp_print_stats - display TCP socket-specifc stats
+ * xs_tcp_print_stats - display TCP socket-specific stats
* @xprt: rpc_xprt struct containing statistics
* @seq: output file
*
@@ -2612,7 +2547,7 @@ static int bc_malloc(struct rpc_task *task)
return -EINVAL;
}
- page = alloc_page(GFP_KERNEL);
+ page = alloc_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
if (!page)
return -ENOMEM;
@@ -2636,50 +2571,40 @@ static void bc_free(struct rpc_task *task)
free_page((unsigned long)buf);
}
-/*
- * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex
- * held. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request.
- */
static int bc_sendto(struct rpc_rqst *req)
{
- int len;
- struct xdr_buf *xbufp = &req->rq_snd_buf;
+ struct xdr_buf *xdr = &req->rq_snd_buf;
struct sock_xprt *transport =
container_of(req->rq_xprt, struct sock_xprt, xprt);
- unsigned long headoff;
- unsigned long tailoff;
- struct page *tailpage;
struct msghdr msg = {
- .msg_flags = MSG_MORE
+ .msg_flags = 0,
};
rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
- (u32)xbufp->len);
- struct kvec iov = {
- .iov_base = &marker,
- .iov_len = sizeof(marker),
- };
+ (u32)xdr->len);
+ unsigned int sent = 0;
+ int err;
req->rq_xtime = ktime_get();
-
- len = kernel_sendmsg(transport->sock, &msg, &iov, 1, iov.iov_len);
- if (len != iov.iov_len)
- return -EAGAIN;
-
- tailpage = NULL;
- if (xbufp->tail[0].iov_len)
- tailpage = virt_to_page(xbufp->tail[0].iov_base);
- tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK;
- headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK;
- len = svc_send_common(transport->sock, xbufp,
- virt_to_page(xbufp->head[0].iov_base), headoff,
- tailpage, tailoff);
- if (len != xbufp->len)
+ err = xdr_alloc_bvec(xdr, rpc_task_gfp_mask());
+ if (err < 0)
+ return err;
+ err = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, marker, &sent);
+ xdr_free_bvec(xdr);
+ if (err < 0 || sent != (xdr->len + sizeof(marker)))
return -EAGAIN;
- return len;
+ return sent;
}
-/*
- * The send routine. Borrows from svc_send
+/**
+ * bc_send_request - Send a backchannel Call on a TCP socket
+ * @req: rpc_rqst containing Call message to be sent
+ *
+ * xpt_mutex ensures @rqstp's whole message is written to the socket
+ * without interruption.
+ *
+ * Return values:
+ * %0 if the message was sent successfully
+ * %ENOTCONN if the message was not sent
*/
static int bc_send_request(struct rpc_rqst *req)
{
@@ -2714,6 +2639,7 @@ static int bc_send_request(struct rpc_rqst *req)
static void bc_close(struct rpc_xprt *xprt)
{
+ xprt_disconnect_done(xprt);
}
/*
@@ -2758,6 +2684,8 @@ static const struct rpc_xprt_ops xs_udp_ops = {
.rpcbind = rpcb_getport_async,
.set_port = xs_set_port,
.connect = xs_connect,
+ .get_srcaddr = xs_sock_srcaddr,
+ .get_srcport = xs_sock_srcport,
.buf_alloc = rpc_malloc,
.buf_free = rpc_free,
.send_request = xs_udp_send_request,
@@ -2780,6 +2708,8 @@ static const struct rpc_xprt_ops xs_tcp_ops = {
.rpcbind = rpcb_getport_async,
.set_port = xs_set_port,
.connect = xs_connect,
+ .get_srcaddr = xs_sock_srcaddr,
+ .get_srcport = xs_sock_srcport,
.buf_alloc = rpc_malloc,
.buf_free = rpc_free,
.prepare_request = xs_stream_prepare_request,
@@ -2914,6 +2844,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
transport = container_of(xprt, struct sock_xprt, xprt);
xprt->prot = 0;
+ xprt->xprt_class = &xs_local_transport;
xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
xprt->bind_timeout = XS_BIND_TO;
@@ -2937,9 +2868,6 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
}
xprt_set_bound(xprt);
xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL);
- ret = ERR_PTR(xs_local_setup_socket(transport));
- if (ret)
- goto out_err;
break;
default:
ret = ERR_PTR(-EAFNOSUPPORT);
@@ -2983,6 +2911,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
transport = container_of(xprt, struct sock_xprt, xprt);
xprt->prot = IPPROTO_UDP;
+ xprt->xprt_class = &xs_udp_transport;
/* XXX: header size can vary due to auth type, IPv6, etc. */
xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
@@ -3063,6 +2992,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
transport = container_of(xprt, struct sock_xprt, xprt);
xprt->prot = IPPROTO_TCP;
+ xprt->xprt_class = &xs_tcp_transport;
xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
xprt->bind_timeout = XS_BIND_TO;
@@ -3136,6 +3066,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
transport = container_of(xprt, struct sock_xprt, xprt);
xprt->prot = IPPROTO_TCP;
+ xprt->xprt_class = &xs_bc_tcp_transport;
xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
xprt->timeout = &xs_tcp_default_timeout;
@@ -3203,6 +3134,7 @@ static struct xprt_class xs_local_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_LOCAL,
.setup = xs_setup_local,
+ .netid = { "" },
};
static struct xprt_class xs_udp_transport = {
@@ -3211,6 +3143,7 @@ static struct xprt_class xs_udp_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_UDP,
.setup = xs_setup_udp,
+ .netid = { "udp", "udp6", "" },
};
static struct xprt_class xs_tcp_transport = {
@@ -3219,6 +3152,7 @@ static struct xprt_class xs_tcp_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_TCP,
.setup = xs_setup_tcp,
+ .netid = { "tcp", "tcp6", "" },
};
static struct xprt_class xs_bc_tcp_transport = {
@@ -3227,6 +3161,7 @@ static struct xprt_class xs_bc_tcp_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_BC_TCP,
.setup = xs_setup_bc_tcp,
+ .netid = { "" },
};
/**
@@ -3263,24 +3198,6 @@ void cleanup_socket_xprt(void)
xprt_unregister_transport(&xs_bc_tcp_transport);
}
-static int param_set_uint_minmax(const char *val,
- const struct kernel_param *kp,
- unsigned int min, unsigned int max)
-{
- unsigned int num;
- int ret;
-
- if (!val)
- return -EINVAL;
- ret = kstrtouint(val, 0, &num);
- if (ret)
- return ret;
- if (num < min || num > max)
- return -EINVAL;
- *((unsigned int *)kp->arg) = num;
- return 0;
-}
-
static int param_set_portnr(const char *val, const struct kernel_param *kp)
{
return param_set_uint_minmax(val, kp,