From cc886c9ff1607eda04062bdcec963e2f8e6a3eb1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:49:12 -0500 Subject: svcrdma: Improve allocation of struct svc_rdma_op_ctxt When the maximum payload size of NFS READ and WRITE was increased by commit cc9a903d915c ("svcrdma: Change maximum server payload back to RPCSVC_MAXPAYLOAD"), the size of struct svc_rdma_op_ctxt increased to over 6KB (on x86_64). That makes allocating one of these from a kmem_cache more likely to fail in situations when system memory is exhausted. Since I'm about to add a caller where this allocation must always work _and_ it cannot sleep, pre-allocate ctxts for each connection. Another motivation for this change is that NFSv4.x servers are required by specification not to drop NFS requests. Pre-allocating memory resources reduces the likelihood of a drop. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f869807a0d0e..be2804b72cd8 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -69,6 +69,7 @@ extern atomic_t rdma_stat_sq_prod; * completes. */ struct svc_rdma_op_ctxt { + struct list_head free; struct svc_rdma_op_ctxt *read_hdr; struct svc_rdma_fastreg_mr *frmr; int hdr_count; @@ -141,7 +142,10 @@ struct svcxprt_rdma { struct ib_pd *sc_pd; atomic_t sc_dma_used; - atomic_t sc_ctxt_used; + spinlock_t sc_ctxt_lock; + struct list_head sc_ctxts; + int sc_ctxt_used; + struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; struct ib_qp *sc_qp; -- cgit v1.2.3-59-g8ed1b From 2fe81b239dbb00d0a2fd8858ac9dd4ef4a8841ee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:49:20 -0500 Subject: svcrdma: Improve allocation of struct svc_rdma_req_map To ensure this allocation cannot fail and will not sleep, pre-allocate the req_map structures per-connection. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 8 ++- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 6 +-- net/sunrpc/xprtrdma/svc_rdma_transport.c | 85 ++++++++++++++++++++++++++++---- 3 files changed, 84 insertions(+), 15 deletions(-) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index be2804b72cd8..05bf4febad44 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -113,6 +113,7 @@ struct svc_rdma_fastreg_mr { struct list_head frmr_list; }; struct svc_rdma_req_map { + struct list_head free; unsigned long count; union { struct kvec sge[RPCSVC_MAXPAGES]; @@ -145,6 +146,8 @@ struct svcxprt_rdma { spinlock_t sc_ctxt_lock; struct list_head sc_ctxts; int sc_ctxt_used; + spinlock_t sc_map_lock; + struct list_head sc_maps; struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; @@ -223,8 +226,9 @@ extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); -extern struct svc_rdma_req_map *svc_rdma_get_req_map(void); -extern void svc_rdma_put_req_map(struct svc_rdma_req_map *); +extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *); +extern void svc_rdma_put_req_map(struct svcxprt_rdma *, + struct svc_rdma_req_map *); extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); extern void svc_rdma_put_frmr(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 969a1ab75fc3..9a097f95e10b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -591,7 +591,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /* Build an req vec for the XDR */ ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; - vec = svc_rdma_get_req_map(); + vec = svc_rdma_get_req_map(rdma); ret = map_xdr(rdma, &rqstp->rq_res, vec); if (ret) goto err0; @@ -630,14 +630,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, inline_bytes); - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; err1: put_page(res_page); err0: - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); svc_rdma_put_context(ctxt, 0); return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 9801115f6e59..0b9e17ea777e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -273,23 +273,83 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) } } -/* - * Temporary NFS req mappings are shared across all transport - * instances. These are short lived and should be bounded by the number - * of concurrent server threads * depth of the SQ. - */ -struct svc_rdma_req_map *svc_rdma_get_req_map(void) +static struct svc_rdma_req_map *alloc_req_map(gfp_t flags) { struct svc_rdma_req_map *map; - map = kmem_cache_alloc(svc_rdma_map_cachep, - GFP_KERNEL | __GFP_NOFAIL); + + map = kmalloc(sizeof(*map), flags); + if (map) + INIT_LIST_HEAD(&map->free); + return map; +} + +static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt) +{ + int i; + + /* One for each receive buffer on this connection. */ + i = xprt->sc_max_requests; + + while (i--) { + struct svc_rdma_req_map *map; + + map = alloc_req_map(GFP_KERNEL); + if (!map) { + dprintk("svcrdma: No memory for request map\n"); + return false; + } + list_add(&map->free, &xprt->sc_maps); + } + return true; +} + +struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_req_map *map = NULL; + + spin_lock(&xprt->sc_map_lock); + if (list_empty(&xprt->sc_maps)) + goto out_empty; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del_init(&map->free); + spin_unlock(&xprt->sc_map_lock); + +out: map->count = 0; return map; + +out_empty: + spin_unlock(&xprt->sc_map_lock); + + /* Pre-allocation amount was incorrect */ + map = alloc_req_map(GFP_NOIO); + if (map) + goto out; + + WARN_ONCE(1, "svcrdma: empty request map list?\n"); + return NULL; } -void svc_rdma_put_req_map(struct svc_rdma_req_map *map) +void svc_rdma_put_req_map(struct svcxprt_rdma *xprt, + struct svc_rdma_req_map *map) { - kmem_cache_free(svc_rdma_map_cachep, map); + spin_lock(&xprt->sc_map_lock); + list_add(&map->free, &xprt->sc_maps); + spin_unlock(&xprt->sc_map_lock); +} + +static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt) +{ + while (!list_empty(&xprt->sc_maps)) { + struct svc_rdma_req_map *map; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del(&map->free); + kfree(map); + } } /* ib_cq event handler */ @@ -593,12 +653,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_maps); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_frmr_q_lock); spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_map_lock); if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); @@ -988,6 +1050,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (!svc_rdma_prealloc_ctxts(newxprt)) goto errout; + if (!svc_rdma_prealloc_maps(newxprt)) + goto errout; /* * Limit ORD based on client limit, local device limit, and @@ -1259,6 +1323,7 @@ static void __svc_rdma_free(struct work_struct *work) rdma_dealloc_frmr_q(rdma); svc_rdma_destroy_ctxts(rdma); + svc_rdma_destroy_maps(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) -- cgit v1.2.3-59-g8ed1b From 71810ef3271d1a06f7002c55c7e354d8c3233762 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:49:28 -0500 Subject: svcrdma: Remove unused req_map and ctxt kmem_caches Clean up. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma.c | 35 ----------------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 7 ------- 3 files changed, 1 insertion(+), 42 deletions(-) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 05bf4febad44..141edbbb73b3 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -242,6 +242,7 @@ extern struct svc_xprt_class svc_rdma_bc_class; #endif /* svc_rdma.c */ +extern struct workqueue_struct *svc_rdma_wq; extern int svc_rdma_init(void); extern void svc_rdma_cleanup(void); diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 1b7051bdbdc8..e894e0698c45 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -71,10 +71,6 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; -/* Temporary NFS request map and context caches */ -struct kmem_cache *svc_rdma_map_cachep; -struct kmem_cache *svc_rdma_ctxt_cachep; - struct workqueue_struct *svc_rdma_wq; /* @@ -243,8 +239,6 @@ void svc_rdma_cleanup(void) svc_unreg_xprt_class(&svc_rdma_bc_class); #endif svc_unreg_xprt_class(&svc_rdma_class); - kmem_cache_destroy(svc_rdma_map_cachep); - kmem_cache_destroy(svc_rdma_ctxt_cachep); } int svc_rdma_init(void) @@ -264,39 +258,10 @@ int svc_rdma_init(void) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); - /* Create the temporary map cache */ - svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache", - sizeof(struct svc_rdma_req_map), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_map_cachep) { - printk(KERN_INFO "Could not allocate map cache.\n"); - goto err0; - } - - /* Create the temporary context cache */ - svc_rdma_ctxt_cachep = - kmem_cache_create("svc_rdma_ctxt_cache", - sizeof(struct svc_rdma_op_ctxt), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_ctxt_cachep) { - printk(KERN_INFO "Could not allocate WR ctxt cache.\n"); - goto err1; - } - /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); #if defined(CONFIG_SUNRPC_BACKCHANNEL) svc_reg_xprt_class(&svc_rdma_bc_class); #endif return 0; - err1: - kmem_cache_destroy(svc_rdma_map_cachep); - err0: - unregister_sysctl_table(svcrdma_table_header); - destroy_workqueue(svc_rdma_wq); - return -ENOMEM; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 419719198caf..72276c7907e4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -528,11 +528,4 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *); void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* Temporary NFS request map cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_map_cachep; -/* WR context cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_ctxt_cachep; -/* Workqueue created in svc_rdma.c */ -extern struct workqueue_struct *svc_rdma_wq; - #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ -- cgit v1.2.3-59-g8ed1b From 39b09a1a121cb22820c374f4e92f7ca34be1b75d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:49:37 -0500 Subject: svcrdma: Add gfp flags to svc_rdma_post_recv() svc_rdma_post_recv() allocates pages for receive buffers on-demand. It uses GFP_KERNEL so the allocator tries hard, and may sleep. But I'm about to add a call to svc_rdma_post_recv() from a function that may not sleep. Since all svc_rdma_post_recv() call sites can tolerate its failure, allow it to fail if the page allocator returns nothing. Longer term, receive buffers, being a finite resource per-connection, should be pre-allocated and re-used. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 2 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_transport.c | 8 +++++--- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 141edbbb73b3..729ff356c18a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -221,7 +221,7 @@ extern struct rpcrdma_read_chunk * extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode); -extern int svc_rdma_post_recv(struct svcxprt_rdma *); +extern int svc_rdma_post_recv(struct svcxprt_rdma *, gfp_t); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 9a097f95e10b..aeaec7aa34df 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -465,7 +465,7 @@ static int send_reply(struct svcxprt_rdma *rdma, int ret; /* Post a recv buffer to handle another request. */ - ret = svc_rdma_post_recv(rdma); + ret = svc_rdma_post_recv(rdma, GFP_KERNEL); if (ret) { printk(KERN_INFO "svcrdma: could not post a receive buffer, err=%d." diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 0b9e17ea777e..20fc095d6b12 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -668,7 +668,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return cma_xprt; } -int svc_rdma_post_recv(struct svcxprt_rdma *xprt) +int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) { struct ib_recv_wr recv_wr, *bad_recv_wr; struct svc_rdma_op_ctxt *ctxt; @@ -686,7 +686,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err_put_ctxt; } - page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + page = alloc_page(flags); + if (!page) + goto err_put_ctxt; ctxt->pages[sge_no] = page; pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, @@ -1182,7 +1184,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Post receive buffers */ for (i = 0; i < newxprt->sc_max_requests; i++) { - ret = svc_rdma_post_recv(newxprt); + ret = svc_rdma_post_recv(newxprt, GFP_KERNEL); if (ret) { dprintk("svcrdma: failure posting receive buffers\n"); goto errout; -- cgit v1.2.3-59-g8ed1b From ba986c96f907a513215fb7f1c0a89261c97251ca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:49:53 -0500 Subject: svcrdma: Make map_xdr non-static Pre-requisite to use map_xdr in the backchannel code. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 2 ++ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 729ff356c18a..aeffa30655ce 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -213,6 +213,8 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, u32, u32, u64, bool); /* svc_rdma_sendto.c */ +extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, + struct svc_rdma_req_map *); extern int svc_rdma_sendto(struct svc_rqst *); extern struct rpcrdma_read_chunk * svc_rdma_get_read_chunk(struct rpcrdma_msg *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index de7df7b4d855..3c250523f7cc 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -50,9 +50,9 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -static int map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) +int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { int sge_no; u32 sge_bytes; @@ -62,7 +62,7 @@ static int map_xdr(struct svcxprt_rdma *xprt, if (xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { - pr_err("svcrdma: map_xdr: XDR buffer length error\n"); + pr_err("svcrdma: %s: XDR buffer length error\n", __func__); return -EIO; } @@ -97,9 +97,9 @@ static int map_xdr(struct svcxprt_rdma *xprt, sge_no++; } - dprintk("svcrdma: map_xdr: sge_no %d page_no %d " + dprintk("svcrdma: %s: sge_no %d page_no %d " "page_base %u page_len %u head_len %zu tail_len %zu\n", - sge_no, page_no, xdr->page_base, xdr->page_len, + __func__, sge_no, page_no, xdr->page_base, xdr->page_len, xdr->head[0].iov_len, xdr->tail[0].iov_len); vec->count = sge_no; @@ -592,7 +592,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; vec = svc_rdma_get_req_map(rdma); - ret = map_xdr(rdma, &rqstp->rq_res, vec); + ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec); if (ret) goto err0; inline_bytes = rqstp->rq_res.len; -- cgit v1.2.3-59-g8ed1b From 03fe9931536fe4782e9e34f7f499d588acd2015b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:50:02 -0500 Subject: svcrdma: Define maximum number of backchannel requests Extra resources for handling backchannel requests have to be pre-allocated when a transport instance is created. Set up additional fields in svcxprt_rdma to track these resources. The max_requests fields are elements of the RPC-over-RDMA protocol, so they should be u32. To ensure that unsigned arithmetic is used everywhere, some other fields in the svcxprt_rdma struct are updated. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 13 ++++++++++--- net/sunrpc/xprtrdma/svc_rdma.c | 6 ++++-- net/sunrpc/xprtrdma/svc_rdma_transport.c | 24 ++++++++++++++---------- 3 files changed, 28 insertions(+), 15 deletions(-) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index aeffa30655ce..9a2c418dc690 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -51,6 +51,7 @@ /* RPC/RDMA parameters and stats */ extern unsigned int svcrdma_ord; extern unsigned int svcrdma_max_requests; +extern unsigned int svcrdma_max_bc_requests; extern unsigned int svcrdma_max_req_size; extern atomic_t rdma_stat_recv; @@ -134,10 +135,11 @@ struct svcxprt_rdma { int sc_max_sge; int sc_max_sge_rd; /* max sge for read target */ - int sc_sq_depth; /* Depth of SQ */ atomic_t sc_sq_count; /* Number of SQ WR on queue */ - - int sc_max_requests; /* Depth of RQ */ + unsigned int sc_sq_depth; /* Depth of SQ */ + unsigned int sc_rq_depth; /* Depth of RQ */ + u32 sc_max_requests; /* Forward credits */ + u32 sc_max_bc_requests;/* Backward credits */ int sc_max_req_size; /* Size of each RQ WR buf */ struct ib_pd *sc_pd; @@ -186,6 +188,11 @@ struct svcxprt_rdma { #define RPCRDMA_MAX_REQUESTS 32 #define RPCRDMA_MAX_REQ_SIZE 4096 +/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our + * current NFSv4.1 implementation supports one backchannel slot. + */ +#define RPCRDMA_MAX_BC_REQUESTS 2 + #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD /* svc_rdma_marshal.c */ diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index e894e0698c45..c846ca9f1eba 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -55,6 +55,7 @@ unsigned int svcrdma_ord = RPCRDMA_ORD; static unsigned int min_ord = 1; static unsigned int max_ord = 4096; unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; static unsigned int min_max_requests = 4; static unsigned int max_max_requests = 16384; unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; @@ -245,9 +246,10 @@ int svc_rdma_init(void) { dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); - dprintk("\tmax_requests : %d\n", svcrdma_max_requests); - dprintk("\tsq_depth : %d\n", + dprintk("\tmax_requests : %u\n", svcrdma_max_requests); + dprintk("\tsq_depth : %u\n", svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); + dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 8b3ee04f5bb4..af86dfeceb4a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -169,12 +169,12 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) { - int i; + unsigned int i; /* Each RPC/RDMA credit can consume a number of send * and receive WQEs. One ctxt is allocated for each. */ - i = xprt->sc_sq_depth + xprt->sc_max_requests; + i = xprt->sc_sq_depth + xprt->sc_rq_depth; while (i--) { struct svc_rdma_op_ctxt *ctxt; @@ -285,7 +285,7 @@ static struct svc_rdma_req_map *alloc_req_map(gfp_t flags) static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt) { - int i; + unsigned int i; /* One for each receive buffer on this connection. */ i = xprt->sc_max_requests; @@ -1016,8 +1016,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct ib_device *dev; int uninitialized_var(dma_mr_acc); int need_dma_mr = 0; + unsigned int i; int ret = 0; - int i; listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); clear_bit(XPT_CONN, &xprt->xpt_flags); @@ -1046,9 +1046,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd, RPCSVC_MAXPAGES); newxprt->sc_max_req_size = svcrdma_max_req_size; - newxprt->sc_max_requests = min((size_t)dev->attrs.max_qp_wr, - (size_t)svcrdma_max_requests); - newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; + newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_requests); + newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_bc_requests); + newxprt->sc_rq_depth = newxprt->sc_max_requests + + newxprt->sc_max_bc_requests; + newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; if (!svc_rdma_prealloc_ctxts(newxprt)) goto errout; @@ -1077,7 +1081,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } - cq_attr.cqe = newxprt->sc_max_requests; + cq_attr.cqe = newxprt->sc_rq_depth; newxprt->sc_rq_cq = ib_create_cq(dev, rq_comp_handler, cq_event_handler, @@ -1092,7 +1096,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; - qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; + qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -1183,7 +1187,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_dma_lkey = dev->local_dma_lkey; /* Post receive buffers */ - for (i = 0; i < newxprt->sc_max_requests; i++) { + for (i = 0; i < newxprt->sc_rq_depth; i++) { ret = svc_rdma_post_recv(newxprt, GFP_KERNEL); if (ret) { dprintk("svcrdma: failure posting receive buffers\n"); -- cgit v1.2.3-59-g8ed1b From 5d252f90a800cee5bc57c76d636ae60464f7a887 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:50:10 -0500 Subject: svcrdma: Add class for RDMA backwards direction transport To support the server-side of an NFSv4.1 backchannel on RDMA connections, add a transport class that enables backward direction messages on an existing forward channel connection. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 5 + net/sunrpc/xprt.c | 1 + net/sunrpc/xprtrdma/Makefile | 2 +- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 371 +++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 52 ++++ net/sunrpc/xprtrdma/svc_rdma_transport.c | 14 +- net/sunrpc/xprtrdma/transport.c | 30 ++- net/sunrpc/xprtrdma/xprt_rdma.h | 15 ++ 8 files changed, 475 insertions(+), 15 deletions(-) create mode 100644 net/sunrpc/xprtrdma/svc_rdma_backchannel.c (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 9a2c418dc690..b13513a0caf4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -195,6 +195,11 @@ struct svcxprt_rdma { #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +/* svc_rdma_backchannel.c */ +extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, + struct rpcrdma_msg *rmsgp, + struct xdr_buf *rcvbuf); + /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2e98f4a243e5..37edea6fa92d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1425,3 +1425,4 @@ void xprt_put(struct rpc_xprt *xprt) if (atomic_dec_and_test(&xprt->count)) xprt_destroy(xprt); } +EXPORT_SYMBOL_GPL(xprt_put); diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 33f99d3004f2..dc9f3b513a05 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o physical_ops.o \ - svc_rdma.o svc_rdma_transport.o \ + svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c new file mode 100644 index 000000000000..deff06a82914 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA (server-side). + */ + +#include +#include "xprt_rdma.h" + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +#undef SVCRDMA_BACKCHANNEL_DEBUG + +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, + struct xdr_buf *rcvbuf) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct kvec *dst, *src = &rcvbuf->head[0]; + struct rpc_rqst *req; + unsigned long cwnd; + u32 credits; + size_t len; + __be32 xid; + __be32 *p; + int ret; + + p = (__be32 *)src->iov_base; + len = src->iov_len; + xid = rmsgp->rm_xid; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: xid=%08x, length=%zu\n", + __func__, be32_to_cpu(xid), len); + pr_info("%s: RPC/RDMA: %*ph\n", + __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp); + pr_info("%s: RPC: %*ph\n", + __func__, (int)len, p); +#endif + + ret = -EAGAIN; + if (src->iov_len < 24) + goto out_shortreply; + + spin_lock_bh(&xprt->transport_lock); + req = xprt_lookup_rqst(xprt, xid); + if (!req) + goto out_notfound; + + dst = &req->rq_private_buf.head[0]; + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); + if (dst->iov_len < len) + goto out_unlock; + memcpy(dst->iov_base, p, len); + + credits = be32_to_cpu(rmsgp->rm_credit); + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > r_xprt->rx_buf.rb_bc_max_requests) + credits = r_xprt->rx_buf.rb_bc_max_requests; + + cwnd = xprt->cwnd; + xprt->cwnd = credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(req->rq_task); + + ret = 0; + xprt_complete_rqst(req->rq_task, rcvbuf->len); + rcvbuf->len = 0; + +out_unlock: + spin_unlock_bh(&xprt->transport_lock); +out: + return ret; + +out_shortreply: + dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", + xprt, src->iov_len); + goto out; + +out_notfound: + dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", + xprt, be32_to_cpu(xid)); + + goto out_unlock; +} + +/* Send a backwards direction RPC call. + * + * Caller holds the connection's mutex and has already marshaled + * the RPC/RDMA request. + * + * This is similar to svc_rdma_reply, but takes an rpc_rqst + * instead, does not support chunks, and avoids blocking memory + * allocation. + * + * XXX: There is still an opportunity to block in svc_rdma_send() + * if there are no SQ entries to post the Send. This may occur if + * the adapter has a small maximum SQ depth. + */ +static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, + struct rpc_rqst *rqst) +{ + struct xdr_buf *sndbuf = &rqst->rq_snd_buf; + struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_req_map *vec; + struct ib_send_wr send_wr; + int ret; + + vec = svc_rdma_get_req_map(rdma); + ret = svc_rdma_map_xdr(rdma, sndbuf, vec); + if (ret) + goto out_err; + + /* Post a recv buffer to handle the reply for this request. */ + ret = svc_rdma_post_recv(rdma, GFP_NOIO); + if (ret) { + pr_err("svcrdma: Failed to post bc receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + ret = -ENOTCONN; + goto out_err; + } + + ctxt = svc_rdma_get_context(rdma); + ctxt->pages[0] = virt_to_page(rqst->rq_buffer); + ctxt->count = 1; + + ctxt->wr_op = IB_WR_SEND; + ctxt->direction = DMA_TO_DEVICE; + ctxt->sge[0].lkey = rdma->sc_dma_lkey; + ctxt->sge[0].length = sndbuf->len; + ctxt->sge[0].addr = + ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0, + sndbuf->len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) { + ret = -EIO; + goto out_unmap; + } + atomic_inc(&rdma->sc_dma_used); + + memset(&send_wr, 0, sizeof(send_wr)); + send_wr.wr_id = (unsigned long)ctxt; + send_wr.sg_list = ctxt->sge; + send_wr.num_sge = 1; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = svc_rdma_send(rdma, &send_wr); + if (ret) { + ret = -EIO; + goto out_unmap; + } + +out_err: + svc_rdma_put_req_map(rdma, vec); + dprintk("svcrdma: %s returns %d\n", __func__, ret); + return ret; + +out_unmap: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + goto out_err; +} + +/* Server-side transport endpoint wants a whole page for its send + * buffer. The client RPC code constructs the RPC header in this + * buffer before it invokes ->send_request. + * + * Returns NULL if there was a temporary allocation failure. + */ +static void * +xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + struct page *page; + + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + + /* Prevent an infinite loop: try to make this case work */ + if (size > PAGE_SIZE) + WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", + size); + + page = alloc_page(RPCRDMA_DEF_GFP); + if (!page) + return NULL; + + return page_address(page); +} + +static void +xprt_rdma_bc_free(void *buffer) +{ + /* No-op: ctxt and page have already been freed. */ +} + +static int +rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; + int rc; + + /* Space in the send buffer for an RPC/RDMA header is reserved + * via xprt->tsh_size. + */ + headerp->rm_xid = rqst->rq_xid; + headerp->rm_vers = rpcrdma_version; + headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + headerp->rm_type = rdma_msg; + headerp->rm_body.rm_chunks[0] = xdr_zero; + headerp->rm_body.rm_chunks[1] = xdr_zero; + headerp->rm_body.rm_chunks[2] = xdr_zero; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); +#endif + + rc = svc_rdma_bc_sendto(rdma, rqst); + if (rc) + goto drop_connection; + return rc; + +drop_connection: + dprintk("svcrdma: failed to send bc call\n"); + xprt_disconnect_done(xprt); + return -ENOTCONN; +} + +/* Send an RPC call on the passive end of a transport + * connection. + */ +static int +xprt_rdma_bc_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + int ret; + + dprintk("svcrdma: sending bc call with xid: %08x\n", + be32_to_cpu(rqst->rq_xid)); + + if (!mutex_trylock(&sxprt->xpt_mutex)) { + rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); + if (!mutex_trylock(&sxprt->xpt_mutex)) + return -EAGAIN; + rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); + } + + ret = -ENOTCONN; + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) + ret = rpcrdma_bc_send_request(rdma, rqst); + + mutex_unlock(&sxprt->xpt_mutex); + + if (ret < 0) + return ret; + return 0; +} + +static void +xprt_rdma_bc_close(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); +} + +static void +xprt_rdma_bc_put(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + + xprt_free(xprt); + module_put(THIS_MODULE); +} + +static struct rpc_xprt_ops xprt_rdma_bc_procs = { + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, + .release_request = xprt_release_rqst_cong, + .buf_alloc = xprt_rdma_bc_allocate, + .buf_free = xprt_rdma_bc_free, + .send_request = xprt_rdma_bc_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xprt_rdma_bc_close, + .destroy = xprt_rdma_bc_put, + .print_stats = xprt_rdma_print_stats +}; + +static const struct rpc_timeout xprt_rdma_bc_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/* It shouldn't matter if the number of backchannel session slots + * doesn't match the number of RPC/RDMA credits. That just means + * one or the other will have extra slots that aren't used. + */ +static struct rpc_xprt * +xprt_setup_rdma_bc(struct xprt_create *args) +{ + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(*new_xprt), + RPCRDMA_MAX_BC_REQUESTS, + RPCRDMA_MAX_BC_REQUESTS); + if (!xprt) { + dprintk("RPC: %s: couldn't allocate rpc_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + xprt->timeout = &xprt_rdma_bc_timeout; + xprt_set_bound(xprt); + xprt_set_connected(xprt); + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + + xprt->prot = XPRT_TRANSPORT_BC_RDMA; + xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32); + xprt->ops = &xprt_rdma_bc_procs; + + memcpy(&xprt->addr, args->dstaddr, args->addrlen); + xprt->addrlen = args->addrlen; + xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr); + xprt->resvport = 0; + + xprt->max_payload = xprt_rdma_max_inline_read; + + new_xprt = rpcx_to_rdmax(xprt); + new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs; + + xprt_get(xprt); + args->bc_xprt->xpt_bc_xprt = xprt; + xprt->bc_xprt = args->bc_xprt; + + if (!try_module_get(THIS_MODULE)) + goto out_fail; + + /* Final put for backchannel xprt is in __svc_rdma_free */ + xprt_get(xprt); + return xprt; + +out_fail: + xprt_rdma_free_addresses(xprt); + args->bc_xprt->xpt_bc_xprt = NULL; + xprt_put(xprt); + xprt_free(xprt); + return ERR_PTR(-EINVAL); +} + +struct xprt_class xprt_rdma_bc = { + .list = LIST_HEAD_INIT(xprt_rdma_bc.list), + .name = "rdma backchannel", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_BC_RDMA, + .setup = xprt_setup_rdma_bc, +}; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ff4f01e527ec..3dfe4642ec92 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -567,6 +567,38 @@ static int rdma_read_complete(struct svc_rqst *rqstp, return ret; } +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ +static bool +svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp) +{ + __be32 *p = (__be32 *)rmsgp; + + if (!xprt->xpt_bc_xprt) + return false; + + if (rmsgp->rm_type != rdma_msg) + return false; + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[1] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[2] != xdr_zero) + return false; + + /* sanity */ + if (p[7] != rmsgp->rm_xid) + return false; + /* call direction */ + if (p[8] == cpu_to_be32(RPC_CALL)) + return false; + + return true; +} + /* * Set up the rqstp thread context to point to the RQ buffer. If * necessary, pull additional data from the client with an RDMA_READ @@ -632,6 +664,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto close_out; } + if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, + &rqstp->rq_arg); + svc_rdma_put_context(ctxt, 0); + if (ret) + goto repost; + return ret; + } + /* Read read-list data. */ ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); if (ret > 0) { @@ -668,4 +709,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) set_bit(XPT_CLOSE, &xprt->xpt_flags); defer: return 0; + +repost: + ret = svc_rdma_post_recv(rdma_xprt, GFP_KERNEL); + if (ret) { + pr_err("svcrdma: could not post a receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma_xprt); + set_bit(XPT_CLOSE, &rdma_xprt->sc_xprt.xpt_flags); + ret = -ENOTCONN; + } + return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index af86dfeceb4a..7fd23955f1d4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1287,12 +1287,14 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + struct svc_xprt *xprt = &rdma->sc_xprt; + + dprintk("svcrdma: %s(%p)\n", __func__, rdma); /* We should only be called from kref_put */ - if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0) + if (atomic_read(&xprt->xpt_ref.refcount) != 0) pr_err("svcrdma: sc_xprt still in use? (%d)\n", - atomic_read(&rdma->sc_xprt.xpt_ref.refcount)); + atomic_read(&xprt->xpt_ref.refcount)); /* * Destroy queued, but not processed read completions. Note @@ -1327,6 +1329,12 @@ static void __svc_rdma_free(struct work_struct *work) pr_err("svcrdma: dma still in use? (%d)\n", atomic_read(&rdma->sc_dma_used)); + /* Final put of backchannel client transport */ + if (xprt->xpt_bc_xprt) { + xprt_put(xprt->xpt_bc_xprt); + xprt->xpt_bc_xprt = NULL; + } + rdma_dealloc_frmr_q(rdma); svc_rdma_destroy_ctxts(rdma); svc_rdma_destroy_maps(rdma); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8c545f7d7525..5c7d235672fa 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -63,7 +63,7 @@ */ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; -static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; +unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; @@ -143,12 +143,7 @@ static struct ctl_table sunrpc_table[] = { #endif -#define RPCRDMA_BIND_TO (60U * HZ) -#define RPCRDMA_INIT_REEST_TO (5U * HZ) -#define RPCRDMA_MAX_REEST_TO (30U * HZ) -#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) - -static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ +static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ static void xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) @@ -174,7 +169,7 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; } -static void +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) { char buf[128]; @@ -203,7 +198,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } -static void +void xprt_rdma_free_addresses(struct rpc_xprt *xprt) { unsigned int i; @@ -499,7 +494,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) if (req == NULL) return NULL; - flags = GFP_NOIO | __GFP_NOWARN; + flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; @@ -639,7 +634,7 @@ drop_connection: return -ENOTCONN; /* implies disconnect */ } -static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); long idle_time = 0; @@ -740,6 +735,11 @@ void xprt_rdma_cleanup(void) rpcrdma_destroy_wq(); frwr_destroy_recovery_wq(); + + rc = xprt_unregister_transport(&xprt_rdma_bc); + if (rc) + dprintk("RPC: %s: xprt_unregister(bc) returned %i\n", + __func__, rc); } int xprt_rdma_init(void) @@ -763,6 +763,14 @@ int xprt_rdma_init(void) return rc; } + rc = xprt_register_transport(&xprt_rdma_bc); + if (rc) { + xprt_unregister_transport(&xprt_rdma); + rpcrdma_destroy_wq(); + frwr_destroy_recovery_wq(); + return rc; + } + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); dprintk("Defaults:\n"); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 72276c7907e4..5a38236d0516 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -55,6 +55,11 @@ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ +#define RPCRDMA_BIND_TO (60U * HZ) +#define RPCRDMA_INIT_REEST_TO (5U * HZ) +#define RPCRDMA_MAX_REEST_TO (30U * HZ) +#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) + /* * Interface Adapter -- one per transport instance */ @@ -147,6 +152,8 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } +#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -308,6 +315,8 @@ struct rpcrdma_buffer { u32 rb_bc_srv_max_requests; spinlock_t rb_reqslock; /* protect rb_allreqs */ struct list_head rb_allreqs; + + u32 rb_bc_max_requests; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -513,6 +522,10 @@ int rpcrdma_marshal_req(struct rpc_rqst *); /* RPC/RDMA module init - xprtrdma/transport.c */ +extern unsigned int xprt_rdma_max_inline_read; +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); +void xprt_rdma_free_addresses(struct rpc_xprt *xprt); +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); @@ -528,4 +541,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *); void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ +extern struct xprt_class xprt_rdma_bc; + #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ -- cgit v1.2.3-59-g8ed1b From 5fe1043da84887369d32459514f2c7d98ff37936 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jan 2016 23:53:41 -0800 Subject: svc_rdma: use local_dma_lkey We now alwasy have a per-PD local_dma_lkey available. Make use of that fact in svc_rdma and stop registering our own MR. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Jason Gunthorpe Reviewed-by: Chuck Lever Reviewed-by: Steve Wise Acked-by: J. Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 2 -- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 4 ++-- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 6 ++--- net/sunrpc/xprtrdma/svc_rdma_transport.c | 36 ++++-------------------------- 5 files changed, 10 insertions(+), 40 deletions(-) (limited to 'include/linux/sunrpc') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index b13513a0caf4..5322fea6fe4c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -156,13 +156,11 @@ struct svcxprt_rdma { struct ib_qp *sc_qp; struct ib_cq *sc_rq_cq; struct ib_cq *sc_sq_cq; - struct ib_mr *sc_phys_mr; /* MR for server memory */ int (*sc_reader)(struct svcxprt_rdma *, struct svc_rqst *, struct svc_rdma_op_ctxt *, int *, u32 *, u32, u32, u64, bool); u32 sc_dev_caps; /* distilled device caps */ - u32 sc_dma_lkey; /* local dma key */ unsigned int sc_frmr_pg_list_len; struct list_head sc_frmr_q; spinlock_t sc_frmr_q_lock; diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index deff06a82914..65a7c232a345 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -128,7 +128,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, ctxt->wr_op = IB_WR_SEND; ctxt->direction = DMA_TO_DEVICE; - ctxt->sge[0].lkey = rdma->sc_dma_lkey; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[0].length = sndbuf->len; ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0, diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 3dfe4642ec92..c8b8a8b4181e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -144,6 +144,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; head->arg.page_len += len; + head->arg.len += len; if (!pg_off) head->count++; @@ -160,8 +161,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, goto err; atomic_inc(&xprt->sc_dma_used); - /* The lkey here is either a local dma lkey or a dma_mr lkey */ - ctxt->sge[pno].lkey = xprt->sc_dma_lkey; + ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[pno].length = len; ctxt->count++; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 3c250523f7cc..df57f3ce6cd2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -265,7 +265,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge[sge_no].addr)) goto err; atomic_inc(&xprt->sc_dma_used); - sge[sge_no].lkey = xprt->sc_dma_lkey; + sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count++; sge_off = 0; sge_no++; @@ -480,7 +480,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->count = 1; /* Prepare the SGE for the RPCRDMA Header */ - ctxt->sge[0].lkey = rdma->sc_dma_lkey; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, @@ -504,7 +504,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[sge_no].addr)) goto err; atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } if (byte_count != 0) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 7fd23955f1d4..5763825d09bf 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -232,11 +232,11 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches - * the sc_dma_lkey, otherwise, ignore it since it is + * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) { atomic_dec(&xprt->sc_dma_used); ib_dma_unmap_page(xprt->sc_cm_id->device, ctxt->sge[i].addr, @@ -698,7 +698,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) atomic_inc(&xprt->sc_dma_used); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; + ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count = sge_no + 1; buflen += PAGE_SIZE; } @@ -1014,8 +1014,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct ib_cq_init_attr cq_attr = {}; struct ib_qp_init_attr qp_attr; struct ib_device *dev; - int uninitialized_var(dma_mr_acc); - int need_dma_mr = 0; unsigned int i; int ret = 0; @@ -1160,32 +1158,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) goto errout; - if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) || - !(dev->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) && - !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) - dma_mr_acc |= IB_ACCESS_REMOTE_WRITE; - } - if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; - /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ - if (need_dma_mr) { - /* Register all of physical memory */ - newxprt->sc_phys_mr = - ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); - if (IS_ERR(newxprt->sc_phys_mr)) { - dprintk("svcrdma: Failed to create DMA MR ret=%d\n", - ret); - goto errout; - } - newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; - } else - newxprt->sc_dma_lkey = dev->local_dma_lkey; - /* Post receive buffers */ for (i = 0; i < newxprt->sc_rq_depth; i++) { ret = svc_rdma_post_recv(newxprt, GFP_KERNEL); @@ -1349,9 +1324,6 @@ static void __svc_rdma_free(struct work_struct *work) if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) ib_destroy_cq(rdma->sc_rq_cq); - if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr)) - ib_dereg_mr(rdma->sc_phys_mr); - if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); @@ -1479,7 +1451,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, return; } atomic_inc(&xprt->sc_dma_used); - ctxt->sge[0].lkey = xprt->sc_dma_lkey; + ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[0].length = length; /* Prepare SEND WR */ -- cgit v1.2.3-59-g8ed1b