From ef739b2175dde9c05594f768cb78149f1ce2ac36 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:14 -0400 Subject: xprtrdma: Reset credit grant properly after a disconnect On a fresh connection, an RPC/RDMA client is supposed to send only one RPC Call until it gets a credit grant in the first RPC Reply from the server [RFC 8166, Section 3.3.3]. There is a bug in the Linux client's credit accounting mechanism introduced by commit e7ce710a8802 ("xprtrdma: Avoid deadlock when credit window is reset"). On connect, it simply dumps all pending RPC Calls onto the new connection. Servers have been tolerant of this bad behavior. Currently no server implementation ever changes its credit grant over reconnects, and servers always repost enough Receives before connections are fully established. To correct this issue, ensure that the client resets both the credit grant _and_ the congestion window when handling a reconnect. Fixes: e7ce710a8802 ("xprtrdma: Avoid deadlock when credit ... ") Signed-off-by: Chuck Lever Cc: stable@kernel.org Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/sunrpc/xprtrdma/transport.c') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 143ce2579ba9..98cbc7b060ba 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -468,6 +468,12 @@ xprt_rdma_close(struct rpc_xprt *xprt) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); rpcrdma_ep_disconnect(ep, ia); + + /* Prepare @xprt for the next connection by reinitializing + * its credit grant to one (see RFC 8166, Section 3.3.3). + */ + r_xprt->rx_buf.rb_credits = 1; + xprt->cwnd = RPC_CWNDSHIFT; } /** -- cgit v1.2.3-59-g8ed1b From 61da886bf74e738995d359fa14d77f72d14cfb87 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:25 -0400 Subject: xprtrdma: Explicitly resetting MRs is no longer necessary When a memory operation fails, the MR's driver state might not match its hardware state. The only reliable recourse is to dereg the MR. This is done in ->ro_recover_mr, which then attempts to allocate a fresh MR to replace the released MR. Since commit e2ac236c0b651 ("xprtrdma: Allocate MRs on demand"), xprtrdma dynamically allocates MRs. It can add more MRs whenever they are needed. That makes it possible to simply release an MR when a memory operation fails, instead of "recovering" it. It will automatically be replaced by the on-demand MR allocator. This commit is a little larger than I wanted, but it replaces ->ro_recover_mr, rb_recovery_lock, rb_recovery_worker, and the rb_stale_mrs list with a generic work queue. Since MRs are no longer orphaned, the mrs_orphaned metric is no longer used. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 2 +- net/sunrpc/xprtrdma/fmr_ops.c | 124 +++++++++++++++++--------------------- net/sunrpc/xprtrdma/frwr_ops.c | 130 +++++++++++++++------------------------- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 38 ------------ net/sunrpc/xprtrdma/xprt_rdma.h | 14 +++-- 7 files changed, 114 insertions(+), 198 deletions(-) (limited to 'net/sunrpc/xprtrdma/transport.c') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 53df203b8057..9906f4d7d9fb 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -655,7 +655,7 @@ DEFINE_MR_EVENT(xprtrdma_localinv); DEFINE_MR_EVENT(xprtrdma_dma_map); DEFINE_MR_EVENT(xprtrdma_dma_unmap); DEFINE_MR_EVENT(xprtrdma_remoteinv); -DEFINE_MR_EVENT(xprtrdma_recover_mr); +DEFINE_MR_EVENT(xprtrdma_mr_recycle); /** ** Reply events diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index db589a23682b..f1cf3a36a992 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -49,46 +49,7 @@ fmr_is_supported(struct rpcrdma_ia *ia) return true; } -static int -fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - static struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_FMR_SGES, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - - mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(u64), GFP_KERNEL); - if (!mr->fmr.fm_physaddrs) - goto out_free; - - mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(*mr->mr_sg), GFP_KERNEL); - if (!mr->mr_sg) - goto out_free; - - sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); - - mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, - &fmr_attr); - if (IS_ERR(mr->fmr.fm_mr)) - goto out_fmr_err; - - INIT_LIST_HEAD(&mr->mr_list); - return 0; - -out_fmr_err: - dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, - PTR_ERR(mr->fmr.fm_mr)); - -out_free: - kfree(mr->mr_sg); - kfree(mr->fmr.fm_physaddrs); - return -ENOMEM; -} - -static int +static void __fmr_unmap(struct rpcrdma_mr *mr) { LIST_HEAD(l); @@ -97,13 +58,16 @@ __fmr_unmap(struct rpcrdma_mr *mr) list_add(&mr->fmr.fm_mr->list, &l); rc = ib_unmap_fmr(&l); list_del(&mr->fmr.fm_mr->list); - return rc; + if (rc) + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", + mr, rc); } +/* Release an MR. + */ static void fmr_op_release_mr(struct rpcrdma_mr *mr) { - LIST_HEAD(unmap_list); int rc; kfree(mr->fmr.fm_physaddrs); @@ -112,10 +76,7 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) /* In case this one was left mapped, try to unmap it * to prevent dealloc_fmr from failing with EBUSY */ - rc = __fmr_unmap(mr); - if (rc) - pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", - mr, rc); + __fmr_unmap(mr); rc = ib_dealloc_fmr(mr->fmr.fm_mr); if (rc) @@ -125,28 +86,16 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -/* Reset of a single FMR. +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. */ static void -fmr_op_recover_mr(struct rpcrdma_mr *mr) +fmr_mr_recycle_worker(struct work_struct *work) { + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - int rc; - /* ORDER: invalidate first */ - rc = __fmr_unmap(mr); - if (rc) - goto out_release; - - /* ORDER: then DMA unmap */ - rpcrdma_mr_unmap_and_put(mr); - - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; + trace_xprtrdma_mr_recycle(mr); trace_xprtrdma_dma_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, @@ -154,11 +103,51 @@ out_release: spin_lock(&r_xprt->rx_buf.rb_mrlock); list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; spin_unlock(&r_xprt->rx_buf.rb_mrlock); - fmr_op_release_mr(mr); } +static int +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + static struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + + mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(u64), GFP_KERNEL); + if (!mr->fmr.fm_physaddrs) + goto out_free; + + mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) + goto out_free; + + sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); + + mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + &fmr_attr); + if (IS_ERR(mr->fmr.fm_mr)) + goto out_fmr_err; + + INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, fmr_mr_recycle_worker); + return 0; + +out_fmr_err: + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, + PTR_ERR(mr->fmr.fm_mr)); + +out_free: + kfree(mr->mr_sg); + kfree(mr->fmr.fm_physaddrs); + return -ENOMEM; +} + /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -312,7 +301,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); if (rc) - goto out_reset; + goto out_release; /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. @@ -325,13 +314,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) return; -out_reset: +out_release: pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); while (!list_empty(mrs)) { mr = rpcrdma_mr_pop(mrs); list_del(&mr->fmr.fm_mr->list); - fmr_op_recover_mr(mr); + rpcrdma_mr_recycle(mr); } } @@ -339,7 +328,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_send = fmr_op_send, .ro_unmap_sync = fmr_op_unmap_sync, - .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, .ro_init_mr = fmr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 1cc4db515c85..6594627e3c7d 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -97,6 +97,44 @@ out_not_supported: return false; } +static void +frwr_op_release_mr(struct rpcrdma_mr *mr) +{ + int rc; + + rc = ib_dereg_mr(mr->frwr.fr_mr); + if (rc) + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", + mr, rc); + kfree(mr->mr_sg); + kfree(mr); +} + +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. + */ +static void +frwr_mr_recycle_worker(struct work_struct *work) +{ + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); + enum rpcrdma_frwr_state state = mr->frwr.fr_state; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + + trace_xprtrdma_mr_recycle(mr); + + if (state != FRWR_FLUSHED_LI) { + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + } + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; + spin_unlock(&r_xprt->rx_buf.rb_mrlock); + frwr_op_release_mr(mr); +} + static int frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { @@ -113,6 +151,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) goto out_list_err; INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker); sg_init_table(mr->mr_sg, depth); init_completion(&frwr->fr_linv_done); return 0; @@ -131,79 +170,6 @@ out_list_err: return rc; } -static void -frwr_op_release_mr(struct rpcrdma_mr *mr) -{ - int rc; - - rc = ib_dereg_mr(mr->frwr.fr_mr); - if (rc) - pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", - mr, rc); - kfree(mr->mr_sg); - kfree(mr); -} - -static int -__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - struct rpcrdma_frwr *frwr = &mr->frwr; - int rc; - - rc = ib_dereg_mr(frwr->fr_mr); - if (rc) { - pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", - rc, mr); - return rc; - } - - frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, - ia->ri_max_frwr_depth); - if (IS_ERR(frwr->fr_mr)) { - pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", - PTR_ERR(frwr->fr_mr), mr); - return PTR_ERR(frwr->fr_mr); - } - - dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); - frwr->fr_state = FRWR_IS_INVALID; - return 0; -} - -/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. - */ -static void -frwr_op_recover_mr(struct rpcrdma_mr *mr) -{ - enum rpcrdma_frwr_state state = mr->frwr.fr_state; - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc; - - rc = __frwr_mr_reset(ia, mr); - if (state != FRWR_FLUSHED_LI) { - trace_xprtrdma_dma_unmap(mr); - ib_dma_unmap_sg(ia->ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - } - if (rc) - goto out_release; - - rpcrdma_mr_put(mr); - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; - - spin_lock(&r_xprt->rx_buf.rb_mrlock); - list_del(&mr->mr_all); - spin_unlock(&r_xprt->rx_buf.rb_mrlock); - - frwr_op_release_mr(mr); -} - /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -385,7 +351,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr = NULL; do { if (mr) - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); mr = rpcrdma_mr_get(r_xprt); if (!mr) return ERR_PTR(-EAGAIN); @@ -452,7 +418,7 @@ out_dmamap_err: out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", frwr->fr_mr, n, mr->mr_nents); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } @@ -571,7 +537,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) if (bad_wr != first) wait_for_completion(&frwr->fr_linv_done); if (rc) - goto reset_mrs; + goto out_release; /* ORDER: Now DMA unmap all of the MRs, and return * them to the free MR list. @@ -583,22 +549,21 @@ unmap: } return; -reset_mrs: +out_release: pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); - /* Find and reset the MRs in the LOCAL_INV WRs that did not + /* Unmap and release the MRs in the LOCAL_INV WRs that did not * get posted. */ while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); - - __frwr_mr_reset(ia, mr); - bad_wr = bad_wr->next; + + list_del(&mr->mr_list); + frwr_op_release_mr(mr); } - goto unmap; } const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { @@ -606,7 +571,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_send = frwr_op_send, .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, - .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, .ro_init_mr = frwr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 15edc050ca93..228aee851667 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -803,7 +803,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) struct rpcrdma_mr *mr; mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); } /* This implementation supports the following combinations diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 98cbc7b060ba..3ae73e6a5c93 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -792,7 +792,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", - r_xprt->rx_stats.mrs_recovered, + r_xprt->rx_stats.mrs_recycled, r_xprt->rx_stats.mrs_orphaned, r_xprt->rx_stats.mrs_allocated, r_xprt->rx_stats.local_inv_needed, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 5625a5089f96..88fe75e6d41a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -977,39 +977,6 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) } } -static void -rpcrdma_mr_recovery_worker(struct work_struct *work) -{ - struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, - rb_recovery_worker.work); - struct rpcrdma_mr *mr; - - spin_lock(&buf->rb_recovery_lock); - while (!list_empty(&buf->rb_stale_mrs)) { - mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - trace_xprtrdma_recover_mr(mr); - mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); - - spin_lock(&buf->rb_recovery_lock); - } - spin_unlock(&buf->rb_recovery_lock); -} - -void -rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - - spin_lock(&buf->rb_recovery_lock); - rpcrdma_mr_push(mr, &buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - schedule_delayed_work(&buf->rb_recovery_worker, 0); -} - static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { @@ -1142,14 +1109,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_bc_srv_max_requests = 0; spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); - spin_lock_init(&buf->rb_recovery_lock); INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all); - INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); - INIT_DELAYED_WORK(&buf->rb_recovery_worker, - rpcrdma_mr_recovery_worker); rpcrdma_mrs_create(r_xprt); @@ -1233,7 +1196,6 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - cancel_delayed_work_sync(&buf->rb_recovery_worker); cancel_delayed_work_sync(&buf->rb_refresh_worker); rpcrdma_sendctxs_destroy(buf); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2ca14f7c2d51..eae21668e692 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -280,6 +280,7 @@ struct rpcrdma_mr { u32 mr_handle; u32 mr_length; u64 mr_offset; + struct work_struct mr_recycle; struct list_head mr_all; }; @@ -411,9 +412,6 @@ struct rpcrdma_buffer { u32 rb_bc_max_requests; - spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ - struct list_head rb_stale_mrs; - struct delayed_work rb_recovery_worker; struct delayed_work rb_refresh_worker; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -452,7 +450,7 @@ struct rpcrdma_stats { unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; - unsigned long mrs_recovered; + unsigned long mrs_recycled; unsigned long mrs_orphaned; unsigned long mrs_allocated; unsigned long empty_sendctx_q; @@ -481,7 +479,6 @@ struct rpcrdma_memreg_ops { struct list_head *mrs); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_recover_mr)(struct rpcrdma_mr *mr); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); @@ -578,7 +575,12 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); -void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); + +static inline void +rpcrdma_mr_recycle(struct rpcrdma_mr *mr) +{ + schedule_work(&mr->mr_recycle); +} struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); -- cgit v1.2.3-59-g8ed1b From 3968a8a5310404c2f0b9e4d9f28cab13a12bc4fd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:36 -0400 Subject: sunrpc: Fix connect metrics For TCP, the logic in xprt_connect_status is currently never invoked to record a successful connection. Commit 2a4919919a97 ("SUNRPC: Return EAGAIN instead of ENOTCONN when waking up xprt->pending") changed the way TCP xprt's are awoken after a connect succeeds. Instead, change connection-oriented transports to bump connect_count and compute connect_time the moment that XPRT_CONNECTED is set. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 14 ++++---------- net/sunrpc/xprtrdma/transport.c | 6 +++++- net/sunrpc/xprtsock.c | 10 ++++++---- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net/sunrpc/xprtrdma/transport.c') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a8db2e3f8904..93c7a2f4a266 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -789,17 +789,11 @@ void xprt_connect(struct rpc_task *task) static void xprt_connect_status(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - - if (task->tk_status == 0) { - xprt->stat.connect_count++; - xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; + switch (task->tk_status) { + case 0: dprintk("RPC: %5u xprt_connect_status: connection established\n", task->tk_pid); - return; - } - - switch (task->tk_status) { + break; case -ECONNREFUSED: case -ECONNRESET: case -ECONNABORTED: @@ -816,7 +810,7 @@ static void xprt_connect_status(struct rpc_task *task) default: dprintk("RPC: %5u xprt_connect_status: error %d connecting to " "server %s\n", task->tk_pid, -task->tk_status, - xprt->servername); + task->tk_rqstp->rq_xprt->servername); task->tk_status = -EIO; } } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 3ae73e6a5c93..087acfce142a 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -242,8 +242,12 @@ rpcrdma_connect_worker(struct work_struct *work) spin_lock_bh(&xprt->transport_lock); if (ep->rep_connected > 0) { - if (!xprt_test_and_set_connected(xprt)) + if (!xprt_test_and_set_connected(xprt)) { + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_wake_pending_tasks(xprt, 0); + } } else { if (xprt_test_and_clear_connected(xprt)) xprt_wake_pending_tasks(xprt, -ENOTCONN); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 6b7539c0466e..e146caacc494 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1611,6 +1611,9 @@ static void xs_tcp_state_change(struct sock *sk) clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); xprt_clear_connecting(xprt); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_wake_pending_tasks(xprt, -EAGAIN); } spin_unlock(&xprt->transport_lock); @@ -2029,8 +2032,6 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, } /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); } @@ -2062,6 +2063,9 @@ static int xs_local_setup_socket(struct sock_xprt *transport) case 0: dprintk("RPC: xprt %p connected to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_set_connected(xprt); case -ENOBUFS: break; @@ -2387,8 +2391,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_set_memalloc(xprt); /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); switch (ret) { -- cgit v1.2.3-59-g8ed1b From 8440a886112b46a8b402679dca9d8b5662a0d73e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:41 -0400 Subject: sunrpc: Report connect_time in seconds The way connection-oriented transports report connect_time is wrong: it's supposed to be in seconds, not in jiffies. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtsock.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sunrpc/xprtrdma/transport.c') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 087acfce142a..289d13cad638 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -776,7 +776,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) 0, /* need a local port? */ xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e146caacc494..9bbc395cfd55 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2563,7 +2563,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) "%llu %llu %lu %llu %llu\n", xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -2618,7 +2618,7 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) transport->srcport, xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, -- cgit v1.2.3-59-g8ed1b From 31e62d25b5b8155b2ff6a7c6d31256475dbbcc7a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:26:08 -0400 Subject: xprtrdma: Simplify RPC wake-ups on connect Currently, when a connection is established, rpcrdma_conn_upcall invokes rpcrdma_conn_func and then wake_up_all(&ep->rep_connect_wait). The former wakes waiting RPCs, but the connect worker is not done yet, and that leads to races, double wakes, and difficulty understanding how this logic is supposed to work. Instead, collect all the "connection established" logic in the connect worker (xprt_rdma_connect_worker). A disconnect worker is retained to handle provider upcalls safely. Fixes: 254f91e2fa1f ("xprtrdma: RPC/RDMA must invoke ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 54 ++++++++++++++--------------------------- net/sunrpc/xprtrdma/verbs.c | 42 +++++++++++++++++++++++++------- net/sunrpc/xprtrdma/xprt_rdma.h | 4 +-- 3 files changed, 52 insertions(+), 48 deletions(-) (limited to 'net/sunrpc/xprtrdma/transport.c') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 289d13cad638..d7c4255e9d5d 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -225,51 +225,35 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt) } } -void -rpcrdma_conn_func(struct rpcrdma_ep *ep) -{ - schedule_delayed_work(&ep->rep_connect_worker, 0); -} - -void -rpcrdma_connect_worker(struct work_struct *work) +/** + * xprt_rdma_connect_worker - establish connection in the background + * @work: worker thread context + * + * Requester holds the xprt's send lock to prevent activity on this + * transport while a fresh connection is being established. RPC tasks + * sleep on the xprt's pending queue waiting for connect to complete. + */ +static void +xprt_rdma_connect_worker(struct work_struct *work) { - struct rpcrdma_ep *ep = - container_of(work, struct rpcrdma_ep, rep_connect_worker.work); - struct rpcrdma_xprt *r_xprt = - container_of(ep, struct rpcrdma_xprt, rx_ep); + struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, + rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; + int rc; - spin_lock_bh(&xprt->transport_lock); - if (ep->rep_connected > 0) { + rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + xprt_clear_connecting(xprt); + if (r_xprt->rx_ep.rep_connected > 0) { if (!xprt_test_and_set_connected(xprt)) { xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; - xprt_wake_pending_tasks(xprt, 0); + xprt_wake_pending_tasks(xprt, -EAGAIN); } } else { if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, -ENOTCONN); + xprt_wake_pending_tasks(xprt, rc); } - spin_unlock_bh(&xprt->transport_lock); -} - -static void -xprt_rdma_connect_worker(struct work_struct *work) -{ - struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, - rx_connect_worker.work); - struct rpc_xprt *xprt = &r_xprt->rx_xprt; - int rc = 0; - - xprt_clear_connected(xprt); - - rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - xprt_wake_pending_tasks(xprt, rc); - - xprt_clear_connecting(xprt); } static void @@ -302,8 +286,6 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - xprt_clear_connected(xprt); - rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c60172f88a0d..abbd3cdc259a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -108,6 +108,25 @@ rpcrdma_destroy_wq(void) } } +/** + * rpcrdma_disconnect_worker - Force a disconnect + * @work: endpoint to be disconnected + * + * Provider callbacks can possibly run in an IRQ context. This function + * is invoked in a worker thread to guarantee that disconnect wake-up + * calls are always done in process context. + */ +static void +rpcrdma_disconnect_worker(struct work_struct *work) +{ + struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, + rep_disconnect_worker.work); + struct rpcrdma_xprt *r_xprt = + container_of(ep, struct rpcrdma_xprt, rx_ep); + + xprt_force_disconnect(&r_xprt->rx_xprt); +} + static void rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) { @@ -121,7 +140,7 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) if (ep->rep_connected == 1) { ep->rep_connected = -EIO; - rpcrdma_conn_func(ep); + schedule_delayed_work(&ep->rep_disconnect_worker, 0); wake_up_all(&ep->rep_connect_wait); } } @@ -271,13 +290,14 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ++xprt->connect_cookie; ep->rep_connected = 1; rpcrdma_update_connect_private(r_xprt, &event->param.conn); - goto connected; + wake_up_all(&ep->rep_connect_wait); + break; case RDMA_CM_EVENT_CONNECT_ERROR: ep->rep_connected = -ENOTCONN; - goto connected; + goto disconnected; case RDMA_CM_EVENT_UNREACHABLE: ep->rep_connected = -ENETUNREACH; - goto connected; + goto disconnected; case RDMA_CM_EVENT_REJECTED: dprintk("rpcrdma: connection to %s:%s rejected: %s\n", rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), @@ -285,12 +305,12 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->rep_connected = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) ep->rep_connected = -EAGAIN; - goto connected; + goto disconnected; case RDMA_CM_EVENT_DISCONNECTED: ++xprt->connect_cookie; ep->rep_connected = -ECONNABORTED; -connected: - rpcrdma_conn_func(ep); +disconnected: + xprt_force_disconnect(xprt); wake_up_all(&ep->rep_connect_wait); break; default: @@ -550,7 +570,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, cdata->max_requests >> 2); ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); - INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + INIT_DELAYED_WORK(&ep->rep_disconnect_worker, + rpcrdma_disconnect_worker); sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, @@ -623,7 +644,7 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - cancel_delayed_work_sync(&ep->rep_connect_worker); + cancel_delayed_work_sync(&ep->rep_disconnect_worker); if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); @@ -736,6 +757,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; int rc; retry: @@ -762,6 +784,8 @@ retry: } ep->rep_connected = 0; + xprt_clear_connected(xprt); + rpcrdma_post_recvs(r_xprt, true); rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index eae21668e692..a13ccb643ce0 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -101,7 +101,7 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - struct delayed_work rep_connect_worker; + struct delayed_work rep_disconnect_worker; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -556,7 +556,6 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -void rpcrdma_conn_func(struct rpcrdma_ep *ep); void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, @@ -654,7 +653,6 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) extern unsigned int xprt_rdma_max_inline_read; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); -void rpcrdma_connect_worker(struct work_struct *work); void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); -- cgit v1.2.3-59-g8ed1b From f26c32fa5c6ab616e12d4ff0748c85927b092601 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:26:40 -0400 Subject: xprtrdma: Add documenting comments Clean up: fill in or update documenting comments for transport switch entry points. For xprt_rdma_allocate: The first paragraph is no longer true since commit 5a6d1db45569 ("SUNRPC: Add a transport-specific private field in rpc_rqst"). The second paragraph is no longer true since commit 54cbd6b0c6b9 ("xprtrdma: Delay DMA mapping Send and Receive buffers"). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 43 ++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'net/sunrpc/xprtrdma/transport.c') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d7c4255e9d5d..39b7991862d0 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -256,6 +256,13 @@ xprt_rdma_connect_worker(struct work_struct *work) } } +/** + * xprt_rdma_inject_disconnect - inject a connection fault + * @xprt: transport context + * + * If @xprt is connected, disconnect it to simulate spurious connection + * loss. + */ static void xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) { @@ -266,16 +273,12 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) rdma_disconnect(r_xprt->rx_ia.ri_id); } -/* - * xprt_rdma_destroy +/** + * xprt_rdma_destroy - Full tear down of transport + * @xprt: doomed transport context * - * Destroy the xprt. - * Free all memory associated with the object, including its own. - * NOTE: none of the *destroy methods free memory for their top-level - * objects, even though they may have allocated it (they do free - * private memory). It's up to the caller to handle it. In this - * case (RDMA transport), all structure memory is inlined with the - * struct rpcrdma_xprt. + * Caller guarantees there will be no more calls to us with + * this @xprt. */ static void xprt_rdma_destroy(struct rpc_xprt *xprt) @@ -428,11 +431,12 @@ out1: } /** - * xprt_rdma_close - Close down RDMA connection - * @xprt: generic transport to be closed + * xprt_rdma_close - close a transport connection + * @xprt: transport context * - * Called during transport shutdown reconnect, or device - * removal. Caller holds the transport's write lock. + * Called during transport shutdown, reconnect, or device removal. + * Caller holds @xprt's send lock to prevent activity on this + * transport while the connection is torn down. */ static void xprt_rdma_close(struct rpc_xprt *xprt) @@ -511,6 +515,12 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) xprt_force_disconnect(xprt); } +/** + * xprt_rdma_connect - try to establish a transport connection + * @xprt: transport state + * @task: RPC scheduler context + * + */ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -630,13 +640,6 @@ rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * 0: Success; rq_buffer points to RPC buffer to use * ENOMEM: Out of memory, call again later * EIO: A permanent error occurred, do not retry - * - * The RDMA allocate/free functions need the task structure as a place - * to hide the struct rpcrdma_req, which is necessary for the actual - * send/recv sequence. - * - * xprt_rdma_allocate provides buffers that are already mapped for - * DMA, and a local DMA lkey is provided for each. */ static int xprt_rdma_allocate(struct rpc_task *task) -- cgit v1.2.3-59-g8ed1b From ad0911802cf6be48ddf5911ca1837d071b26e92d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:26:45 -0400 Subject: xprtrdma: Clean up xprt_rdma_disconnect_inject Clean up: Use the appropriate C macro instead of open-coding container_of() . Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/sunrpc/xprtrdma/transport.c') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 39b7991862d0..0cfa7bf41118 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -266,8 +266,7 @@ xprt_rdma_connect_worker(struct work_struct *work) static void xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) { - struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, - rx_xprt); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_inject_dsc(r_xprt); rdma_disconnect(r_xprt->rx_ia.ri_id); -- cgit v1.2.3-59-g8ed1b