Merge tag 'nfs-for-4.11-1' of git://git.linux-nfs.org/projects/anna/linux-nfs

Pull NFS client updates from Anna Schumaker: "Highlights include: Stable bugfixes: - NFSv4: Fix memory and state leak in _nfs4_open_and_get_state - xprtrdma: Fix Read chunk padding - xprtrdma: Per-connection pad optimization - xprtrdma: Disable pad optimization by default - xprtrdma: Reduce required number of send SGEs - nlm: Ensure callback code also checks that the files match - pNFS/flexfiles: If the layout is invalid, it must be updated before retrying - NFSv4: Fix reboot recovery in copy offload - Revert "NFSv4.1: Handle NFS4ERR_BADSESSION/NFS4ERR_DEADSESSION replies to OP_SEQUENCE" - NFSv4: fix getacl head length estimation - NFSv4: fix getacl ERANGE for sum ACL buffer sizes Features: - Add and use dprintk_cont macros - Various cleanups to NFS v4.x to reduce code duplication and complexity - Remove unused cr_magic related code - Improvements to sunrpc "read from buffer" code - Clean up sunrpc timeout code and allow changing TCP timeout parameters - Remove duplicate mw_list management code in xprtrdma - Add generic functions for encoding and decoding xdr streams Bugfixes: - Clean up nfs_show_mountd_netid - Make layoutreturn_ops static and use NULL instead of 0 to fix sparse warnings - Properly handle -ERESTARTSYS in nfs_rename() - Check if register_shrinker() failed during rpcauth_init() - Properly clean up procfs/pipefs entries - Various NFS over RDMA related fixes - Silence unititialized variable warning in sunrpc" * tag 'nfs-for-4.11-1' of git://git.linux-nfs.org/projects/anna/linux-nfs: (64 commits) NFSv4: fix getacl ERANGE for some ACL buffer sizes NFSv4: fix getacl head length estimation Revert "NFSv4.1: Handle NFS4ERR_BADSESSION/NFS4ERR_DEADSESSION replies to OP_SEQUENCE" NFSv4: Fix reboot recovery in copy offload pNFS/flexfiles: If the layout is invalid, it must be updated before retrying NFSv4: Clean up owner/group attribute decode SUNRPC: Add a helper function xdr_stream_decode_string_dup() NFSv4: Remove bogus "struct nfs_client" argument from decode_ace() NFSv4: Fix the underestimation of delegation XDR space reservation NFSv4: Replace callback string decode function with a generic NFSv4: Replace the open coded decode_opaque_inline() with the new generic NFSv4: Replace ad-hoc xdr encode/decode helpers with xdr_stream_* generics SUNRPC: Add generic helpers for xdr_stream encode/decode sunrpc: silence uninitialized variable warning nlm: Ensure callback code also checks that the files match sunrpc: Allow xprt->ops->timer method to sleep xprtrdma: Refactor management of mw_list field xprtrdma: Handle stale connection rejection xprtrdma: Properly recover FRWRs with in-flight FASTREG WRs xprtrdma: Shrink send SGEs array ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2017-03-01 16:10:30 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2017-03-01 16:10:30 -0800
commit: 8f03cf50bc9443e92d6e54ac4d599357d6cb7cbb (patch)
tree: 0c38aab3a4c24d06fb05376b651157627bc1669d /net/sunrpc/xprtrdma
parent: Merge tag 'for-f2fs-4.11' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs (diff)
parent: NFSv4: fix getacl ERANGE for some ACL buffer sizes (diff)
download: linux-dev-8f03cf50bc9443e92d6e54ac4d599357d6cb7cbb.tar.xz
linux-dev-8f03cf50bc9443e92d6e54ac4d599357d6cb7cbb.zip
6 files changed, 118 insertions, 112 deletions
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 1ebb09e1ac4f..59e64025ed96 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -310,10 +310,7 @@ fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 	struct rpcrdma_mw *mw;
 
 	while (!list_empty(&req->rl_registered)) {
-		mw = list_first_entry(&req->rl_registered,
-				      struct rpcrdma_mw, mw_list);
-		list_del_init(&mw->mw_list);
-
+		mw = rpcrdma_pop_mw(&req->rl_registered);
 		if (sync)
 			fmr_op_recover_mr(mw);
 		else
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 47bed5333c7f..f81dd93176c0 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -466,8 +466,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	struct ib_send_wr *first, **prev, *last, *bad_wr;
 	struct rpcrdma_rep *rep = req->rl_reply;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	struct rpcrdma_mw *mw, *tmp;
 	struct rpcrdma_frmr *f;
+	struct rpcrdma_mw *mw;
 	int count, rc;
 
 	dprintk("RPC:       %s: req %p\n", __func__, req);
@@ -534,10 +534,10 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	 * them to the free MW list.
 	 */
 unmap:
-	list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+	while (!list_empty(&req->rl_registered)) {
+		mw = rpcrdma_pop_mw(&req->rl_registered);
 		dprintk("RPC:       %s: DMA unmapping frmr %p\n",
 			__func__, &mw->frmr);
-		list_del_init(&mw->mw_list);
 		ib_dma_unmap_sg(ia->ri_device,
 				mw->mw_sg, mw->mw_nents, mw->mw_dir);
 		rpcrdma_put_mw(r_xprt, mw);
@@ -571,10 +571,7 @@ frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 	struct rpcrdma_mw *mw;
 
 	while (!list_empty(&req->rl_registered)) {
-		mw = list_first_entry(&req->rl_registered,
-				      struct rpcrdma_mw, mw_list);
-		list_del_init(&mw->mw_list);
-
+		mw = rpcrdma_pop_mw(&req->rl_registered);
 		if (sync)
 			frwr_op_recover_mr(mw);
 		else
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c52e0f2ffe52..a044be2d6ad7 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -125,14 +125,34 @@ void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
 /* The client can send a request inline as long as the RPCRDMA header
  * plus the RPC call fit under the transport's inline limit. If the
  * combined call message size exceeds that limit, the client must use
- * the read chunk list for this operation.
+ * a Read chunk for this operation.
+ *
+ * A Read chunk is also required if sending the RPC call inline would
+ * exceed this device's max_sge limit.
  */
 static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
 				struct rpc_rqst *rqst)
 {
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct xdr_buf *xdr = &rqst->rq_snd_buf;
+	unsigned int count, remaining, offset;
+
+	if (xdr->len > r_xprt->rx_ia.ri_max_inline_write)
+		return false;
+
+	if (xdr->page_len) {
+		remaining = xdr->page_len;
+		offset = xdr->page_base & ~PAGE_MASK;
+		count = 0;
+		while (remaining) {
+			remaining -= min_t(unsigned int,
+					   PAGE_SIZE - offset, remaining);
+			offset = 0;
+			if (++count > r_xprt->rx_ia.ri_max_send_sges)
+				return false;
+		}
+	}
 
-	return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
+	return true;
 }
 
 /* The client can't know how large the actual reply will be. Thus it
@@ -186,9 +206,9 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
  */
 
 static int
-rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
-	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg,
-	bool reminv_expected)
+rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
+		     unsigned int pos, enum rpcrdma_chunktype type,
+		     struct rpcrdma_mr_seg *seg)
 {
 	int len, n, p, page_base;
 	struct page **ppages;
@@ -226,22 +246,21 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 	if (len && n == RPCRDMA_MAX_SEGS)
 		goto out_overflow;
 
-	/* When encoding the read list, the tail is always sent inline */
-	if (type == rpcrdma_readch)
+	/* When encoding a Read chunk, the tail iovec contains an
+	 * XDR pad and may be omitted.
+	 */
+	if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
 		return n;
 
-	/* When encoding the Write list, some servers need to see an extra
-	 * segment for odd-length Write chunks. The upper layer provides
-	 * space in the tail iovec for this purpose.
+	/* When encoding a Write chunk, some servers need to see an
+	 * extra segment for non-XDR-aligned Write chunks. The upper
+	 * layer provides space in the tail iovec that may be used
+	 * for this purpose.
 	 */
-	if (type == rpcrdma_writech && reminv_expected)
+	if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
 		return n;
 
 	if (xdrbuf->tail[0].iov_len) {
-		/* the rpcrdma protocol allows us to omit any trailing
-		 * xdr pad bytes, saving the server an RDMA operation. */
-		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
-			return n;
 		n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
 		if (n == RPCRDMA_MAX_SEGS)
 			goto out_overflow;
@@ -293,7 +312,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
 	if (rtype == rpcrdma_areadch)
 		pos = 0;
 	seg = req->rl_segments;
-	nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false);
+	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
+				     rtype, seg);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
 
@@ -302,7 +322,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
 						 false, &mw);
 		if (n < 0)
 			return ERR_PTR(n);
-		list_add(&mw->mw_list, &req->rl_registered);
+		rpcrdma_push_mw(mw, &req->rl_registered);
 
 		*iptr++ = xdr_one;	/* item present */
 
@@ -355,10 +375,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 	}
 
 	seg = req->rl_segments;
-	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
+	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
 				     rqst->rq_rcv_buf.head[0].iov_len,
-				     wtype, seg,
-				     r_xprt->rx_ia.ri_reminv_expected);
+				     wtype, seg);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
 
@@ -371,7 +390,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 						 true, &mw);
 		if (n < 0)
 			return ERR_PTR(n);
-		list_add(&mw->mw_list, &req->rl_registered);
+		rpcrdma_push_mw(mw, &req->rl_registered);
 
 		iptr = xdr_encode_rdma_segment(iptr, mw);
 
@@ -423,8 +442,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
 	}
 
 	seg = req->rl_segments;
-	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
-				     r_xprt->rx_ia.ri_reminv_expected);
+	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
 
@@ -437,7 +455,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
 						 true, &mw);
 		if (n < 0)
 			return ERR_PTR(n);
-		list_add(&mw->mw_list, &req->rl_registered);
+		rpcrdma_push_mw(mw, &req->rl_registered);
 
 		iptr = xdr_encode_rdma_segment(iptr, mw);
 
@@ -741,13 +759,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	iptr = headerp->rm_body.rm_chunks;
 	iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
 	if (IS_ERR(iptr))
-		goto out_unmap;
+		goto out_err;
 	iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
 	if (IS_ERR(iptr))
-		goto out_unmap;
+		goto out_err;
 	iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
 	if (IS_ERR(iptr))
-		goto out_unmap;
+		goto out_err;
 	hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
 
 	dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
@@ -758,12 +776,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
 				       &rqst->rq_snd_buf, rtype)) {
 		iptr = ERR_PTR(-EIO);
-		goto out_unmap;
+		goto out_err;
 	}
 	return 0;
 
-out_unmap:
-	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+out_err:
+	pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
+	       PTR_ERR(iptr));
+	r_xprt->rx_stats.failed_marshal_count++;
 	return PTR_ERR(iptr);
 }
 
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 534c178d2a7e..c717f5410776 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -67,7 +67,7 @@ unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
 static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
-		int xprt_rdma_pad_optimize = 1;
+		int xprt_rdma_pad_optimize = 0;
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 
@@ -709,10 +709,6 @@ xprt_rdma_send_request(struct rpc_task *task)
 	return 0;
 
 failed_marshal:
-	dprintk("RPC:       %s: rpcrdma_marshal_req failed, status %i\n",
-		__func__, rc);
-	if (rc == -EIO)
-		r_xprt->rx_stats.failed_marshal_count++;
 	if (rc != -ENOTCONN)
 		return rc;
 drop_connection:
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 11d07748f699..81cd31acf690 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -54,6 +54,7 @@
 #include <linux/sunrpc/svc_rdma.h>
 #include <asm/bitops.h>
 #include <linux/module.h> /* try_module_get()/module_put() */
+#include <rdma/ib_cm.h>
 
 #include "xprt_rdma.h"
 
@@ -208,6 +209,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
 
 	/* Default settings for RPC-over-RDMA Version One */
 	r_xprt->rx_ia.ri_reminv_expected = false;
+	r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
 	rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
 	wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
 
@@ -215,6 +217,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
 	    pmsg->cp_magic == rpcrdma_cmp_magic &&
 	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
 		r_xprt->rx_ia.ri_reminv_expected = true;
+		r_xprt->rx_ia.ri_implicit_roundup = true;
 		rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
 		wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
 	}
@@ -277,7 +280,14 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 		connstate = -ENETDOWN;
 		goto connected;
 	case RDMA_CM_EVENT_REJECTED:
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+		pr_info("rpcrdma: connection to %pIS:%u on %s rejected: %s\n",
+			sap, rpc_get_port(sap), ia->ri_device->name,
+			rdma_reject_msg(id, event->status));
+#endif
 		connstate = -ECONNREFUSED;
+		if (event->status == IB_CM_REJ_STALE_CONN)
+			connstate = -EAGAIN;
 		goto connected;
 	case RDMA_CM_EVENT_DISCONNECTED:
 		connstate = -ECONNABORTED;
@@ -486,18 +496,19 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
  */
 int
 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
-				struct rpcrdma_create_data_internal *cdata)
+		  struct rpcrdma_create_data_internal *cdata)
 {
 	struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
+	unsigned int max_qp_wr, max_sge;
 	struct ib_cq *sendcq, *recvcq;
-	unsigned int max_qp_wr;
 	int rc;
 
-	if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
-		dprintk("RPC:       %s: insufficient sge's available\n",
-			__func__);
+	max_sge = min(ia->ri_device->attrs.max_sge, RPCRDMA_MAX_SEND_SGES);
+	if (max_sge < RPCRDMA_MIN_SEND_SGES) {
+		pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
 		return -ENOMEM;
 	}
+	ia->ri_max_send_sges = max_sge - RPCRDMA_MIN_SEND_SGES;
 
 	if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
 		dprintk("RPC:       %s: insufficient wqe's available\n",
@@ -522,7 +533,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
 	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
 	ep->rep_attr.cap.max_recv_wr += 1;	/* drain cqe */
-	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
+	ep->rep_attr.cap.max_send_sge = max_sge;
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -640,20 +651,21 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 int
 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
+	struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
+						   rx_ia);
 	struct rdma_cm_id *id, *old;
+	struct sockaddr *sap;
+	unsigned int extras;
 	int rc = 0;
-	int retry_count = 0;
 
 	if (ep->rep_connected != 0) {
-		struct rpcrdma_xprt *xprt;
 retry:
 		dprintk("RPC:       %s: reconnecting...\n", __func__);
 
 		rpcrdma_ep_disconnect(ep, ia);
 
-		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
-		id = rpcrdma_create_id(xprt, ia,
-				(struct sockaddr *)&xprt->rx_data.addr);
+		sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+		id = rpcrdma_create_id(r_xprt, ia, sap);
 		if (IS_ERR(id)) {
 			rc = -EHOSTUNREACH;
 			goto out;
@@ -708,51 +720,18 @@ retry:
 	}
 
 	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
-
-	/*
-	 * Check state. A non-peer reject indicates no listener
-	 * (ECONNREFUSED), which may be a transient state. All
-	 * others indicate a transport condition which has already
-	 * undergone a best-effort.
-	 */
-	if (ep->rep_connected == -ECONNREFUSED &&
-	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
-		dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
-		goto retry;
-	}
 	if (ep->rep_connected <= 0) {
-		/* Sometimes, the only way to reliably connect to remote
-		 * CMs is to use same nonzero values for ORD and IRD. */
-		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
-		    (ep->rep_remote_cma.responder_resources == 0 ||
-		     ep->rep_remote_cma.initiator_depth !=
-				ep->rep_remote_cma.responder_resources)) {
-			if (ep->rep_remote_cma.responder_resources == 0)
-				ep->rep_remote_cma.responder_resources = 1;
-			ep->rep_remote_cma.initiator_depth =
-				ep->rep_remote_cma.responder_resources;
+		if (ep->rep_connected == -EAGAIN)
 			goto retry;
-		}
 		rc = ep->rep_connected;
-	} else {
-		struct rpcrdma_xprt *r_xprt;
-		unsigned int extras;
-
-		dprintk("RPC:       %s: connected\n", __func__);
-
-		r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
-		extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
-
-		if (extras) {
-			rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
-			if (rc) {
-				pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
-					__func__, rc);
-				rc = 0;
-			}
-		}
+		goto out;
 	}
 
+	dprintk("RPC:       %s: connected\n", __func__);
+	extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
+	if (extras)
+		rpcrdma_ep_post_extra_recv(r_xprt, extras);
+
 out:
 	if (rc)
 		ep->rep_connected = rc;
@@ -797,9 +776,7 @@ rpcrdma_mr_recovery_worker(struct work_struct *work)
 
 	spin_lock(&buf->rb_recovery_lock);
 	while (!list_empty(&buf->rb_stale_mrs)) {
-		mw = list_first_entry(&buf->rb_stale_mrs,
-				      struct rpcrdma_mw, mw_list);
-		list_del_init(&mw->mw_list);
+		mw = rpcrdma_pop_mw(&buf->rb_stale_mrs);
 		spin_unlock(&buf->rb_recovery_lock);
 
 		dprintk("RPC:       %s: recovering MR %p\n", __func__, mw);
@@ -817,7 +794,7 @@ rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 
 	spin_lock(&buf->rb_recovery_lock);
-	list_add(&mw->mw_list, &buf->rb_stale_mrs);
+	rpcrdma_push_mw(mw, &buf->rb_stale_mrs);
 	spin_unlock(&buf->rb_recovery_lock);
 
 	schedule_delayed_work(&buf->rb_recovery_worker, 0);
@@ -1093,11 +1070,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_mw *mw = NULL;
 
 	spin_lock(&buf->rb_mwlock);
-	if (!list_empty(&buf->rb_mws)) {
-		mw = list_first_entry(&buf->rb_mws,
-				      struct rpcrdma_mw, mw_list);
-		list_del_init(&mw->mw_list);
-	}
+	if (!list_empty(&buf->rb_mws))
+		mw = rpcrdma_pop_mw(&buf->rb_mws);
 	spin_unlock(&buf->rb_mwlock);
 
 	if (!mw)
@@ -1120,7 +1094,7 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 
 	spin_lock(&buf->rb_mwlock);
-	list_add_tail(&mw->mw_list, &buf->rb_mws);
+	rpcrdma_push_mw(mw, &buf->rb_mws);
 	spin_unlock(&buf->rb_mwlock);
 }
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index e35efd4ac1e4..171a35116de9 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -74,7 +74,9 @@ struct rpcrdma_ia {
 	unsigned int		ri_max_frmr_depth;
 	unsigned int		ri_max_inline_write;
 	unsigned int		ri_max_inline_read;
+	unsigned int		ri_max_send_sges;
 	bool			ri_reminv_expected;
+	bool			ri_implicit_roundup;
 	enum ib_mr_type		ri_mrtype;
 	struct ib_qp_attr	ri_qp_attr;
 	struct ib_qp_init_attr	ri_qp_init_attr;
@@ -303,15 +305,19 @@ struct rpcrdma_mr_seg {		/* chunk descriptors */
 	char		*mr_offset;	/* kva if no page, else offset */
 };
 
-/* Reserve enough Send SGEs to send a maximum size inline request:
+/* The Send SGE array is provisioned to send a maximum size
+ * inline request:
  * - RPC-over-RDMA header
  * - xdr_buf head iovec
- * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages
+ * - RPCRDMA_MAX_INLINE bytes, in pages
  * - xdr_buf tail iovec
+ *
+ * The actual number of array elements consumed by each RPC
+ * depends on the device's max_sge limit.
  */
 enum {
-	RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1,
-	RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1,
+	RPCRDMA_MIN_SEND_SGES = 3,
+	RPCRDMA_MAX_PAGE_SGES = RPCRDMA_MAX_INLINE >> PAGE_SHIFT,
 	RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
 };
 
@@ -348,6 +354,22 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
 	return rqst->rq_xprtdata;
 }
 
+static inline void
+rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list)
+{
+	list_add_tail(&mw->mw_list, list);
+}
+
+static inline struct rpcrdma_mw *
+rpcrdma_pop_mw(struct list_head *list)
+{
+	struct rpcrdma_mw *mw;
+
+	mw = list_first_entry(list, struct rpcrdma_mw, mw_list);
+	list_del(&mw->mw_list);
+	return mw;
+}
+
 /*
  * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
  * inline requests/replies, and client/server credits.
author	Linus Torvalds <torvalds@linux-foundation.org>	2017-03-01 16:10:30 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-03-01 16:10:30 -0800
commit	8f03cf50bc9443e92d6e54ac4d599357d6cb7cbb (patch)
tree	0c38aab3a4c24d06fb05376b651157627bc1669d /net/sunrpc/xprtrdma
parent	Merge tag 'for-f2fs-4.11' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs (diff)
parent	NFSv4: fix getacl ERANGE for some ACL buffer sizes (diff)
download	linux-dev-8f03cf50bc9443e92d6e54ac4d599357d6cb7cbb.tar.xz linux-dev-8f03cf50bc9443e92d6e54ac4d599357d6cb7cbb.zip