From 9c760d1fd513053410e10d5f507daad56aeec7db Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 31 Jul 2017 18:38:50 -0400
Subject: NFSv4: Refactor _nfs4_proc_exchange_id()

Tease apart the functionality in nfs4_exchange_id_done() so that
it is easier to debug exchange id vs trunking issues by moving
all the processing out of nfs4_exchange_id_done() and into the
callers.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 171 +++++++++++++++++++++++++++---------------------------
 1 file changed, 85 insertions(+), 86 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ffd2e712595d..34190c2c0900 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7403,72 +7403,8 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 struct nfs41_exchange_id_data {
 	struct nfs41_exchange_id_res res;
 	struct nfs41_exchange_id_args args;
-	struct rpc_xprt *xprt;
-	int rpc_status;
 };
 
-static void nfs4_exchange_id_done(struct rpc_task *task, void *data)
-{
-	struct nfs41_exchange_id_data *cdata =
-					(struct nfs41_exchange_id_data *)data;
-	struct nfs_client *clp = cdata->args.client;
-	int status = task->tk_status;
-
-	trace_nfs4_exchange_id(clp, status);
-
-	if (status == 0)
-		status = nfs4_check_cl_exchange_flags(cdata->res.flags);
-
-	if (cdata->xprt && status == 0) {
-		status = nfs4_detect_session_trunking(clp, &cdata->res,
-						      cdata->xprt);
-		goto out;
-	}
-
-	if (status  == 0)
-		status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect);
-
-	if (status == 0) {
-		clp->cl_clientid = cdata->res.clientid;
-		clp->cl_exchange_flags = cdata->res.flags;
-		clp->cl_seqid = cdata->res.seqid;
-		/* Client ID is not confirmed */
-		if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R))
-			clear_bit(NFS4_SESSION_ESTABLISHED,
-				  &clp->cl_session->session_state);
-
-		kfree(clp->cl_serverowner);
-		clp->cl_serverowner = cdata->res.server_owner;
-		cdata->res.server_owner = NULL;
-
-		/* use the most recent implementation id */
-		kfree(clp->cl_implid);
-		clp->cl_implid = cdata->res.impl_id;
-		cdata->res.impl_id = NULL;
-
-		if (clp->cl_serverscope != NULL &&
-		    !nfs41_same_server_scope(clp->cl_serverscope,
-					cdata->res.server_scope)) {
-			dprintk("%s: server_scope mismatch detected\n",
-				__func__);
-			set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
-			kfree(clp->cl_serverscope);
-			clp->cl_serverscope = NULL;
-		}
-
-		if (clp->cl_serverscope == NULL) {
-			clp->cl_serverscope = cdata->res.server_scope;
-			cdata->res.server_scope = NULL;
-		}
-		/* Save the EXCHANGE_ID verifier session trunk tests */
-		memcpy(clp->cl_confirm.data, cdata->args.verifier.data,
-		       sizeof(clp->cl_confirm.data));
-	}
-out:
-	cdata->rpc_status = status;
-	return;
-}
-
 static void nfs4_exchange_id_release(void *data)
 {
 	struct nfs41_exchange_id_data *cdata =
@@ -7482,7 +7418,6 @@ static void nfs4_exchange_id_release(void *data)
 }
 
 static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
-	.rpc_call_done = nfs4_exchange_id_done,
 	.rpc_release = nfs4_exchange_id_release,
 };
 
@@ -7491,7 +7426,8 @@ static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
  *
  * Wrapper for EXCHANGE_ID operation.
  */
-static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+static struct rpc_task *
+nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 			u32 sp4_how, struct rpc_xprt *xprt)
 {
 	struct rpc_message msg = {
@@ -7505,17 +7441,15 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 		.flags = RPC_TASK_TIMEOUT,
 	};
 	struct nfs41_exchange_id_data *calldata;
-	struct rpc_task *task;
 	int status;
 
 	if (!atomic_inc_not_zero(&clp->cl_count))
-		return -EIO;
+		return ERR_PTR(-EIO);
 
+	status = -ENOMEM;
 	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
-	if (!calldata) {
-		nfs_put_client(clp);
-		return -ENOMEM;
-	}
+	if (!calldata)
+		goto out;
 
 	nfs4_init_boot_verifier(clp, &calldata->args.verifier);
 
@@ -7554,7 +7488,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 		goto out_impl_id;
 	}
 	if (xprt) {
-		calldata->xprt = xprt;
 		task_setup_data.rpc_xprt = xprt;
 		task_setup_data.flags |= RPC_TASK_SOFTCONN;
 		memcpy(calldata->args.verifier.data, clp->cl_confirm.data,
@@ -7573,15 +7506,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	msg.rpc_resp = &calldata->res;
 	task_setup_data.callback_data = calldata;
 
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
-
-	status = calldata->rpc_status;
-
-	rpc_put_task(task);
-out:
-	return status;
+	return rpc_run_task(&task_setup_data);
 
 out_impl_id:
 	kfree(calldata->res.impl_id);
@@ -7591,8 +7516,69 @@ out_server_owner:
 	kfree(calldata->res.server_owner);
 out_calldata:
 	kfree(calldata);
+out:
 	nfs_put_client(clp);
-	goto out;
+	return ERR_PTR(status);
+}
+
+/*
+ * _nfs4_proc_exchange_id()
+ *
+ * Wrapper for EXCHANGE_ID operation.
+ */
+static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+			u32 sp4_how)
+{
+	struct rpc_task *task;
+	struct nfs41_exchange_id_args *argp;
+	struct nfs41_exchange_id_res *resp;
+	int status;
+
+	task = nfs4_run_exchange_id(clp, cred, sp4_how, NULL);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	argp = task->tk_msg.rpc_argp;
+	resp = task->tk_msg.rpc_resp;
+	status = task->tk_status;
+	if (status  != 0)
+		goto out;
+
+	status = nfs4_check_cl_exchange_flags(resp->flags);
+	if (status  != 0)
+		goto out;
+
+	status = nfs4_sp4_select_mode(clp, &resp->state_protect);
+	if (status != 0)
+		goto out;
+
+	clp->cl_clientid = resp->clientid;
+	clp->cl_exchange_flags = resp->flags;
+	clp->cl_seqid = resp->seqid;
+	/* Client ID is not confirmed */
+	if (!(resp->flags & EXCHGID4_FLAG_CONFIRMED_R))
+		clear_bit(NFS4_SESSION_ESTABLISHED,
+			  &clp->cl_session->session_state);
+
+	if (clp->cl_serverscope != NULL &&
+	    !nfs41_same_server_scope(clp->cl_serverscope,
+				resp->server_scope)) {
+		dprintk("%s: server_scope mismatch detected\n",
+			__func__);
+		set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
+	}
+
+	swap(clp->cl_serverowner, resp->server_owner);
+	swap(clp->cl_serverscope, resp->server_scope);
+	swap(clp->cl_implid, resp->impl_id);
+
+	/* Save the EXCHANGE_ID verifier for session trunk tests */
+	memcpy(clp->cl_confirm.data, argp->verifier.data,
+	       sizeof(clp->cl_confirm.data));
+out:
+	trace_nfs4_exchange_id(clp, status);
+	rpc_put_task(task);
+	return status;
 }
 
 /*
@@ -7615,13 +7601,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	/* try SP4_MACH_CRED if krb5i/p	*/
 	if (authflavor == RPC_AUTH_GSS_KRB5I ||
 	    authflavor == RPC_AUTH_GSS_KRB5P) {
-		status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL);
+		status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
 		if (!status)
 			return 0;
 	}
 
 	/* try SP4_NONE */
-	return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL);
+	return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
 }
 
 /**
@@ -7643,6 +7629,9 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
 			    void *data)
 {
 	struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
+	struct rpc_task *task;
+	int status;
+
 	u32 sp4_how;
 
 	dprintk("--> %s try %s\n", __func__,
@@ -7651,7 +7640,17 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
 	sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
 
 	/* Test connection for session trunking. Async exchange_id call */
-	return  _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
+	task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	status = task->tk_status;
+	if (status == 0)
+		status = nfs4_detect_session_trunking(adata->clp,
+				task->tk_msg.rpc_resp, xprt);
+
+	rpc_put_task(task);
+	return status;
 }
 EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
 
-- 
cgit v1.2.3-59-g8ed1b


From 937e3133cd0b99b0724aebd081fed372441c9915 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 07:32:50 -0400
Subject: NFSv4.1: Ensure we clear the SP4_MACH_CRED flags in
 nfs4_sp4_select_mode()

If the server changes so that it no longer supports SP4_MACH_CRED, or no
longer supports the same set of SP4_MACH_CRED functionality, then we want
to ensure that we clear the unsupported flags.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 34190c2c0900..d9fc34dedcf8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7319,7 +7319,9 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 		      1 << (OP_DESTROY_SESSION - 32) |
 		      1 << (OP_DESTROY_CLIENTID - 32)
 	};
+	unsigned long flags = 0;
 	unsigned int i;
+	int ret = 0;
 
 	if (sp->how == SP4_MACH_CRED) {
 		/* Print state protect result */
@@ -7335,7 +7337,8 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 		for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) {
 			if (sp->enforce.u.words[i] & ~supported_enforce[i]) {
 				dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
-				return -EINVAL;
+				ret = -EINVAL;
+				goto out;
 			}
 		}
 
@@ -7354,10 +7357,11 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 		    test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) {
 			dfprintk(MOUNT, "sp4_mach_cred:\n");
 			dfprintk(MOUNT, "  minimal mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_MINIMAL, &flags);
 		} else {
 			dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
-			return -EINVAL;
+			ret = -EINVAL;
+			goto out;
 		}
 
 		if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
@@ -7365,38 +7369,38 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 		    test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
 		    test_bit(OP_LOCKU, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  cleanup mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_CLEANUP, &flags);
 		}
 
 		if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  pnfs cleanup mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP,
-				&clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, &flags);
 		}
 
 		if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
 		    test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  secinfo mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_SECINFO, &flags);
 		}
 
 		if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) &&
 		    test_bit(OP_FREE_STATEID, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  stateid mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_STATEID, &flags);
 		}
 
 		if (test_bit(OP_WRITE, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  write mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_WRITE, &flags);
 		}
 
 		if (test_bit(OP_COMMIT, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  commit mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_COMMIT, &flags);
 		}
 	}
-
-	return 0;
+out:
+	clp->cl_sp4_flags = flags;
+	return ret;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From bfab281721edc250ac44b8b6787de32c95f9adfc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 08:17:34 -0400
Subject: NFSv4: Cleanup setting of the migration flags.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d9fc34dedcf8..15e91f003adc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7498,13 +7498,10 @@ nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 				sizeof(calldata->args.verifier.data));
 	}
 	calldata->args.client = clp;
-#ifdef CONFIG_NFS_V4_1_MIGRATION
-	calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-	EXCHGID4_FLAG_BIND_PRINC_STATEID |
-	EXCHGID4_FLAG_SUPP_MOVED_MIGR,
-#else
 	calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-	EXCHGID4_FLAG_BIND_PRINC_STATEID,
+	EXCHGID4_FLAG_BIND_PRINC_STATEID;
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+	calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR;
 #endif
 	msg.rpc_argp = &calldata->args;
 	msg.rpc_resp = &calldata->res;
-- 
cgit v1.2.3-59-g8ed1b


From 4e2fcac773902eefdeae81637ca73bbf0398d11f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 8 Aug 2017 09:06:18 -0400
Subject: NFSv4: Use correct inode in _nfs4_opendata_to_nfs4_state()

When doing open by filehandle we don't really want to look up a new inode,
but rather update the one we've got. Add a helper which does this for us.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d90132642340..5f11caefd36d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1659,6 +1659,28 @@ update:
 	return state;
 }
 
+static struct inode *
+nfs4_opendata_get_inode(struct nfs4_opendata *data)
+{
+	struct inode *inode;
+
+	switch (data->o_arg.claim) {
+	case NFS4_OPEN_CLAIM_NULL:
+	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+	case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+		if (!(data->f_attr.valid & NFS_ATTR_FATTR))
+			return ERR_PTR(-EAGAIN);
+		inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh,
+				&data->f_attr, data->f_label);
+		break;
+	default:
+		inode = d_inode(data->dentry);
+		ihold(inode);
+		nfs_refresh_inode(inode, &data->f_attr);
+	}
+	return inode;
+}
+
 static struct nfs4_state *
 _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 {
@@ -1672,10 +1694,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 		goto out;
 	}
 
-	ret = -EAGAIN;
-	if (!(data->f_attr.valid & NFS_ATTR_FATTR))
-		goto err;
-	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
+	inode = nfs4_opendata_get_inode(data);
 	ret = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto err;
@@ -2071,7 +2090,6 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 		data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
 	case NFS4_OPEN_CLAIM_FH:
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
-		nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
 	}
 	data->timestamp = jiffies;
 	if (nfs4_setup_sequence(data->o_arg.server->nfs_client,
-- 
cgit v1.2.3-59-g8ed1b


From 75e8c48b9ef30bfda38e7f6e5d807352fbf0b090 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 8 Aug 2017 10:38:07 -0400
Subject: NFSv4: Use the nfs4_state being recovered in
 _nfs4_opendata_to_nfs4_state()

If we're recovering a nfs4_state, then we should try to use that instead
of looking up a new stateid. Only do that if the inodes match, though.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5f11caefd36d..3923ac71a420 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1682,11 +1682,29 @@ nfs4_opendata_get_inode(struct nfs4_opendata *data)
 }
 
 static struct nfs4_state *
-_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data)
 {
+	struct nfs4_state *state;
 	struct inode *inode;
-	struct nfs4_state *state = NULL;
-	int ret;
+
+	inode = nfs4_opendata_get_inode(data);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (data->state != NULL && data->state->inode == inode) {
+		state = data->state;
+		atomic_inc(&state->count);
+	} else
+		state = nfs4_get_open_state(inode, data->owner);
+	iput(inode);
+	if (state == NULL)
+		state = ERR_PTR(-ENOMEM);
+	return state;
+}
+
+static struct nfs4_state *
+_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+{
+	struct nfs4_state *state;
 
 	if (!data->rpc_done) {
 		state = nfs4_try_open_cached(data);
@@ -1694,26 +1712,17 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 		goto out;
 	}
 
-	inode = nfs4_opendata_get_inode(data);
-	ret = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto err;
-	ret = -ENOMEM;
-	state = nfs4_get_open_state(inode, data->owner);
-	if (state == NULL)
-		goto err_put_inode;
+	state = nfs4_opendata_find_nfs4_state(data);
+	if (IS_ERR(state))
+		goto out;
+
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
 	update_open_stateid(state, &data->o_res.stateid, NULL,
 			data->o_arg.fmode);
-	iput(inode);
 out:
 	nfs_release_seqid(data->o_arg.seqid);
 	return state;
-err_put_inode:
-	iput(inode);
-err:
-	return ERR_PTR(ret);
 }
 
 static struct nfs4_state *
-- 
cgit v1.2.3-59-g8ed1b


From 6d17d653c9f152e113043d00f3bcf00c0eb5f5a2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 9 Jul 2017 13:45:27 -0400
Subject: NFS: Simplify page writeback

We don't expect the page header lock to ever be held across I/O, so
it should always be safe to wait for it, even if we're doing nonblocking
writebacks.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b1af5dee5e0a..1d447e37f472 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -366,7 +366,6 @@ nfs_page_group_clear_bits(struct nfs_page *req)
  * @inode - inode associated with request page group, must be holding inode lock
  * @head  - head request of page group, must be holding head lock
  * @req   - request that couldn't lock and needs to wait on the req bit lock
- * @nonblock - if true, don't actually wait
  *
  * NOTE: this must be called holding page_group bit lock and inode spin lock
  *       and BOTH will be released before returning.
@@ -375,7 +374,7 @@ nfs_page_group_clear_bits(struct nfs_page *req)
  */
 static int
 nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
-			  struct nfs_page *req, bool nonblock)
+			  struct nfs_page *req)
 	__releases(&inode->i_lock)
 {
 	struct nfs_page *tmp;
@@ -396,10 +395,7 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
 	/* release ref from nfs_page_find_head_request_locked */
 	nfs_release_request(head);
 
-	if (!nonblock)
-		ret = nfs_wait_on_request(req);
-	else
-		ret = -EAGAIN;
+	ret = nfs_wait_on_request(req);
 	nfs_release_request(req);
 
 	return ret;
@@ -464,7 +460,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
  *                              operations for this page.
  *
  * @page - the page used to lookup the "page group" of nfs_page structures
- * @nonblock - if true, don't block waiting for request locks
  *
  * This function joins all sub requests to the head request by first
  * locking all requests in the group, cancelling any pending operations
@@ -478,7 +473,7 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
  * error was encountered.
  */
 static struct nfs_page *
-nfs_lock_and_join_requests(struct page *page, bool nonblock)
+nfs_lock_and_join_requests(struct page *page)
 {
 	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_page *head, *subreq;
@@ -511,14 +506,9 @@ try_again:
 	if (ret < 0) {
 		spin_unlock(&inode->i_lock);
 
-		if (!nonblock && ret == -EAGAIN) {
-			nfs_page_group_lock_wait(head);
-			nfs_release_request(head);
-			goto try_again;
-		}
-
+		nfs_page_group_lock_wait(head);
 		nfs_release_request(head);
-		return ERR_PTR(ret);
+		goto try_again;
 	}
 
 	/* lock each request in the page group */
@@ -543,7 +533,7 @@ try_again:
 			/* releases page group bit lock and
 			 * inode spin lock and all references */
 			ret = nfs_unroll_locks_and_wait(inode, head,
-				subreq, nonblock);
+				subreq);
 
 			if (ret == 0)
 				goto try_again;
@@ -624,12 +614,12 @@ nfs_error_is_fatal_on_server(int err)
  * May return an error if the user signalled nfs_wait_on_request().
  */
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
-				struct page *page, bool nonblock)
+				struct page *page)
 {
 	struct nfs_page *req;
 	int ret = 0;
 
-	req = nfs_lock_and_join_requests(page, nonblock);
+	req = nfs_lock_and_join_requests(page);
 	if (!req)
 		goto out;
 	ret = PTR_ERR(req);
@@ -672,7 +662,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
 	int ret;
 
 	nfs_pageio_cond_complete(pgio, page_index(page));
-	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
+	ret = nfs_page_async_flush(pgio, page);
 	if (ret == -EAGAIN) {
 		redirty_page_for_writepage(wbc, page);
 		ret = 0;
@@ -2015,7 +2005,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 
 	/* blocking call to cancel all requests and join to a single (head)
 	 * request */
-	req = nfs_lock_and_join_requests(page, false);
+	req = nfs_lock_and_join_requests(page);
 
 	if (IS_ERR(req)) {
 		ret = PTR_ERR(req);
-- 
cgit v1.2.3-59-g8ed1b


From 82749dd4efcec8e90fa7769eec3dd0afa2e3396a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 09:30:13 -0400
Subject: NFS: Reduce lock contention in nfs_page_find_head_request()

Add a lockless check for whether or not the page might be carrying
an existing writeback before we grab the inode->i_lock.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1d447e37f472..06e150c4e315 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -190,9 +190,11 @@ static struct nfs_page *nfs_page_find_head_request(struct page *page)
 	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_page *req = NULL;
 
-	spin_lock(&inode->i_lock);
-	req = nfs_page_find_head_request_locked(NFS_I(inode), page);
-	spin_unlock(&inode->i_lock);
+	if (PagePrivate(page) || PageSwapCache(page)) {
+		spin_lock(&inode->i_lock);
+		req = nfs_page_find_head_request_locked(NFS_I(inode), page);
+		spin_unlock(&inode->i_lock);
+	}
 	return req;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 1403390d8366c717139cab26b8e94d943915fa12 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 09:37:12 -0400
Subject: NFS: Reduce lock contention in nfs_try_to_update_request()

Micro-optimisation to move the lockless check into the for(;;) loop.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 06e150c4e315..bb019096c331 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1097,13 +1097,12 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 	unsigned int end;
 	int error;
 
-	if (!PagePrivate(page))
-		return NULL;
-
 	end = offset + bytes;
-	spin_lock(&inode->i_lock);
 
 	for (;;) {
+		if (!(PagePrivate(page) || PageSwapCache(page)))
+			return NULL;
+		spin_lock(&inode->i_lock);
 		req = nfs_page_find_head_request_locked(NFS_I(inode), page);
 		if (req == NULL)
 			goto out_unlock;
@@ -1132,7 +1131,6 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		nfs_release_request(req);
 		if (error != 0)
 			goto out_err;
-		spin_lock(&inode->i_lock);
 	}
 
 	/* Okay, the request matches. Update the region */
-- 
cgit v1.2.3-59-g8ed1b


From 08fead2ae5a9953d47677416cc5f6bcae448480d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 18 Jul 2017 19:31:10 -0400
Subject: NFS: Ensure we always dereference the page head last

This fixes a race with nfs_page_group_sync_on_bit() whereby the
call to wake_up_bit() in nfs_page_group_unlock() could occur after
the page header had been freed.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index de9066a92c0d..a6f2bbd709ba 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -306,14 +306,11 @@ static void
 nfs_page_group_destroy(struct kref *kref)
 {
 	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+	struct nfs_page *head = req->wb_head;
 	struct nfs_page *tmp, *next;
 
-	/* subrequests must release the ref on the head request */
-	if (req->wb_head != req)
-		nfs_release_request(req->wb_head);
-
 	if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
-		return;
+		goto out;
 
 	tmp = req;
 	do {
@@ -324,6 +321,10 @@ nfs_page_group_destroy(struct kref *kref)
 		nfs_free_request(tmp);
 		tmp = next;
 	} while (tmp != req);
+out:
+	/* subrequests must release the ref on the head request */
+	if (head != req)
+		nfs_release_request(head);
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 7cb9cd9aa2eafe869935d4168031f1ed376d924c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 10:11:11 -0400
Subject: NFS: Fix a reference and lock leak in nfs_lock_and_join_requests()

Yes, this is a situation that should never happen (hence the WARN_ON)
but we should still ensure that we free up the locks and references to
the faulty pages.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index bb019096c331..1ca759719429 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -526,8 +526,7 @@ try_again:
 		} else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
 			    ((subreq->wb_offset + subreq->wb_bytes) >
 			     (head->wb_offset + total_bytes)))) {
-			nfs_page_group_unlock(head);
-			spin_unlock(&inode->i_lock);
+			nfs_unroll_locks_and_wait(inode, head, subreq);
 			return ERR_PTR(-EIO);
 		}
 
-- 
cgit v1.2.3-59-g8ed1b


From a0e265bc78010d2d831a968d4cea3c40a0efac8b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 10:29:32 -0400
Subject: NFS: Fix an ABBA issue in nfs_lock_and_join_requests()

All other callers of nfs_page_group_lock() appear to already hold the
page lock on the head page, so doing it in the opposite order here
is inefficient, although not deadlock prone since we roll back all
locks on contention.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1ca759719429..c940e615f5dc 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -383,7 +383,7 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
 	int ret;
 
 	/* relinquish all the locks successfully grabbed this run */
-	for (tmp = head ; tmp != req; tmp = tmp->wb_this_page)
+	for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page)
 		nfs_unlock_request(tmp);
 
 	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
@@ -395,7 +395,7 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
 	spin_unlock(&inode->i_lock);
 
 	/* release ref from nfs_page_find_head_request_locked */
-	nfs_release_request(head);
+	nfs_unlock_and_release_request(head);
 
 	ret = nfs_wait_on_request(req);
 	nfs_release_request(req);
@@ -484,10 +484,6 @@ nfs_lock_and_join_requests(struct page *page)
 	int ret;
 
 try_again:
-	total_bytes = 0;
-
-	WARN_ON_ONCE(destroy_list);
-
 	spin_lock(&inode->i_lock);
 
 	/*
@@ -502,6 +498,16 @@ try_again:
 		return NULL;
 	}
 
+	/* lock the page head first in order to avoid an ABBA inefficiency */
+	if (!nfs_lock_request(head)) {
+		spin_unlock(&inode->i_lock);
+		ret = nfs_wait_on_request(head);
+		nfs_release_request(head);
+		if (ret < 0)
+			return ERR_PTR(ret);
+		goto try_again;
+	}
+
 	/* holding inode lock, so always make a non-blocking call to try the
 	 * page group lock */
 	ret = nfs_page_group_lock(head, true);
@@ -509,13 +515,14 @@ try_again:
 		spin_unlock(&inode->i_lock);
 
 		nfs_page_group_lock_wait(head);
-		nfs_release_request(head);
+		nfs_unlock_and_release_request(head);
 		goto try_again;
 	}
 
 	/* lock each request in the page group */
-	subreq = head;
-	do {
+	total_bytes = head->wb_bytes;
+	for (subreq = head->wb_this_page; subreq != head;
+			subreq = subreq->wb_this_page) {
 		/*
 		 * Subrequests are always contiguous, non overlapping
 		 * and in order - but may be repeated (mirrored writes).
@@ -541,9 +548,7 @@ try_again:
 
 			return ERR_PTR(ret);
 		}
-
-		subreq = subreq->wb_this_page;
-	} while (subreq != head);
+	}
 
 	/* Now that all requests are locked, make sure they aren't on any list.
 	 * Commit list removal accounting is done after locks are dropped */
-- 
cgit v1.2.3-59-g8ed1b


From e14bebf6de11a4b8476cf2b0a75bf7c3e69112d5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 11:11:49 -0400
Subject: NFS: Don't check request offset and size without holding a lock

Request offsets and sizes are not guaranteed to be stable unless you
are holding the request locked.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c940e615f5dc..84b6818e5278 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -523,6 +523,17 @@ try_again:
 	total_bytes = head->wb_bytes;
 	for (subreq = head->wb_this_page; subreq != head;
 			subreq = subreq->wb_this_page) {
+		if (!nfs_lock_request(subreq)) {
+			/* releases page group bit lock and
+			 * inode spin lock and all references */
+			ret = nfs_unroll_locks_and_wait(inode, head,
+				subreq);
+
+			if (ret == 0)
+				goto try_again;
+
+			return ERR_PTR(ret);
+		}
 		/*
 		 * Subrequests are always contiguous, non overlapping
 		 * and in order - but may be repeated (mirrored writes).
@@ -533,21 +544,10 @@ try_again:
 		} else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
 			    ((subreq->wb_offset + subreq->wb_bytes) >
 			     (head->wb_offset + total_bytes)))) {
+			nfs_unlock_request(subreq);
 			nfs_unroll_locks_and_wait(inode, head, subreq);
 			return ERR_PTR(-EIO);
 		}
-
-		if (!nfs_lock_request(subreq)) {
-			/* releases page group bit lock and
-			 * inode spin lock and all references */
-			ret = nfs_unroll_locks_and_wait(inode, head,
-				subreq);
-
-			if (ret == 0)
-				goto try_again;
-
-			return ERR_PTR(ret);
-		}
 	}
 
 	/* Now that all requests are locked, make sure they aren't on any list.
-- 
cgit v1.2.3-59-g8ed1b


From 31a01f093edbc687e783a4c8adcd76d3cc91a559 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 18 Jul 2017 19:18:49 -0400
Subject: NFS: Don't unlock writebacks before declaring PG_WB_END

We don't want nfs_lock_and_join_requests() to start fiddling with
the request before the call to nfs_page_group_sync_on_bit().

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 84b6818e5278..bb38c881fc48 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -335,8 +335,11 @@ static void nfs_end_page_writeback(struct nfs_page *req)
 {
 	struct inode *inode = page_file_mapping(req->wb_page)->host;
 	struct nfs_server *nfss = NFS_SERVER(inode);
+	bool is_done;
 
-	if (!nfs_page_group_sync_on_bit(req, PG_WB_END))
+	is_done = nfs_page_group_sync_on_bit(req, PG_WB_END);
+	nfs_unlock_request(req);
+	if (!is_done)
 		return;
 
 	end_page_writeback(req->wb_page);
@@ -596,7 +599,6 @@ try_again:
 
 static void nfs_write_error_remove_page(struct nfs_page *req)
 {
-	nfs_unlock_request(req);
 	nfs_end_page_writeback(req);
 	generic_error_remove_page(page_file_mapping(req->wb_page),
 				  req->wb_page);
@@ -1019,7 +1021,6 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 remove_req:
 		nfs_inode_remove_request(req);
 next:
-		nfs_unlock_request(req);
 		nfs_end_page_writeback(req);
 		nfs_release_request(req);
 	}
@@ -1406,7 +1407,6 @@ static void nfs_redirty_request(struct nfs_page *req)
 {
 	nfs_mark_request_dirty(req);
 	set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
-	nfs_unlock_request(req);
 	nfs_end_page_writeback(req);
 	nfs_release_request(req);
 }
-- 
cgit v1.2.3-59-g8ed1b


From b66aaa8dfeda7b5c7df513cf3b36e1290fa84055 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 18 Jul 2017 15:22:12 -0400
Subject: NFS: Fix the inode request accounting when pages have subrequests

Both nfs_destroy_unlinked_subrequests() and nfs_lock_and_join_requests()
manipulate the inode reference flags without adjusting NFS_I(inode)->nrequests.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index bb38c881fc48..ee981353d4aa 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -418,7 +418,8 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
  */
 static void
 nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
-				 struct nfs_page *old_head)
+				 struct nfs_page *old_head,
+				 struct inode *inode)
 {
 	while (destroy_list) {
 		struct nfs_page *subreq = destroy_list;
@@ -443,9 +444,12 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 			nfs_page_group_clear_bits(subreq);
 
 			/* release the PG_INODE_REF reference */
-			if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags))
+			if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
 				nfs_release_request(subreq);
-			else
+				spin_lock(&inode->i_lock);
+				NFS_I(inode)->nrequests--;
+				spin_unlock(&inode->i_lock);
+			} else
 				WARN_ON_ONCE(1);
 		} else {
 			WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags));
@@ -572,25 +576,24 @@ try_again:
 		head->wb_bytes = total_bytes;
 	}
 
+	/* Postpone destruction of this request */
+	if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) {
+		set_bit(PG_INODE_REF, &head->wb_flags);
+		kref_get(&head->wb_kref);
+		NFS_I(inode)->nrequests++;
+	}
+
 	/*
 	 * prepare head request to be added to new pgio descriptor
 	 */
 	nfs_page_group_clear_bits(head);
 
-	/*
-	 * some part of the group was still on the inode list - otherwise
-	 * the group wouldn't be involved in async write.
-	 * grab a reference for the head request, iff it needs one.
-	 */
-	if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags))
-		kref_get(&head->wb_kref);
-
 	nfs_page_group_unlock(head);
 
 	/* drop lock to clean uprequests on destroy list */
 	spin_unlock(&inode->i_lock);
 
-	nfs_destroy_unlinked_subrequests(destroy_list, head);
+	nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
 
 	/* still holds ref on head from nfs_page_find_head_request_locked
 	 * and still has lock on head from lock loop */
-- 
cgit v1.2.3-59-g8ed1b


From f6032f216fca8a1fa7f43a652f26cdf633183745 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 19:50:23 -0400
Subject: NFS: Teach nfs_try_to_update_request() to deal with request
 page_groups

Simplify the code, and avoid some flushes to disk.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 60 ++++++++++++++++++++--------------------------------------
 1 file changed, 20 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ee981353d4aa..0b4d1ef168e0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1107,39 +1107,19 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 
 	end = offset + bytes;
 
-	for (;;) {
-		if (!(PagePrivate(page) || PageSwapCache(page)))
-			return NULL;
-		spin_lock(&inode->i_lock);
-		req = nfs_page_find_head_request_locked(NFS_I(inode), page);
-		if (req == NULL)
-			goto out_unlock;
-
-		/* should be handled by nfs_flush_incompatible */
-		WARN_ON_ONCE(req->wb_head != req);
-		WARN_ON_ONCE(req->wb_this_page != req);
-
-		rqend = req->wb_offset + req->wb_bytes;
-		/*
-		 * Tell the caller to flush out the request if
-		 * the offsets are non-contiguous.
-		 * Note: nfs_flush_incompatible() will already
-		 * have flushed out requests having wrong owners.
-		 */
-		if (offset > rqend
-		    || end < req->wb_offset)
-			goto out_flushme;
-
-		if (nfs_lock_request(req))
-			break;
+	req = nfs_lock_and_join_requests(page);
+	if (IS_ERR_OR_NULL(req))
+		return req;
 
-		/* The request is locked, so wait and then retry */
-		spin_unlock(&inode->i_lock);
-		error = nfs_wait_on_request(req);
-		nfs_release_request(req);
-		if (error != 0)
-			goto out_err;
-	}
+	rqend = req->wb_offset + req->wb_bytes;
+	/*
+	 * Tell the caller to flush out the request if
+	 * the offsets are non-contiguous.
+	 * Note: nfs_flush_incompatible() will already
+	 * have flushed out requests having wrong owners.
+	 */
+	if (offset > rqend || end < req->wb_offset)
+		goto out_flushme;
 
 	/* Okay, the request matches. Update the region */
 	if (offset < req->wb_offset) {
@@ -1150,17 +1130,17 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		req->wb_bytes = end - req->wb_offset;
 	else
 		req->wb_bytes = rqend - req->wb_offset;
-out_unlock:
-	if (req)
-		nfs_clear_request_commit(req);
-	spin_unlock(&inode->i_lock);
 	return req;
 out_flushme:
-	spin_unlock(&inode->i_lock);
-	nfs_release_request(req);
+	/*
+	 * Note: we mark the request dirty here because
+	 * nfs_lock_and_join_requests() cannot preserve
+	 * commit flags, so we have to replay the write.
+	 */
+	nfs_mark_request_dirty(req);
+	nfs_unlock_and_release_request(req);
 	error = nfs_wb_page(inode, page);
-out_err:
-	return ERR_PTR(error);
+	return (error < 0) ? ERR_PTR(error) : NULL;
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 7e6cca6caf7230b049bd681c5400b01c365ee452 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 20:00:46 -0400
Subject: NFS: Remove page group limit in nfs_flush_incompatible()

nfs_try_to_update_request() should be able to cope now.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0b4d1ef168e0..08c1ce968cce 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1205,8 +1205,6 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 		l_ctx = req->wb_lock_context;
 		do_flush = req->wb_page != page ||
 			!nfs_match_open_context(req->wb_context, ctx);
-		/* for now, flush if more than 1 request in page_group */
-		do_flush |= req->wb_this_page != req;
 		if (l_ctx && flctx &&
 		    !(list_empty_careful(&flctx->flc_posix) &&
 		      list_empty_careful(&flctx->flc_flock))) {
-- 
cgit v1.2.3-59-g8ed1b


From b5bab9bf91324a7fe21b365d6966cfd087d08e3a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 10:34:21 -0400
Subject: NFS: Reduce inode->i_lock contention in nfs_lock_and_join_requests()

We should no longer need the inode->i_lock, now that we've
straightened out the request locking. The locking scheme is now:

1) Lock page head request
2) Lock the page group
3) Lock the subrequests one by one

Note that there is a subtle race with nfs_inode_remove_request() due
to the fact that the latter does not lock the page head, when removing
it from the struct page. Only the last subrequest is locked, hence
we need to re-check that the PagePrivate(page) is still set after
we've locked all the subrequests.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 08c1ce968cce..ff7c90c7ff79 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -372,15 +372,14 @@ nfs_page_group_clear_bits(struct nfs_page *req)
  * @head  - head request of page group, must be holding head lock
  * @req   - request that couldn't lock and needs to wait on the req bit lock
  *
- * NOTE: this must be called holding page_group bit lock and inode spin lock
- *       and BOTH will be released before returning.
+ * NOTE: this must be called holding page_group bit lock
+ *       which will be released before returning.
  *
  * returns 0 on success, < 0 on error.
  */
 static int
 nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
 			  struct nfs_page *req)
-	__releases(&inode->i_lock)
 {
 	struct nfs_page *tmp;
 	int ret;
@@ -395,7 +394,6 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
 	kref_get(&req->wb_kref);
 
 	nfs_page_group_unlock(head);
-	spin_unlock(&inode->i_lock);
 
 	/* release ref from nfs_page_find_head_request_locked */
 	nfs_unlock_and_release_request(head);
@@ -491,8 +489,9 @@ nfs_lock_and_join_requests(struct page *page)
 	int ret;
 
 try_again:
+	if (!(PagePrivate(page) || PageSwapCache(page)))
+		return NULL;
 	spin_lock(&inode->i_lock);
-
 	/*
 	 * A reference is taken only on the head request which acts as a
 	 * reference to the whole page group - the group will not be destroyed
@@ -514,16 +513,12 @@ try_again:
 			return ERR_PTR(ret);
 		goto try_again;
 	}
+	spin_unlock(&inode->i_lock);
 
-	/* holding inode lock, so always make a non-blocking call to try the
-	 * page group lock */
-	ret = nfs_page_group_lock(head, true);
+	ret = nfs_page_group_lock(head, false);
 	if (ret < 0) {
-		spin_unlock(&inode->i_lock);
-
-		nfs_page_group_lock_wait(head);
 		nfs_unlock_and_release_request(head);
-		goto try_again;
+		return ERR_PTR(ret);
 	}
 
 	/* lock each request in the page group */
@@ -531,8 +526,10 @@ try_again:
 	for (subreq = head->wb_this_page; subreq != head;
 			subreq = subreq->wb_this_page) {
 		if (!nfs_lock_request(subreq)) {
-			/* releases page group bit lock and
-			 * inode spin lock and all references */
+			/*
+			 * releases page group bit lock and
+			 * page locks and all references
+			 */
 			ret = nfs_unroll_locks_and_wait(inode, head,
 				subreq);
 
@@ -580,7 +577,9 @@ try_again:
 	if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) {
 		set_bit(PG_INODE_REF, &head->wb_flags);
 		kref_get(&head->wb_kref);
+		spin_lock(&inode->i_lock);
 		NFS_I(inode)->nrequests++;
+		spin_unlock(&inode->i_lock);
 	}
 
 	/*
@@ -590,11 +589,14 @@ try_again:
 
 	nfs_page_group_unlock(head);
 
-	/* drop lock to clean uprequests on destroy list */
-	spin_unlock(&inode->i_lock);
-
 	nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
 
+	/* Did we lose a race with nfs_inode_remove_request()? */
+	if (!(PagePrivate(page) || PageSwapCache(page))) {
+		nfs_unlock_and_release_request(head);
+		return NULL;
+	}
+
 	/* still holds ref on head from nfs_page_find_head_request_locked
 	 * and still has lock on head from lock loop */
 	return head;
@@ -968,7 +970,7 @@ nfs_clear_page_commit(struct page *page)
 		    WB_RECLAIMABLE);
 }
 
-/* Called holding inode (/cinfo) lock */
+/* Called holding the request lock on @req */
 static void
 nfs_clear_request_commit(struct nfs_page *req)
 {
@@ -977,9 +979,11 @@ nfs_clear_request_commit(struct nfs_page *req)
 		struct nfs_commit_info cinfo;
 
 		nfs_init_cinfo_from_inode(&cinfo, inode);
+		spin_lock(&inode->i_lock);
 		if (!pnfs_clear_request_commit(req, &cinfo)) {
 			nfs_request_remove_commit_list(req, &cinfo);
 		}
+		spin_unlock(&inode->i_lock);
 		nfs_clear_page_commit(req->wb_page);
 	}
 }
-- 
cgit v1.2.3-59-g8ed1b


From 74a6d4b5ae4ec7e93c72a92decb2f8c16c812416 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 19 Jul 2017 08:23:10 -0400
Subject: NFS: Further optimise nfs_lock_and_join_requests()

When locking the entire group in order to remove subrequests,
the locks are always taken in order, and with the page group
lock being taken after the page head is locked. The intention
is that:

1) The lock on the group head guarantees that requests may not
   be removed from the group (although new entries could be appended
   if we're not holding the group lock).
2) It is safe to drop and retake the page group lock while iterating
   through the list, in particular when waiting for a subrequest lock.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 45 ++++++++++++++++++---------------------------
 1 file changed, 18 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ff7c90c7ff79..1ee5d89380d9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -377,31 +377,17 @@ nfs_page_group_clear_bits(struct nfs_page *req)
  *
  * returns 0 on success, < 0 on error.
  */
-static int
-nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
+static void
+nfs_unroll_locks(struct inode *inode, struct nfs_page *head,
 			  struct nfs_page *req)
 {
 	struct nfs_page *tmp;
-	int ret;
 
 	/* relinquish all the locks successfully grabbed this run */
 	for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page)
 		nfs_unlock_request(tmp);
 
 	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
-
-	/* grab a ref on the request that will be waited on */
-	kref_get(&req->wb_kref);
-
-	nfs_page_group_unlock(head);
-
-	/* release ref from nfs_page_find_head_request_locked */
-	nfs_unlock_and_release_request(head);
-
-	ret = nfs_wait_on_request(req);
-	nfs_release_request(req);
-
-	return ret;
 }
 
 /*
@@ -525,18 +511,21 @@ try_again:
 	total_bytes = head->wb_bytes;
 	for (subreq = head->wb_this_page; subreq != head;
 			subreq = subreq->wb_this_page) {
-		if (!nfs_lock_request(subreq)) {
+
+		while (!nfs_lock_request(subreq)) {
 			/*
-			 * releases page group bit lock and
-			 * page locks and all references
+			 * Unlock page to allow nfs_page_group_sync_on_bit()
+			 * to succeed
 			 */
-			ret = nfs_unroll_locks_and_wait(inode, head,
-				subreq);
-
-			if (ret == 0)
-				goto try_again;
-
-			return ERR_PTR(ret);
+			nfs_page_group_unlock(head);
+			ret = nfs_wait_on_request(subreq);
+			if (!ret)
+				ret = nfs_page_group_lock(head, false);
+			if (ret < 0) {
+				nfs_unroll_locks(inode, head, subreq);
+				nfs_unlock_and_release_request(head);
+				return ERR_PTR(ret);
+			}
 		}
 		/*
 		 * Subrequests are always contiguous, non overlapping
@@ -549,7 +538,9 @@ try_again:
 			    ((subreq->wb_offset + subreq->wb_bytes) >
 			     (head->wb_offset + total_bytes)))) {
 			nfs_unlock_request(subreq);
-			nfs_unroll_locks_and_wait(inode, head, subreq);
+			nfs_unroll_locks(inode, head, subreq);
+			nfs_page_group_unlock(head);
+			nfs_unlock_and_release_request(head);
 			return ERR_PTR(-EIO);
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 5b2b5187fa85665f0c47029ecaf49186ec138d9b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 19 Jul 2017 10:06:36 -0400
Subject: NFS: Fix nfs_page_group_destroy() and nfs_lock_and_join_requests()
 race cases

Since nfs_page_group_destroy() does not take any locks on the requests
to be freed, we need to ensure that we don't inadvertently free the
request in nfs_destroy_unlinked_subrequests() while the last reference
is being released elsewhere.

Do this by:

1) Taking a reference to the request unless it is already being freed
2) Checking (under the page group lock) if PG_TEARDOWN is already set before
   freeing an unreferenced request in nfs_destroy_unlinked_subrequests()

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 58 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1ee5d89380d9..ffb9934607ef 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -384,10 +384,11 @@ nfs_unroll_locks(struct inode *inode, struct nfs_page *head,
 	struct nfs_page *tmp;
 
 	/* relinquish all the locks successfully grabbed this run */
-	for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page)
-		nfs_unlock_request(tmp);
-
-	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
+	for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
+		if (!kref_read(&tmp->wb_kref))
+			continue;
+		nfs_unlock_and_release_request(tmp);
+	}
 }
 
 /*
@@ -414,36 +415,32 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 		WARN_ON_ONCE(old_head != subreq->wb_head);
 
 		/* make sure old group is not used */
-		subreq->wb_head = subreq;
 		subreq->wb_this_page = subreq;
 
-		/* subreq is now totally disconnected from page group or any
-		 * write / commit lists. last chance to wake any waiters */
-		nfs_unlock_request(subreq);
-
-		if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) {
-			/* release ref on old head request */
-			nfs_release_request(old_head);
+		/* Note: races with nfs_page_group_destroy() */
+		if (!kref_read(&subreq->wb_kref)) {
+			bool freeme = test_bit(PG_TEARDOWN, &subreq->wb_flags);
 
 			nfs_page_group_clear_bits(subreq);
+			/* Check if we raced with nfs_page_group_destroy() */
+			if (freeme)
+				nfs_free_request(subreq);
+			continue;
+		}
 
-			/* release the PG_INODE_REF reference */
-			if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
-				nfs_release_request(subreq);
-				spin_lock(&inode->i_lock);
-				NFS_I(inode)->nrequests--;
-				spin_unlock(&inode->i_lock);
-			} else
-				WARN_ON_ONCE(1);
-		} else {
-			WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags));
-			/* zombie requests have already released the last
-			 * reference and were waiting on the rest of the
-			 * group to complete. Since it's no longer part of a
-			 * group, simply free the request */
-			nfs_page_group_clear_bits(subreq);
-			nfs_free_request(subreq);
+		subreq->wb_head = subreq;
+
+		if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
+			nfs_release_request(subreq);
+			spin_lock(&inode->i_lock);
+			NFS_I(inode)->nrequests--;
+			spin_unlock(&inode->i_lock);
 		}
+
+		nfs_page_group_clear_bits(subreq);
+		/* subreq is now totally disconnected from page group or any
+		 * write / commit lists. last chance to wake any waiters */
+		nfs_unlock_and_release_request(subreq);
 	}
 }
 
@@ -512,6 +509,8 @@ try_again:
 	for (subreq = head->wb_this_page; subreq != head;
 			subreq = subreq->wb_this_page) {
 
+		if (!kref_get_unless_zero(&subreq->wb_kref))
+			continue;
 		while (!nfs_lock_request(subreq)) {
 			/*
 			 * Unlock page to allow nfs_page_group_sync_on_bit()
@@ -523,6 +522,7 @@ try_again:
 				ret = nfs_page_group_lock(head, false);
 			if (ret < 0) {
 				nfs_unroll_locks(inode, head, subreq);
+				nfs_release_request(subreq);
 				nfs_unlock_and_release_request(head);
 				return ERR_PTR(ret);
 			}
@@ -537,8 +537,8 @@ try_again:
 		} else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
 			    ((subreq->wb_offset + subreq->wb_bytes) >
 			     (head->wb_offset + total_bytes)))) {
-			nfs_unlock_request(subreq);
 			nfs_unroll_locks(inode, head, subreq);
+			nfs_unlock_and_release_request(subreq);
 			nfs_page_group_unlock(head);
 			nfs_unlock_and_release_request(head);
 			return ERR_PTR(-EIO);
-- 
cgit v1.2.3-59-g8ed1b


From 902a4c00462a755fe4a6ca655813c8b2a51fab4c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 19 Jul 2017 13:50:07 -0400
Subject: NFS: Remove nfs_page_group_clear_bits()

At this point, the only flags we ever expect to see set on the
subrequests are PG_REMOVE and PG_TEARDOWN.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 29 +++--------------------------
 1 file changed, 3 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ffb9934607ef..20d44ea328b6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -347,22 +347,6 @@ static void nfs_end_page_writeback(struct nfs_page *req)
 		clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 }
 
-
-/* nfs_page_group_clear_bits
- *   @req - an nfs request
- * clears all page group related bits from @req
- */
-static void
-nfs_page_group_clear_bits(struct nfs_page *req)
-{
-	clear_bit(PG_TEARDOWN, &req->wb_flags);
-	clear_bit(PG_UNLOCKPAGE, &req->wb_flags);
-	clear_bit(PG_UPTODATE, &req->wb_flags);
-	clear_bit(PG_WB_END, &req->wb_flags);
-	clear_bit(PG_REMOVE, &req->wb_flags);
-}
-
-
 /*
  * nfs_unroll_locks_and_wait -  unlock all newly locked reqs and wait on @req
  *
@@ -417,13 +401,12 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 		/* make sure old group is not used */
 		subreq->wb_this_page = subreq;
 
+		clear_bit(PG_REMOVE, &subreq->wb_flags);
+
 		/* Note: races with nfs_page_group_destroy() */
 		if (!kref_read(&subreq->wb_kref)) {
-			bool freeme = test_bit(PG_TEARDOWN, &subreq->wb_flags);
-
-			nfs_page_group_clear_bits(subreq);
 			/* Check if we raced with nfs_page_group_destroy() */
-			if (freeme)
+			if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags))
 				nfs_free_request(subreq);
 			continue;
 		}
@@ -437,7 +420,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 			spin_unlock(&inode->i_lock);
 		}
 
-		nfs_page_group_clear_bits(subreq);
 		/* subreq is now totally disconnected from page group or any
 		 * write / commit lists. last chance to wake any waiters */
 		nfs_unlock_and_release_request(subreq);
@@ -573,11 +555,6 @@ try_again:
 		spin_unlock(&inode->i_lock);
 	}
 
-	/*
-	 * prepare head request to be added to new pgio descriptor
-	 */
-	nfs_page_group_clear_bits(head);
-
 	nfs_page_group_unlock(head);
 
 	nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
-- 
cgit v1.2.3-59-g8ed1b


From dee83046e73cb7ebbbae955c1ef0f4f55a0f44f9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 10:51:02 -0400
Subject: NFS: Remove unused function nfs_page_group_lock_wait()

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c        | 21 ---------------------
 include/linux/nfs_page.h |  1 -
 2 files changed, 22 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a6f2bbd709ba..ced7974622dd 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -165,27 +165,6 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock)
 	return -EAGAIN;
 }
 
-/*
- * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it
- * @req - a request in the group
- *
- * This is a blocking call to wait for the group lock to be cleared.
- */
-void
-nfs_page_group_lock_wait(struct nfs_page *req)
-{
-	struct nfs_page *head = req->wb_head;
-
-	WARN_ON_ONCE(head != head->wb_head);
-
-	if (!test_bit(PG_HEADLOCK, &head->wb_flags))
-		return;
-	set_bit(PG_CONTENDED1, &head->wb_flags);
-	smp_mb__after_atomic();
-	wait_on_bit(&head->wb_flags, PG_HEADLOCK,
-		TASK_UNINTERRUPTIBLE);
-}
-
 /*
  * nfs_page_group_unlock - unlock the head of the page group
  * @req - request in group that is to be unlocked
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index d67b67ae6c8b..de1d24cedaa2 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -140,7 +140,6 @@ extern  int nfs_wait_on_request(struct nfs_page *);
 extern	void nfs_unlock_request(struct nfs_page *req);
 extern	void nfs_unlock_and_release_request(struct nfs_page *);
 extern int nfs_page_group_lock(struct nfs_page *, bool);
-extern void nfs_page_group_lock_wait(struct nfs_page *);
 extern void nfs_page_group_unlock(struct nfs_page *);
 extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
 extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);
-- 
cgit v1.2.3-59-g8ed1b


From 1344b7ea172b4911a8ee8a6ff26c5bc6b5abb302 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 10:54:14 -0400
Subject: NFS: Remove unused parameter from nfs_page_group_lock()

nfs_page_group_lock() is now always called with the 'nonblock'
parameter set to 'false'.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c        | 31 +++++++++++--------------------
 fs/nfs/write.c           |  6 +++---
 include/linux/nfs_page.h |  2 +-
 3 files changed, 15 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ced7974622dd..af6731dd4324 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -134,19 +134,14 @@ EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
 /*
  * nfs_page_group_lock - lock the head of the page group
  * @req - request in group that is to be locked
- * @nonblock - if true don't block waiting for lock
  *
- * this lock must be held if modifying the page group list
+ * this lock must be held when traversing or modifying the page
+ * group list
  *
- * return 0 on success, < 0 on error: -EDELAY if nonblocking or the
- * result from wait_on_bit_lock
- *
- * NOTE: calling with nonblock=false should always have set the
- *       lock bit (see fs/buffer.c and other uses of wait_on_bit_lock
- *       with TASK_UNINTERRUPTIBLE), so there is no need to check the result.
+ * return 0 on success, < 0 on error
  */
 int
-nfs_page_group_lock(struct nfs_page *req, bool nonblock)
+nfs_page_group_lock(struct nfs_page *req)
 {
 	struct nfs_page *head = req->wb_head;
 
@@ -155,14 +150,10 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock)
 	if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
 		return 0;
 
-	if (!nonblock) {
-		set_bit(PG_CONTENDED1, &head->wb_flags);
-		smp_mb__after_atomic();
-		return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+	set_bit(PG_CONTENDED1, &head->wb_flags);
+	smp_mb__after_atomic();
+	return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
 				TASK_UNINTERRUPTIBLE);
-	}
-
-	return -EAGAIN;
 }
 
 /*
@@ -225,7 +216,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
 {
 	bool ret;
 
-	nfs_page_group_lock(req, false);
+	nfs_page_group_lock(req);
 	ret = nfs_page_group_sync_on_bit_locked(req, bit);
 	nfs_page_group_unlock(req);
 
@@ -1016,7 +1007,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 	unsigned int bytes_left = 0;
 	unsigned int offset, pgbase;
 
-	nfs_page_group_lock(req, false);
+	nfs_page_group_lock(req);
 
 	subreq = req;
 	bytes_left = subreq->wb_bytes;
@@ -1038,7 +1029,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 			if (mirror->pg_recoalesce)
 				return 0;
 			/* retry add_request for this subreq */
-			nfs_page_group_lock(req, false);
+			nfs_page_group_lock(req);
 			continue;
 		}
 
@@ -1135,7 +1126,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 
 	for (midx = 0; midx < desc->pg_mirror_count; midx++) {
 		if (midx) {
-			nfs_page_group_lock(req, false);
+			nfs_page_group_lock(req);
 
 			/* find the last request */
 			for (lastreq = req->wb_head;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 20d44ea328b6..0f418d825185 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -271,7 +271,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
 	unsigned int pos = 0;
 	unsigned int len = nfs_page_length(req->wb_page);
 
-	nfs_page_group_lock(req, false);
+	nfs_page_group_lock(req);
 
 	do {
 		tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -480,7 +480,7 @@ try_again:
 	}
 	spin_unlock(&inode->i_lock);
 
-	ret = nfs_page_group_lock(head, false);
+	ret = nfs_page_group_lock(head);
 	if (ret < 0) {
 		nfs_unlock_and_release_request(head);
 		return ERR_PTR(ret);
@@ -501,7 +501,7 @@ try_again:
 			nfs_page_group_unlock(head);
 			ret = nfs_wait_on_request(subreq);
 			if (!ret)
-				ret = nfs_page_group_lock(head, false);
+				ret = nfs_page_group_lock(head);
 			if (ret < 0) {
 				nfs_unroll_locks(inode, head, subreq);
 				nfs_release_request(subreq);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index de1d24cedaa2..2f4fdafb6746 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -139,7 +139,7 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern	void nfs_unlock_request(struct nfs_page *req);
 extern	void nfs_unlock_and_release_request(struct nfs_page *);
-extern int nfs_page_group_lock(struct nfs_page *, bool);
+extern int nfs_page_group_lock(struct nfs_page *);
 extern void nfs_page_group_unlock(struct nfs_page *);
 extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
 extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);
-- 
cgit v1.2.3-59-g8ed1b


From 7e8a30f8b497315a6467d86c82f6cc8acaa9db61 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 16:58:07 -0400
Subject: NFS: Fix up nfs_page_group_covers_page()

Fix up the test in nfs_page_group_covers_page(). The simplest implementation
is to check that we have a set of intersecting or contiguous subrequests
that connect page offset 0 to nfs_page_length(req->wb_page).

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0f418d825185..759e37d26acf 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -243,9 +243,6 @@ nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
 {
 	struct nfs_page *req;
 
-	WARN_ON_ONCE(head != head->wb_head);
-	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags));
-
 	req = head;
 	do {
 		if (page_offset >= req->wb_pgbase &&
@@ -273,18 +270,15 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
 
 	nfs_page_group_lock(req);
 
-	do {
+	for (;;) {
 		tmp = nfs_page_group_search_locked(req->wb_head, pos);
-		if (tmp) {
-			/* no way this should happen */
-			WARN_ON_ONCE(tmp->wb_pgbase != pos);
-			pos += tmp->wb_bytes - (pos - tmp->wb_pgbase);
-		}
-	} while (tmp && pos < len);
+		if (!tmp)
+			break;
+		pos = tmp->wb_pgbase + tmp->wb_bytes;
+	}
 
 	nfs_page_group_unlock(req);
-	WARN_ON_ONCE(pos > len);
-	return pos == len;
+	return pos >= len;
 }
 
 /* We can set the PG_uptodate flag if we see that a write request
-- 
cgit v1.2.3-59-g8ed1b


From bd37d6fce184836bd5e7cd90ce40116a4fadaf2a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 12:06:30 -0400
Subject: NFSv4: Convert nfs_lock_and_join_requests() to use
 nfs_page_find_head_request()

Hide the locking from nfs_lock_and_join_requests() so that we can
separate out the requirements for swapcache pages.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 759e37d26acf..a06167e20b72 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -154,6 +154,14 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
 	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 }
 
+static struct nfs_page *
+nfs_page_private_request(struct page *page)
+{
+	if (!PagePrivate(page))
+		return NULL;
+	return (struct nfs_page *)page_private(page);
+}
+
 /*
  * nfs_page_find_head_request_locked - find head request associated with @page
  *
@@ -164,11 +172,10 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
 static struct nfs_page *
 nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
 {
-	struct nfs_page *req = NULL;
+	struct nfs_page *req;
 
-	if (PagePrivate(page))
-		req = (struct nfs_page *)page_private(page);
-	else if (unlikely(PageSwapCache(page)))
+	req = nfs_page_private_request(page);
+	if (!req && unlikely(PageSwapCache(page)))
 		req = nfs_page_search_commits_for_head_request_locked(nfsi,
 			page);
 
@@ -448,31 +455,29 @@ nfs_lock_and_join_requests(struct page *page)
 	int ret;
 
 try_again:
-	if (!(PagePrivate(page) || PageSwapCache(page)))
-		return NULL;
-	spin_lock(&inode->i_lock);
 	/*
 	 * A reference is taken only on the head request which acts as a
 	 * reference to the whole page group - the group will not be destroyed
 	 * until the head reference is released.
 	 */
-	head = nfs_page_find_head_request_locked(NFS_I(inode), page);
-
-	if (!head) {
-		spin_unlock(&inode->i_lock);
+	head = nfs_page_find_head_request(page);
+	if (!head)
 		return NULL;
-	}
 
 	/* lock the page head first in order to avoid an ABBA inefficiency */
 	if (!nfs_lock_request(head)) {
-		spin_unlock(&inode->i_lock);
 		ret = nfs_wait_on_request(head);
 		nfs_release_request(head);
 		if (ret < 0)
 			return ERR_PTR(ret);
 		goto try_again;
 	}
-	spin_unlock(&inode->i_lock);
+
+	/* Ensure that nobody removed the request before we locked it */
+	if (head != nfs_page_private_request(page) && !PageSwapCache(page)) {
+		nfs_unlock_and_release_request(head);
+		goto try_again;
+	}
 
 	ret = nfs_page_group_lock(head);
 	if (ret < 0) {
@@ -559,7 +564,7 @@ try_again:
 		return NULL;
 	}
 
-	/* still holds ref on head from nfs_page_find_head_request_locked
+	/* still holds ref on head from nfs_page_find_head_request
 	 * and still has lock on head from lock loop */
 	return head;
 }
-- 
cgit v1.2.3-59-g8ed1b


From b30d2f04c35d539bf8003b3e014c389abefc249b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 12:26:53 -0400
Subject: NFS: Refactor nfs_page_find_head_request()

Split out the 2 cases so that we can treat the locking differently.
The issue is that the locking in the PageSwapCache case is closely
tied to the commit list locking.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a06167e20b72..8d8fa6d4cfcc 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -170,20 +170,41 @@ nfs_page_private_request(struct page *page)
  * returns matching head request with reference held, or NULL if not found.
  */
 static struct nfs_page *
-nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
+nfs_page_find_private_request(struct page *page)
 {
+	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_page *req;
 
+	if (!PagePrivate(page))
+		return NULL;
+	spin_lock(&inode->i_lock);
 	req = nfs_page_private_request(page);
-	if (!req && unlikely(PageSwapCache(page)))
-		req = nfs_page_search_commits_for_head_request_locked(nfsi,
-			page);
-
 	if (req) {
 		WARN_ON_ONCE(req->wb_head != req);
 		kref_get(&req->wb_kref);
 	}
+	spin_unlock(&inode->i_lock);
+	return req;
+}
 
+static struct nfs_page *
+nfs_page_find_swap_request(struct page *page)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_page *req = NULL;
+	if (!PageSwapCache(page))
+		return NULL;
+	spin_lock(&inode->i_lock);
+	if (PageSwapCache(page)) {
+		req = nfs_page_search_commits_for_head_request_locked(nfsi,
+			page);
+		if (req) {
+			WARN_ON_ONCE(req->wb_head != req);
+			kref_get(&req->wb_kref);
+		}
+	}
+	spin_unlock(&inode->i_lock);
 	return req;
 }
 
@@ -194,14 +215,11 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
  */
 static struct nfs_page *nfs_page_find_head_request(struct page *page)
 {
-	struct inode *inode = page_file_mapping(page)->host;
-	struct nfs_page *req = NULL;
+	struct nfs_page *req;
 
-	if (PagePrivate(page) || PageSwapCache(page)) {
-		spin_lock(&inode->i_lock);
-		req = nfs_page_find_head_request_locked(NFS_I(inode), page);
-		spin_unlock(&inode->i_lock);
-	}
+	req = nfs_page_find_private_request(page);
+	if (!req)
+		req = nfs_page_find_swap_request(page);
 	return req;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From e824f99adaaf1ed0e03eac8574599af6d992163d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 11:53:49 -0400
Subject: NFSv4: Use a mutex to protect the per-inode commit lists

The commit lists can get very large, so using the inode->i_lock can
end up affecting general metadata performance.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/direct.c        |  4 ++--
 fs/nfs/inode.c         |  1 +
 fs/nfs/pnfs_nfs.c      | 15 +++++++--------
 fs/nfs/write.c         | 24 ++++++++++++------------
 include/linux/nfs_fs.h |  1 +
 5 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 6fb9fad2d1e6..d2972d537469 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -616,13 +616,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
 				  struct list_head *list,
 				  struct nfs_commit_info *cinfo)
 {
-	spin_lock(&cinfo->inode->i_lock);
+	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
 #ifdef CONFIG_NFS_V4_1
 	if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
 		NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
 #endif
 	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
-	spin_unlock(&cinfo->inode->i_lock);
+	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
 }
 
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 109279d6d91b..34d9ebbc0dfd 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2016,6 +2016,7 @@ static void init_once(void *foo)
 	nfsi->commit_info.ncommit = 0;
 	atomic_set(&nfsi->commit_info.rpcs_out, 0);
 	init_rwsem(&nfsi->rmdir_sem);
+	mutex_init(&nfsi->commit_mutex);
 	nfs4_init_once(nfsi);
 }
 
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 25f28fa64c57..2cdee8ce2094 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -98,14 +98,13 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
 		if (!nfs_lock_request(req))
 			continue;
 		kref_get(&req->wb_kref);
-		if (cond_resched_lock(&cinfo->inode->i_lock))
-			list_safe_reset_next(req, tmp, wb_list);
 		nfs_request_remove_commit_list(req, cinfo);
 		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 		nfs_list_add_request(req, dst);
 		ret++;
 		if ((ret == max) && !cinfo->dreq)
 			break;
+		cond_resched();
 	}
 	return ret;
 }
@@ -119,7 +118,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
 	struct list_head *dst = &bucket->committing;
 	int ret;
 
-	lockdep_assert_held(&cinfo->inode->i_lock);
+	lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
 	ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
 	if (ret) {
 		cinfo->ds->nwritten -= ret;
@@ -142,7 +141,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
 {
 	int i, rv = 0, cnt;
 
-	lockdep_assert_held(&cinfo->inode->i_lock);
+	lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
 	for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
 		cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
 						       cinfo, max);
@@ -162,7 +161,7 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst,
 	int nwritten;
 	int i;
 
-	lockdep_assert_held(&cinfo->inode->i_lock);
+	lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
 restart:
 	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
 		nwritten = pnfs_generic_transfer_commit_list(&b->written,
@@ -953,12 +952,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
 	struct list_head *list;
 	struct pnfs_commit_bucket *buckets;
 
-	spin_lock(&cinfo->inode->i_lock);
+	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
 	buckets = cinfo->ds->buckets;
 	list = &buckets[ds_commit_idx].written;
 	if (list_empty(list)) {
 		if (!pnfs_is_valid_lseg(lseg)) {
-			spin_unlock(&cinfo->inode->i_lock);
+			mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
 			cinfo->completion_ops->resched_write(cinfo, req);
 			return;
 		}
@@ -975,7 +974,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
 	cinfo->ds->nwritten++;
 
 	nfs_request_add_commit_list_locked(req, list, cinfo);
-	spin_unlock(&cinfo->inode->i_lock);
+	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
 	nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 8d8fa6d4cfcc..5ab5ca24b48a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -195,7 +195,7 @@ nfs_page_find_swap_request(struct page *page)
 	struct nfs_page *req = NULL;
 	if (!PageSwapCache(page))
 		return NULL;
-	spin_lock(&inode->i_lock);
+	mutex_lock(&nfsi->commit_mutex);
 	if (PageSwapCache(page)) {
 		req = nfs_page_search_commits_for_head_request_locked(nfsi,
 			page);
@@ -204,7 +204,7 @@ nfs_page_find_swap_request(struct page *page)
 			kref_get(&req->wb_kref);
 		}
 	}
-	spin_unlock(&inode->i_lock);
+	mutex_unlock(&nfsi->commit_mutex);
 	return req;
 }
 
@@ -856,7 +856,8 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
  * number of outstanding requests requiring a commit as well as
  * the MM page stats.
  *
- * The caller must hold cinfo->inode->i_lock, and the nfs_page lock.
+ * The caller must hold NFS_I(cinfo->inode)->commit_mutex, and the
+ * nfs_page lock.
  */
 void
 nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
@@ -884,9 +885,9 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
 void
 nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
-	spin_lock(&cinfo->inode->i_lock);
+	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
 	nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
-	spin_unlock(&cinfo->inode->i_lock);
+	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
 	if (req->wb_page)
 		nfs_mark_page_unstable(req->wb_page, cinfo);
 }
@@ -964,11 +965,11 @@ nfs_clear_request_commit(struct nfs_page *req)
 		struct nfs_commit_info cinfo;
 
 		nfs_init_cinfo_from_inode(&cinfo, inode);
-		spin_lock(&inode->i_lock);
+		mutex_lock(&NFS_I(inode)->commit_mutex);
 		if (!pnfs_clear_request_commit(req, &cinfo)) {
 			nfs_request_remove_commit_list(req, &cinfo);
 		}
-		spin_unlock(&inode->i_lock);
+		mutex_unlock(&NFS_I(inode)->commit_mutex);
 		nfs_clear_page_commit(req->wb_page);
 	}
 }
@@ -1027,7 +1028,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 	return cinfo->mds->ncommit;
 }
 
-/* cinfo->inode->i_lock held by caller */
+/* NFS_I(cinfo->inode)->commit_mutex held by caller */
 int
 nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
 		     struct nfs_commit_info *cinfo, int max)
@@ -1039,13 +1040,12 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
 		if (!nfs_lock_request(req))
 			continue;
 		kref_get(&req->wb_kref);
-		if (cond_resched_lock(&cinfo->inode->i_lock))
-			list_safe_reset_next(req, tmp, wb_list);
 		nfs_request_remove_commit_list(req, cinfo);
 		nfs_list_add_request(req, dst);
 		ret++;
 		if ((ret == max) && !cinfo->dreq)
 			break;
+		cond_resched();
 	}
 	return ret;
 }
@@ -1065,7 +1065,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 {
 	int ret = 0;
 
-	spin_lock(&cinfo->inode->i_lock);
+	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
 	if (cinfo->mds->ncommit > 0) {
 		const int max = INT_MAX;
 
@@ -1073,7 +1073,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 					   cinfo, max);
 		ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
 	}
-	spin_unlock(&cinfo->inode->i_lock);
+	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
 	return ret;
 }
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 5cc91d6381a3..121a702888b4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -163,6 +163,7 @@ struct nfs_inode {
 	/* Readers: in-flight sillydelete RPC calls */
 	/* Writers: rmdir */
 	struct rw_semaphore	rmdir_sem;
+	struct mutex		commit_mutex;
 
 #if IS_ENABLED(CONFIG_NFS_V4)
 	struct nfs4_cached_acl	*nfs4_acl;
-- 
cgit v1.2.3-59-g8ed1b


From a6b6d5b85abf4914bbceade5dddd54c345c64136 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 15:39:46 -0400
Subject: NFS: Use an atomic_long_t to count the number of requests

Use an atomic_long_t for nrequests rather than forcing us to take the
inode->i_lock just in order to bump the number.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_proc.c |  2 +-
 fs/nfs/delegation.c    |  2 +-
 fs/nfs/inode.c         |  7 +++----
 fs/nfs/pagelist.c      |  4 +---
 fs/nfs/write.c         | 18 +++++-------------
 include/linux/nfs_fs.h |  4 ++--
 6 files changed, 13 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 5427cdf04c5a..14358de173fb 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -51,7 +51,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp,
 		goto out_iput;
 	res->size = i_size_read(inode);
 	res->change_attr = delegation->change_attr;
-	if (nfsi->nrequests != 0)
+	if (nfs_have_writebacks(inode))
 		res->change_attr++;
 	res->ctime = inode->i_ctime;
 	res->mtime = inode->i_mtime;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index d7df5e67b0c1..606dd3871f66 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -1089,7 +1089,7 @@ bool nfs4_delegation_flush_on_close(const struct inode *inode)
 	delegation = rcu_dereference(nfsi->delegation);
 	if (delegation == NULL || !(delegation->type & FMODE_WRITE))
 		goto out;
-	if (nfsi->nrequests < delegation->pagemod_limit)
+	if (atomic_long_read(&nfsi->nrequests) < delegation->pagemod_limit)
 		ret = false;
 out:
 	rcu_read_unlock();
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 34d9ebbc0dfd..0480eb02299a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1285,7 +1285,6 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
 
 static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
 	unsigned long ret = 0;
 
 	if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
@@ -1315,7 +1314,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
 	if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
 			&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
 			&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
-			&& nfsi->nrequests == 0) {
+			&& !nfs_have_writebacks(inode)) {
 		i_size_write(inode, nfs_size_to_loff_t(fattr->size));
 		ret |= NFS_INO_INVALID_ATTR;
 	}
@@ -1823,7 +1822,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		if (new_isize != cur_isize) {
 			/* Do we perhaps have any outstanding writes, or has
 			 * the file grown beyond our last write? */
-			if (nfsi->nrequests == 0 || new_isize > cur_isize) {
+			if (!nfs_have_writebacks(inode) || new_isize > cur_isize) {
 				i_size_write(inode, new_isize);
 				if (!have_writers)
 					invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
@@ -2012,7 +2011,7 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
 	INIT_LIST_HEAD(&nfsi->commit_info.list);
-	nfsi->nrequests = 0;
+	atomic_long_set(&nfsi->nrequests, 0);
 	nfsi->commit_info.ncommit = 0;
 	atomic_set(&nfsi->commit_info.rpcs_out, 0);
 	init_rwsem(&nfsi->rmdir_sem);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index af6731dd4324..ec97c301899b 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -258,9 +258,7 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
 			inode = page_file_mapping(req->wb_page)->host;
 			set_bit(PG_INODE_REF, &req->wb_flags);
 			kref_get(&req->wb_kref);
-			spin_lock(&inode->i_lock);
-			NFS_I(inode)->nrequests++;
-			spin_unlock(&inode->i_lock);
+			atomic_long_inc(&NFS_I(inode)->nrequests);
 		}
 	}
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5ab5ca24b48a..08093552f115 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -434,9 +434,7 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 
 		if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
 			nfs_release_request(subreq);
-			spin_lock(&inode->i_lock);
-			NFS_I(inode)->nrequests--;
-			spin_unlock(&inode->i_lock);
+			atomic_long_dec(&NFS_I(inode)->nrequests);
 		}
 
 		/* subreq is now totally disconnected from page group or any
@@ -567,9 +565,7 @@ try_again:
 	if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) {
 		set_bit(PG_INODE_REF, &head->wb_flags);
 		kref_get(&head->wb_kref);
-		spin_lock(&inode->i_lock);
-		NFS_I(inode)->nrequests++;
-		spin_unlock(&inode->i_lock);
+		atomic_long_inc(&NFS_I(inode)->nrequests);
 	}
 
 	nfs_page_group_unlock(head);
@@ -755,7 +751,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	nfs_lock_request(req);
 
 	spin_lock(&inode->i_lock);
-	if (!nfsi->nrequests &&
+	if (!nfs_have_writebacks(inode) &&
 	    NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
 		inode->i_version++;
 	/*
@@ -767,7 +763,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 		SetPagePrivate(req->wb_page);
 		set_page_private(req->wb_page, (unsigned long)req);
 	}
-	nfsi->nrequests++;
+	atomic_long_inc(&nfsi->nrequests);
 	/* this a head request for a page group - mark it as having an
 	 * extra reference so sub groups can follow suit.
 	 * This flag also informs pgio layer when to bump nrequests when
@@ -786,6 +782,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page *head;
 
+	atomic_long_dec(&nfsi->nrequests);
 	if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
 		head = req->wb_head;
 
@@ -795,11 +792,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 			ClearPagePrivate(head->wb_page);
 			clear_bit(PG_MAPPED, &head->wb_flags);
 		}
-		nfsi->nrequests--;
-		spin_unlock(&inode->i_lock);
-	} else {
-		spin_lock(&inode->i_lock);
-		nfsi->nrequests--;
 		spin_unlock(&inode->i_lock);
 	}
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 121a702888b4..238fdc4c46df 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -154,7 +154,7 @@ struct nfs_inode {
 	 */
 	__be32			cookieverf[2];
 
-	unsigned long		nrequests;
+	atomic_long_t		nrequests;
 	struct nfs_mds_commit_info commit_info;
 
 	/* Open contexts for shared mmap writes */
@@ -511,7 +511,7 @@ extern void nfs_commit_free(struct nfs_commit_data *data);
 static inline int
 nfs_have_writebacks(struct inode *inode)
 {
-	return NFS_I(inode)->nrequests != 0;
+	return atomic_long_read(&NFS_I(inode)->nrequests) != 0;
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 5cb953d4b1e70a09084f71594c45d47458346bc2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 17:04:12 -0400
Subject: NFS: Use an atomic_long_t to count the number of commits

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/inode.c          |  2 +-
 fs/nfs/write.c          | 12 +++++++-----
 include/linux/nfs_xdr.h |  2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0480eb02299a..134d9f560240 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2012,7 +2012,7 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
 	INIT_LIST_HEAD(&nfsi->commit_info.list);
 	atomic_long_set(&nfsi->nrequests, 0);
-	nfsi->commit_info.ncommit = 0;
+	atomic_long_set(&nfsi->commit_info.ncommit, 0);
 	atomic_set(&nfsi->commit_info.rpcs_out, 0);
 	init_rwsem(&nfsi->rmdir_sem);
 	mutex_init(&nfsi->commit_mutex);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 08093552f115..12479c25028e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -857,7 +857,7 @@ nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
 {
 	set_bit(PG_CLEAN, &req->wb_flags);
 	nfs_list_add_request(req, dst);
-	cinfo->mds->ncommit++;
+	atomic_long_inc(&cinfo->mds->ncommit);
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
 
@@ -903,7 +903,7 @@ nfs_request_remove_commit_list(struct nfs_page *req,
 	if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags))
 		return;
 	nfs_list_remove_request(req);
-	cinfo->mds->ncommit--;
+	atomic_long_dec(&cinfo->mds->ncommit);
 }
 EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
 
@@ -1017,7 +1017,7 @@ out:
 unsigned long
 nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 {
-	return cinfo->mds->ncommit;
+	return atomic_long_read(&cinfo->mds->ncommit);
 }
 
 /* NFS_I(cinfo->inode)->commit_mutex held by caller */
@@ -1057,8 +1057,10 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 {
 	int ret = 0;
 
+	if (!atomic_long_read(&cinfo->mds->ncommit))
+		return 0;
 	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
-	if (cinfo->mds->ncommit > 0) {
+	if (atomic_long_read(&cinfo->mds->ncommit) > 0) {
 		const int max = INT_MAX;
 
 		ret = nfs_scan_commit_list(&cinfo->mds->list, dst,
@@ -1890,7 +1892,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret = 0;
 
 	/* no commits means nothing needs to be done */
-	if (!nfsi->commit_info.ncommit)
+	if (!atomic_long_read(&nfsi->commit_info.ncommit))
 		return ret;
 
 	if (wbc->sync_mode == WB_SYNC_NONE) {
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 62cbcb842f99..164d5359d4ab 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1476,7 +1476,7 @@ struct nfs_pgio_header {
 
 struct nfs_mds_commit_info {
 	atomic_t rpcs_out;
-	unsigned long		ncommit;
+	atomic_long_t		ncommit;
 	struct list_head	list;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 4b9bb25b36baa3e2e42b91e451bcd3acfe197a1d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 12:34:44 -0400
Subject: NFS: Switch to using mapping->private_lock for page writeback
 lookups.

Switch from using the inode->i_lock for this to avoid contention with
other metadata manipulation.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 12479c25028e..866702823869 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -172,18 +172,18 @@ nfs_page_private_request(struct page *page)
 static struct nfs_page *
 nfs_page_find_private_request(struct page *page)
 {
-	struct inode *inode = page_file_mapping(page)->host;
+	struct address_space *mapping = page_file_mapping(page);
 	struct nfs_page *req;
 
 	if (!PagePrivate(page))
 		return NULL;
-	spin_lock(&inode->i_lock);
+	spin_lock(&mapping->private_lock);
 	req = nfs_page_private_request(page);
 	if (req) {
 		WARN_ON_ONCE(req->wb_head != req);
 		kref_get(&req->wb_kref);
 	}
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&mapping->private_lock);
 	return req;
 }
 
@@ -743,6 +743,7 @@ out_err:
  */
 static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 {
+	struct address_space *mapping = page_file_mapping(req->wb_page);
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	WARN_ON_ONCE(req->wb_this_page != req);
@@ -750,19 +751,23 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	/* Lock the request! */
 	nfs_lock_request(req);
 
-	spin_lock(&inode->i_lock);
-	if (!nfs_have_writebacks(inode) &&
-	    NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
-		inode->i_version++;
 	/*
 	 * Swap-space should not get truncated. Hence no need to plug the race
 	 * with invalidate/truncate.
 	 */
+	spin_lock(&mapping->private_lock);
+	if (!nfs_have_writebacks(inode) &&
+	    NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) {
+		spin_lock(&inode->i_lock);
+		inode->i_version++;
+		spin_unlock(&inode->i_lock);
+	}
 	if (likely(!PageSwapCache(req->wb_page))) {
 		set_bit(PG_MAPPED, &req->wb_flags);
 		SetPagePrivate(req->wb_page);
 		set_page_private(req->wb_page, (unsigned long)req);
 	}
+	spin_unlock(&mapping->private_lock);
 	atomic_long_inc(&nfsi->nrequests);
 	/* this a head request for a page group - mark it as having an
 	 * extra reference so sub groups can follow suit.
@@ -770,7 +775,6 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	 * adding subrequests. */
 	WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags));
 	kref_get(&req->wb_kref);
-	spin_unlock(&inode->i_lock);
 }
 
 /*
@@ -778,7 +782,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
  */
 static void nfs_inode_remove_request(struct nfs_page *req)
 {
-	struct inode *inode = d_inode(req->wb_context->dentry);
+	struct address_space *mapping = page_file_mapping(req->wb_page);
+	struct inode *inode = mapping->host;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page *head;
 
@@ -786,13 +791,13 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 	if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
 		head = req->wb_head;
 
-		spin_lock(&inode->i_lock);
+		spin_lock(&mapping->private_lock);
 		if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
 			set_page_private(head->wb_page, 0);
 			ClearPagePrivate(head->wb_page);
 			clear_bit(PG_MAPPED, &head->wb_flags);
 		}
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&mapping->private_lock);
 	}
 
 	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
-- 
cgit v1.2.3-59-g8ed1b


From 8205b9ce030288e104a3024344f2a0a086231e36 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 17:07:02 -0400
Subject: NFSv4/pnfs: Replace pnfs_put_lseg_locked() with pnfs_put_lseg()

Now that we no longer hold the inode->i_lock when manipulating the
commit lists, it is safe to call pnfs_put_lseg() again.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c     | 41 -----------------------------------------
 fs/nfs/pnfs.h     |  2 --
 fs/nfs/pnfs_nfs.c |  4 ++--
 3 files changed, 2 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c383d0913b54..3125a9d7b237 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -529,47 +529,6 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 }
 EXPORT_SYMBOL_GPL(pnfs_put_lseg);
 
-static void pnfs_free_lseg_async_work(struct work_struct *work)
-{
-	struct pnfs_layout_segment *lseg;
-	struct pnfs_layout_hdr *lo;
-
-	lseg = container_of(work, struct pnfs_layout_segment, pls_work);
-	lo = lseg->pls_layout;
-
-	pnfs_free_lseg(lseg);
-	pnfs_put_layout_hdr(lo);
-}
-
-static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg)
-{
-	INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
-	schedule_work(&lseg->pls_work);
-}
-
-void
-pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
-{
-	if (!lseg)
-		return;
-
-	assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);
-
-	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
-		atomic_read(&lseg->pls_refcount),
-		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
-	if (atomic_dec_and_test(&lseg->pls_refcount)) {
-		struct pnfs_layout_hdr *lo = lseg->pls_layout;
-		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
-			return;
-		pnfs_layout_remove_lseg(lo, lseg);
-		if (!pnfs_cache_lseg_for_layoutreturn(lo, lseg)) {
-			pnfs_get_layout_hdr(lo);
-			pnfs_free_lseg_async(lseg);
-		}
-	}
-}
-
 /*
  * is l2 fully contained in l1?
  *   start1                             end1
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 99731e3e332f..87f144f14d1e 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -67,7 +67,6 @@ struct pnfs_layout_segment {
 	u32 pls_seq;
 	unsigned long pls_flags;
 	struct pnfs_layout_hdr *pls_layout;
-	struct work_struct pls_work;
 };
 
 enum pnfs_try_status {
@@ -230,7 +229,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
 /* pnfs.c */
 void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
-void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
 
 void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
 void unset_pnfs_layoutdriver(struct nfs_server *);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 2cdee8ce2094..4b0a809653d1 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -83,7 +83,7 @@ pnfs_generic_clear_request_commit(struct nfs_page *req,
 	}
 out:
 	nfs_request_remove_commit_list(req, cinfo);
-	pnfs_put_lseg_locked(freeme);
+	pnfs_put_lseg(freeme);
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
 
@@ -126,7 +126,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
 		if (bucket->clseg == NULL)
 			bucket->clseg = pnfs_get_lseg(bucket->wlseg);
 		if (list_empty(src)) {
-			pnfs_put_lseg_locked(bucket->wlseg);
+			pnfs_put_lseg(bucket->wlseg);
 			bucket->wlseg = NULL;
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 2ce209c42c01ca976ad680fea52a8e8b9a53643b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 17:29:29 -0400
Subject: NFS: Wait for requests that are locked on the commit list

If a request is on the commit list, but is locked, we will currently skip
it, which can lead to livelocking when the commit count doesn't reduce
to zero.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c |  2 ++
 fs/nfs/pnfs_nfs.c | 18 ++++++++++++++----
 fs/nfs/write.c    | 17 +++++++++++++----
 3 files changed, 29 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ec97c301899b..548ebc7256ff 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -434,6 +434,7 @@ void nfs_release_request(struct nfs_page *req)
 {
 	kref_put(&req->wb_kref, nfs_page_group_destroy);
 }
+EXPORT_SYMBOL_GPL(nfs_release_request);
 
 /**
  * nfs_wait_on_request - Wait for a request to complete.
@@ -452,6 +453,7 @@ nfs_wait_on_request(struct nfs_page *req)
 	return wait_on_bit_io(&req->wb_flags, PG_BUSY,
 			      TASK_UNINTERRUPTIBLE);
 }
+EXPORT_SYMBOL_GPL(nfs_wait_on_request);
 
 /*
  * nfs_generic_pg_test - determine if requests can be coalesced
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 4b0a809653d1..303ff171cb5d 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -91,13 +91,23 @@ static int
 pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
 				  struct nfs_commit_info *cinfo, int max)
 {
-	struct nfs_page *req, *tmp;
+	struct nfs_page *req;
 	int ret = 0;
 
-	list_for_each_entry_safe(req, tmp, src, wb_list) {
-		if (!nfs_lock_request(req))
-			continue;
+	while(!list_empty(src)) {
+		req = list_first_entry(src, struct nfs_page, wb_list);
+
 		kref_get(&req->wb_kref);
+		if (!nfs_lock_request(req)) {
+			int status;
+			mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+			status = nfs_wait_on_request(req);
+			nfs_release_request(req);
+			mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+			if (status < 0)
+				break;
+			continue;
+		}
 		nfs_request_remove_commit_list(req, cinfo);
 		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 		nfs_list_add_request(req, dst);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 866702823869..5dd3b212376e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1030,13 +1030,22 @@ int
 nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
 		     struct nfs_commit_info *cinfo, int max)
 {
-	struct nfs_page *req, *tmp;
+	struct nfs_page *req;
 	int ret = 0;
 
-	list_for_each_entry_safe(req, tmp, src, wb_list) {
-		if (!nfs_lock_request(req))
-			continue;
+	while(!list_empty(src)) {
+		req = list_first_entry(src, struct nfs_page, wb_list);
 		kref_get(&req->wb_kref);
+		if (!nfs_lock_request(req)) {
+			int status;
+			mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+			status = nfs_wait_on_request(req);
+			nfs_release_request(req);
+			mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+			if (status < 0)
+				break;
+			continue;
+		}
 		nfs_request_remove_commit_list(req, cinfo);
 		nfs_list_add_request(req, dst);
 		ret++;
-- 
cgit v1.2.3-59-g8ed1b


From 14abcb0bf59a30cf65a74f6c6f53974cd7224bc6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 19 Aug 2017 10:10:34 -0400
Subject: NFSv4: Fix up mirror allocation

There are a number of callers of nfs_pageio_complete() that want to
continue using the nfs_pageio_descriptor without needing to call
nfs_pageio_init() again. Examples include nfs_pageio_resend() and
nfs_pageio_cond_complete().

The problem is that nfs_pageio_complete() also calls
nfs_pageio_cleanup_mirroring(), which frees up the array of mirrors.
This can lead to writeback errors in the next call to
nfs_pageio_setup_mirroring().

Fix by simply moving the allocation of the mirrors to
nfs_pageio_setup_mirroring().

Link: https://bugzilla.kernel.org/show_bug.cgi?id=196709
Reported-by: JianhongYin <yin-jianhong@163.com>
Cc: stable@vger.kernel.org # 4.0+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c | 73 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 39 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index de9066a92c0d..b7d193e2a243 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -714,9 +714,6 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 		     int io_flags,
 		     gfp_t gfp_flags)
 {
-	struct nfs_pgio_mirror *new;
-	int i;
-
 	desc->pg_moreio = 0;
 	desc->pg_inode = inode;
 	desc->pg_ops = pg_ops;
@@ -732,21 +729,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_mirror_count = 1;
 	desc->pg_mirror_idx = 0;
 
-	if (pg_ops->pg_get_mirror_count) {
-		/* until we have a request, we don't have an lseg and no
-		 * idea how many mirrors there will be */
-		new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX,
-			      sizeof(struct nfs_pgio_mirror), gfp_flags);
-		desc->pg_mirrors_dynamic = new;
-		desc->pg_mirrors = new;
-
-		for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++)
-			nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize);
-	} else {
-		desc->pg_mirrors_dynamic = NULL;
-		desc->pg_mirrors = desc->pg_mirrors_static;
-		nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
-	}
+	desc->pg_mirrors_dynamic = NULL;
+	desc->pg_mirrors = desc->pg_mirrors_static;
+	nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_init);
 
@@ -865,32 +850,52 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
 	return ret;
 }
 
+static struct nfs_pgio_mirror *
+nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc,
+		unsigned int mirror_count)
+{
+	struct nfs_pgio_mirror *ret;
+	unsigned int i;
+
+	kfree(desc->pg_mirrors_dynamic);
+	desc->pg_mirrors_dynamic = NULL;
+	if (mirror_count == 1)
+		return desc->pg_mirrors_static;
+	ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_NOFS);
+	if (ret != NULL) {
+		for (i = 0; i < mirror_count; i++)
+			nfs_pageio_mirror_init(&ret[i], desc->pg_bsize);
+		desc->pg_mirrors_dynamic = ret;
+	}
+	return ret;
+}
+
 /*
  * nfs_pageio_setup_mirroring - determine if mirroring is to be used
  *				by calling the pg_get_mirror_count op
  */
-static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
+static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
 				       struct nfs_page *req)
 {
-	int mirror_count = 1;
+	unsigned int mirror_count = 1;
 
-	if (!pgio->pg_ops->pg_get_mirror_count)
-		return 0;
-
-	mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
-
-	if (pgio->pg_error < 0)
-		return pgio->pg_error;
-
-	if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
-		return -EINVAL;
+	if (pgio->pg_ops->pg_get_mirror_count)
+		mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+	if (mirror_count == pgio->pg_mirror_count || pgio->pg_error < 0)
+		return;
 
-	if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic))
-		return -EINVAL;
+	if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) {
+		pgio->pg_error = -EINVAL;
+		return;
+	}
 
+	pgio->pg_mirrors = nfs_pageio_alloc_mirrors(pgio, mirror_count);
+	if (pgio->pg_mirrors == NULL) {
+		pgio->pg_error = -ENOMEM;
+		pgio->pg_mirrors = pgio->pg_mirrors_static;
+		mirror_count = 1;
+	}
 	pgio->pg_mirror_count = mirror_count;
-
-	return 0;
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 3bde7afdabe9f37974af806abe646c2ca43c67c7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 20 Aug 2017 11:33:25 -0400
Subject: NFS: Remove unused parameter gfp_flags from nfs_pageio_init()

Now that the mirror allocation has been moved, the parameter can go.
Also remove the redundant symbol export.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c        | 4 +---
 fs/nfs/read.c            | 2 +-
 fs/nfs/write.c           | 2 +-
 include/linux/nfs_page.h | 3 +--
 4 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b7d193e2a243..ee66d8c3336c 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -711,8 +711,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 		     const struct nfs_pgio_completion_ops *compl_ops,
 		     const struct nfs_rw_ops *rw_ops,
 		     size_t bsize,
-		     int io_flags,
-		     gfp_t gfp_flags)
+		     int io_flags)
 {
 	desc->pg_moreio = 0;
 	desc->pg_inode = inode;
@@ -733,7 +732,6 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_mirrors = desc->pg_mirrors_static;
 	nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
 }
-EXPORT_SYMBOL_GPL(nfs_pageio_init);
 
 /**
  * nfs_pgio_result - Basic pageio error handling
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index a8421d9dab6a..0d42573d423d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -68,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 		pg_ops = server->pnfs_curr_ld->pg_read_ops;
 #endif
 	nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops,
-			server->rsize, 0, GFP_KERNEL);
+			server->rsize, 0);
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b1af5dee5e0a..ae78ac0a7a8a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1452,7 +1452,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 		pg_ops = server->pnfs_curr_ld->pg_write_ops;
 #endif
 	nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops,
-			server->wsize, ioflags, GFP_NOIO);
+			server->wsize, ioflags);
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
 
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index d67b67ae6c8b..8b1a35aad0c3 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -125,8 +125,7 @@ extern	void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 			     const struct nfs_pgio_completion_ops *compl_ops,
 			     const struct nfs_rw_ops *rw_ops,
 			     size_t bsize,
-			     int how,
-			     gfp_t gfp_flags);
+			     int how);
 extern	int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
 				   struct nfs_page *);
 extern  int nfs_pageio_resend(struct nfs_pageio_descriptor *,
-- 
cgit v1.2.3-59-g8ed1b


From b79e87e070476e16b1d687e5ccc2da6db1a839dc Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 18 Aug 2017 17:12:52 +1000
Subject: NFSv4.1: don't use machine credentials for CLOSE when using 'sec=sys'

An NFSv4.1 client might close a file after the user who opened it has
logged off.  In this case the user's credentials may no longer be
valid, if they are e.g. kerberos credentials that have expired.

NFSv4.1 has a mechanism to allow the client to use machine credentials
to close a file.  However, due to a shortcoming in the RFC, a CLOSE
with those credentials may not be possible if the file in question
isn't exported to the same security flavor - the required PUTFH must
be rejected when this is the case.

Specifically if a server and client support kerberos in general and
have used it to form a machine credential, but the file is only
exported to "sec=sys", a PUTFH with the machine credentials will fail,
so CLOSE is not possible.

As RPC_AUTH_UNIX (used by sec=sys) credentials can never expire, there
is no value in using the machine credential in place of them.
So in that case, just use the user's credentials for CLOSE etc., as you would
in NFSv4.0.

Signed-off-by: Neil Brown <neilb@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4_fs.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 40bd05f05e74..ac4f10b7f6c1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -303,6 +303,17 @@ _nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
 	struct rpc_cred *newcred = NULL;
 	rpc_authflavor_t flavor;
 
+	if (sp4_mode == NFS_SP4_MACH_CRED_CLEANUP ||
+	    sp4_mode == NFS_SP4_MACH_CRED_PNFS_CLEANUP) {
+		/* Using machine creds for cleanup operations
+		 * is only relevent if the client credentials
+		 * might expire. So don't bother for
+		 * RPC_AUTH_UNIX.  If file was only exported to
+		 * sec=sys, the PUTFH would fail anyway.
+		 */
+		if ((*clntp)->cl_auth->au_flavor == RPC_AUTH_UNIX)
+			return false;
+	}
 	if (test_bit(sp4_mode, &clp->cl_sp4_flags)) {
 		spin_lock(&clp->cl_lock);
 		if (clp->cl_machine_cred != NULL)
-- 
cgit v1.2.3-59-g8ed1b


From 53a75f22e78a601321c2e1fd16266ecdae2f2309 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 10 Aug 2017 16:41:31 -0400
Subject: NFS: Fix NFSv2 security settings

For a while now any NFSv2 mount where sec= is specified uses
AUTH_NULL. If sec= is not specified, the mount uses AUTH_UNIX.
Commit e68fd7c8071d ("mount: use sec= that was specified on the
command line") attempted to address a very similar problem with
NFSv3, and should have fixed this too, but it has a bug.

The MNTv1 MNT procedure does not return a list of security flavors,
so our client makes up a list containing just AUTH_NULL. This should
enable nfs_verify_authflavors() to assign the sec= specified flavor,
but instead, it incorrectly sets it to AUTH_NULL.

I expect this would also be a problem for any NFSv3 server whose
MNTv3 MNT procedure returned a security flavor list containing only
AUTH_NULL.

Fixes: e68fd7c8071d ("mount: use sec= that was specified on ... ")
BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=310
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/super.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d828ef88e7db..6b179af59b92 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1691,8 +1691,8 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
 			rpc_authflavor_t *server_authlist, unsigned int count)
 {
 	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
+	bool found_auth_null = false;
 	unsigned int i;
-	int use_auth_null = false;
 
 	/*
 	 * If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1701,6 +1701,10 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
 	 * AUTH_NULL has a special meaning when it's in the server list - it
 	 * means that the server will ignore the rpc creds, so any flavor
 	 * can be used but still use the sec= that was specified.
+	 *
+	 * Note also that the MNT procedure in MNTv1 does not return a list
+	 * of supported security flavors. In this case, nfs_mount() fabricates
+	 * a security flavor list containing just AUTH_NULL.
 	 */
 	for (i = 0; i < count; i++) {
 		flavor = server_authlist[i];
@@ -1709,11 +1713,11 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
 			goto out;
 
 		if (flavor == RPC_AUTH_NULL)
-			use_auth_null = true;
+			found_auth_null = true;
 	}
 
-	if (use_auth_null) {
-		flavor = RPC_AUTH_NULL;
+	if (found_auth_null) {
+		flavor = args->auth_info.flavors[0];
 		goto out;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 237f8306c30206d997e265ff46652e602a50824f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 18 Aug 2017 17:12:51 +1000
Subject: NFS: don't expect errors from mempool_alloc().

Commit fbe77c30e9ab ("NFS: move rw_mode to nfs_pageio_header")
reintroduced some pointless code that commit 518662e0fcb9 ("NFS: fix
usage of mempools.") had recently removed.

Remove it again.

Cc: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1a877d17e6cb..ae26775b5448 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -102,10 +102,8 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void)
 {
 	struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
 
-	if (p) {
-		memset(p, 0, sizeof(*p));
-		p->rw_mode = FMODE_WRITE;
-	}
+	memset(p, 0, sizeof(*p));
+	p->rw_mode = FMODE_WRITE;
 	return p;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 779eafab06036fe1e06dea9bbd97cc4b12f0138f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 18 Aug 2017 17:12:52 +1000
Subject: NFS: flush data when locking a file to ensure cache coherence for
 mmap.

When a byte range lock (or flock) is taken out on an NFS file, the
validity of the cached data is checked and the inode is marked
NFS_INODE_INVALID_DATA.  However the cached data isn't flushed from
the page cache.

This is sufficient for future read() requests or mmap() requests as
they call nfs_revalidate_mapping() which performs the flush if
necessary.

However an existing mapping is not affected.  Accessing data through
that mapping will continue to return old data even though the inode is
marked NFS_INODE_INVALID_DATA.

This can easily be confirmed using the 'nfs' tool in
  git://github.com/okirch/twopence-nfs.git
and running

   nfs coherence FILENAME
on one client, and
   nfs coherence -r FILENAME
on another client.

It appears that prior to Linux 2.6.0 this worked correctly.

However commit:

http://git.kernel.org/cgit/linux/kernel/git/history/history.git/commit/?id=ca9268fe3ddd075714005adecd4afbd7f9ab87d0

removed the call to inode_invalidate_pages() from nfs_zap_caches().  I
haven't tested this code, but inspection suggests that prior to this
commit, file locking would invalidate all inode pages.

This patch adds a call to nfs_revalidate_mapping() after a
successful SETLK so that invalid data is flushed.  With this patch the
above test passes.  To minimize impact (and possibly avoid a GETATTR
call) this only happens if the mapping might be mapped into
userspace.

Cc: Olaf Kirch <okir@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index af330c31f627..1d71472d239d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -744,15 +744,18 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 		goto out;
 
 	/*
-	 * Revalidate the cache if the server has time stamps granular
-	 * enough to detect subsecond changes.  Otherwise, clear the
-	 * cache to prevent missing any changes.
+	 * Invalidate cache to prevent missing any changes.  If
+	 * the file is mapped, clear the page cache as well so
+	 * those mappings will be loaded.
 	 *
 	 * This makes locking act as a cache coherency point.
 	 */
 	nfs_sync_mapping(filp->f_mapping);
-	if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+	if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
 		nfs_zap_caches(inode);
+		if (mapping_mapped(filp->f_mapping))
+			nfs_revalidate_mapping(inode, filp->f_mapping);
+	}
 out:
 	return status;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 03c6f7d64ac9c0a37cca91392ac4be8993a8f53d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 16 Aug 2017 12:47:27 +1000
Subject: NFS: remove jiffies field from access cache

This field hasn't been used since commit 57b691819ee2 ("NFS: Cache
access checks more aggressively").

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/dir.c           | 4 ----
 fs/nfs/nfs4proc.c      | 1 -
 include/linux/nfs_fs.h | 1 -
 3 files changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3522b1249019..5ceaeb1f6fb6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2260,7 +2260,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
 		spin_lock(&inode->i_lock);
 		retry = false;
 	}
-	res->jiffies = cache->jiffies;
 	res->cred = cache->cred;
 	res->mask = cache->mask;
 	list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
@@ -2296,7 +2295,6 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred,
 		goto out;
 	if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS))
 		goto out;
-	res->jiffies = cache->jiffies;
 	res->cred = cache->cred;
 	res->mask = cache->mask;
 	err = 0;
@@ -2344,7 +2342,6 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 	if (cache == NULL)
 		return;
 	RB_CLEAR_NODE(&cache->rb_node);
-	cache->jiffies = set->jiffies;
 	cache->cred = get_rpccred(set->cred);
 	cache->mask = set->mask;
 
@@ -2432,7 +2429,6 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
 	cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE
 		     | NFS_MAY_WRITE | NFS_MAY_READ;
 	cache.cred = cred;
-	cache.jiffies = jiffies;
 	status = NFS_PROTO(inode)->access(inode, &cache);
 	if (status != 0) {
 		if (status == -ESTALE) {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 08cc97488904..6c61e2b99635 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2285,7 +2285,6 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
 		mask = NFS4_ACCESS_READ;
 
 	cache.cred = cred;
-	cache.jiffies = jiffies;
 	nfs_access_set_mask(&cache, opendata->o_res.access_result);
 	nfs_access_add_cache(state->inode, &cache);
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 238fdc4c46df..a0282ceaa48b 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -49,7 +49,6 @@
 struct nfs_access_entry {
 	struct rb_node		rb_node;
 	struct list_head	lru;
-	unsigned long		jiffies;
 	struct rpc_cred *	cred;
 	__u32			mask;
 	struct rcu_head		rcu_head;
-- 
cgit v1.2.3-59-g8ed1b


From 58a69893a9bf6833a79dae801da78e1d4f46f0a2 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Wed, 16 Aug 2017 22:35:24 +0200
Subject: lockd: Delete an error message for a failed memory allocation in
 reclaimer()

Omit an extra message for a memory allocation failure in this function.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/lockd/clntlock.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 27d577dbe51a..96c1d14c18f1 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -235,12 +235,8 @@ reclaimer(void *ptr)
 	struct net *net = host->net;
 
 	req = kmalloc(sizeof(*req), GFP_KERNEL);
-	if (!req) {
-		printk(KERN_ERR "lockd: reclaimer unable to alloc memory."
-				" Locks for %s won't be reclaimed!\n",
-				host->h_name);
+	if (!req)
 		return 0;
-	}
 
 	allow_signal(SIGKILL);
 
-- 
cgit v1.2.3-59-g8ed1b


From e973b1a5999e57da677ab50da5f5479fdc0f0c31 Mon Sep 17 00:00:00 2001
From: "tarangg@amazon.com" <tarangg@amazon.com>
Date: Thu, 7 Sep 2017 09:29:23 -0400
Subject: NFS: Sync the correct byte range during synchronous writes

Since commit 18290650b1c8 ("NFS: Move buffered I/O locking into
nfs_file_write()") nfs_file_write() has not flushed the correct byte
range during synchronous writes.  generic_write_sync() expects that
iocb->ki_pos points to the right edge of the range rather than the
left edge.

To replicate the problem, open a file with O_DSYNC, have the client
write at increasing offsets, and then print the successful offsets.
Block port 2049 partway through that sequence, and observe that the
client application indicates successful writes in advance of what the
server received.

Fixes: 18290650b1c8 ("NFS: Move buffered I/O locking into nfs_file_write()")
Signed-off-by: Jacob Strauss <jsstraus@amazon.com>
Signed-off-by: Tarang Gupta <tarangg@amazon.com>
Tested-by: Tarang Gupta <tarangg@amazon.com>
Cc: stable@vger.kernel.org # v4.8+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 1d71472d239d..a385d1c3f146 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -631,11 +631,11 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 	if (result <= 0)
 		goto out;
 
-	result = generic_write_sync(iocb, result);
-	if (result < 0)
-		goto out;
 	written = result;
 	iocb->ki_pos += written;
+	result = generic_write_sync(iocb, written);
+	if (result < 0)
+		goto out;
 
 	/* Return error values */
 	if (nfs_need_check_write(file, inode)) {
-- 
cgit v1.2.3-59-g8ed1b


From 196639ebbe63a037fe9a80669140bd292d8bcd80 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 8 Sep 2017 21:28:11 -0400
Subject: NFS: Fix 2 use after free issues in the I/O code

The writeback code wants to send a commit after processing the pages,
which is why we want to delay releasing the struct path until after
that's done.

Also, the layout code expects that we do not free the inode before
we've put the layout segments in pnfs_writehdr_free() and
pnfs_readhdr_free().

Fixes: 919e3bd9a875 ("NFS: Ensure we commit after writeback is complete")
Fixes: 4714fb51fd03 ("nfs: remove pgio_header refcount, related cleanup")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/internal.h |  1 -
 fs/nfs/pagelist.c | 26 ++++++++++++--------------
 fs/nfs/pnfs.c     |  2 --
 3 files changed, 12 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index dc456416d2be..68cc22083639 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -251,7 +251,6 @@ int nfs_iocounter_wait(struct nfs_lock_context *l_ctx);
 extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
 struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
 void nfs_pgio_header_free(struct nfs_pgio_header *);
-void nfs_pgio_data_destroy(struct nfs_pgio_header *);
 int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
 int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
 		      struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 23cf4a82d94c..bec120ec1967 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -501,16 +501,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
 }
 EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
 
-/*
- * nfs_pgio_header_free - Free a read or write header
- * @hdr: The header to free
- */
-void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
-{
-	hdr->rw_ops->rw_free_header(hdr);
-}
-EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
-
 /**
  * nfs_pgio_data_destroy - make @hdr suitable for reuse
  *
@@ -519,14 +509,24 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
  *
  * @hdr: A header that has had nfs_generic_pgio called
  */
-void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
+static void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
 {
 	if (hdr->args.context)
 		put_nfs_open_context(hdr->args.context);
 	if (hdr->page_array.pagevec != hdr->page_array.page_array)
 		kfree(hdr->page_array.pagevec);
 }
-EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy);
+
+/*
+ * nfs_pgio_header_free - Free a read or write header
+ * @hdr: The header to free
+ */
+void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
+{
+	nfs_pgio_data_destroy(hdr);
+	hdr->rw_ops->rw_free_header(hdr);
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
 
 /**
  * nfs_pgio_rpcsetup - Set up arguments for a pageio call
@@ -640,7 +640,6 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
 static void nfs_pgio_error(struct nfs_pgio_header *hdr)
 {
 	set_bit(NFS_IOHDR_REDO, &hdr->flags);
-	nfs_pgio_data_destroy(hdr);
 	hdr->completion_ops->completion(hdr);
 }
 
@@ -651,7 +650,6 @@ static void nfs_pgio_error(struct nfs_pgio_header *hdr)
 static void nfs_pgio_release(void *calldata)
 {
 	struct nfs_pgio_header *hdr = calldata;
-	nfs_pgio_data_destroy(hdr);
 	hdr->completion_ops->completion(hdr);
 }
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 3125a9d7b237..7879ed8ceb76 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2233,7 +2233,6 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
 		nfs_pageio_reset_write_mds(desc);
 		mirror->pg_recoalesce = 1;
 	}
-	nfs_pgio_data_destroy(hdr);
 	hdr->release(hdr);
 }
 
@@ -2357,7 +2356,6 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
 		nfs_pageio_reset_read_mds(desc);
 		mirror->pg_recoalesce = 1;
 	}
-	nfs_pgio_data_destroy(hdr);
 	hdr->release(hdr);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 137da553dba62dfc64fb8f4ccb5be769acbf615e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 9 Sep 2017 12:28:01 -0400
Subject: NFS: nfs_lock_and_join_requests and nfs_scan_commit_list can deadlock

Since the commit list is not ordered, it is possible for nfs_scan_commit_list()
to hold a request that nfs_lock_and_join_requests() is waiting for, while
at the same time trying to grab a request that nfs_lock_and_join_requests()
already holds.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs_nfs.c | 16 +++++++++++-----
 fs/nfs/write.c    | 15 +++++++++++----
 2 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 303ff171cb5d..d03d836b6ee0 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -91,22 +91,28 @@ static int
 pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
 				  struct nfs_commit_info *cinfo, int max)
 {
-	struct nfs_page *req;
+	struct nfs_page *req, *tmp;
 	int ret = 0;
 
-	while(!list_empty(src)) {
-		req = list_first_entry(src, struct nfs_page, wb_list);
-
+restart:
+	list_for_each_entry_safe(req, tmp, src, wb_list) {
 		kref_get(&req->wb_kref);
 		if (!nfs_lock_request(req)) {
 			int status;
+
+			/* Prevent deadlock with nfs_lock_and_join_requests */
+			if (!list_empty(dst)) {
+				nfs_release_request(req);
+				continue;
+			}
+			/* Ensure we make progress to prevent livelock */
 			mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
 			status = nfs_wait_on_request(req);
 			nfs_release_request(req);
 			mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
 			if (status < 0)
 				break;
-			continue;
+			goto restart;
 		}
 		nfs_request_remove_commit_list(req, cinfo);
 		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ae26775b5448..c3f627b08ec6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1028,21 +1028,28 @@ int
 nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
 		     struct nfs_commit_info *cinfo, int max)
 {
-	struct nfs_page *req;
+	struct nfs_page *req, *tmp;
 	int ret = 0;
 
-	while(!list_empty(src)) {
-		req = list_first_entry(src, struct nfs_page, wb_list);
+restart:
+	list_for_each_entry_safe(req, tmp, src, wb_list) {
 		kref_get(&req->wb_kref);
 		if (!nfs_lock_request(req)) {
 			int status;
+
+			/* Prevent deadlock with nfs_lock_and_join_requests */
+			if (!list_empty(dst)) {
+				nfs_release_request(req);
+				continue;
+			}
+			/* Ensure we make progress to prevent livelock */
 			mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
 			status = nfs_wait_on_request(req);
 			nfs_release_request(req);
 			mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
 			if (status < 0)
 				break;
-			continue;
+			goto restart;
 		}
 		nfs_request_remove_commit_list(req, cinfo);
 		nfs_list_add_request(req, dst);
-- 
cgit v1.2.3-59-g8ed1b


From 5d2a9d9dac902f438daa1334540398aec6c8dcfa Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 9 Sep 2017 12:40:38 -0400
Subject: NFS: Remove pnfs_generic_transfer_commit_list()

It's pretty much a duplicate of nfs_scan_commit_list() that also
clears the PG_COMMIT_TO_DS flag.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs_nfs.c | 43 ++-----------------------------------------
 fs/nfs/write.c    |  2 ++
 2 files changed, 4 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index d03d836b6ee0..60da59be83b6 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -87,44 +87,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
 
-static int
-pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
-				  struct nfs_commit_info *cinfo, int max)
-{
-	struct nfs_page *req, *tmp;
-	int ret = 0;
-
-restart:
-	list_for_each_entry_safe(req, tmp, src, wb_list) {
-		kref_get(&req->wb_kref);
-		if (!nfs_lock_request(req)) {
-			int status;
-
-			/* Prevent deadlock with nfs_lock_and_join_requests */
-			if (!list_empty(dst)) {
-				nfs_release_request(req);
-				continue;
-			}
-			/* Ensure we make progress to prevent livelock */
-			mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
-			status = nfs_wait_on_request(req);
-			nfs_release_request(req);
-			mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
-			if (status < 0)
-				break;
-			goto restart;
-		}
-		nfs_request_remove_commit_list(req, cinfo);
-		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
-		nfs_list_add_request(req, dst);
-		ret++;
-		if ((ret == max) && !cinfo->dreq)
-			break;
-		cond_resched();
-	}
-	return ret;
-}
-
 static int
 pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
 				 struct nfs_commit_info *cinfo,
@@ -135,7 +97,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
 	int ret;
 
 	lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
-	ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
+	ret = nfs_scan_commit_list(src, dst, cinfo, max);
 	if (ret) {
 		cinfo->ds->nwritten -= ret;
 		cinfo->ds->ncommitting += ret;
@@ -180,8 +142,7 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst,
 	lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
 restart:
 	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
-		nwritten = pnfs_generic_transfer_commit_list(&b->written,
-				dst, cinfo, 0);
+		nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0);
 		if (!nwritten)
 			continue;
 		cinfo->ds->nwritten -= nwritten;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c3f627b08ec6..121218d4e5ed 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1052,6 +1052,7 @@ restart:
 			goto restart;
 		}
 		nfs_request_remove_commit_list(req, cinfo);
+		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 		nfs_list_add_request(req, dst);
 		ret++;
 		if ((ret == max) && !cinfo->dreq)
@@ -1060,6 +1061,7 @@ restart:
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(nfs_scan_commit_list);
 
 /*
  * nfs_scan_commit - Scan an inode for commit requests
-- 
cgit v1.2.3-59-g8ed1b


From 8b77484f2b3d8f1096a2231d513fc589e4857a73 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 9 Sep 2017 15:31:28 -0400
Subject: NFS: Don't hold the group lock when calling nfs_release_request()

Holding the group lock across the call to nfs_release_request() can
deadlock if this is the last reference, since nfs_page_group_destroy()
calls nfs_page_group_sync_on_bit().
Note that even if the page was removed from the subpage list,
the req->wb_head could still be pointing to the old head.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 121218d4e5ed..36d34a4c86bd 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -532,9 +532,9 @@ try_again:
 		} else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
 			    ((subreq->wb_offset + subreq->wb_bytes) >
 			     (head->wb_offset + total_bytes)))) {
+			nfs_page_group_unlock(head);
 			nfs_unroll_locks(inode, head, subreq);
 			nfs_unlock_and_release_request(subreq);
-			nfs_page_group_unlock(head);
 			nfs_unlock_and_release_request(head);
 			return ERR_PTR(-EIO);
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 1bd5d6d08ea7ed0794c8a3908383d6d6fc202cdd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 9 Sep 2017 16:43:09 -0400
Subject: NFS: Count the bytes of skipped subrequests in
 nfs_lock_and_join_requests()

If we skip a subrequest due to a zero refcount, we should still count
the byte range that it covered so that we accurately reconstruct the
original request size.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 36d34a4c86bd..f68083db63c8 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -504,8 +504,12 @@ try_again:
 	for (subreq = head->wb_this_page; subreq != head;
 			subreq = subreq->wb_this_page) {
 
-		if (!kref_get_unless_zero(&subreq->wb_kref))
+		if (!kref_get_unless_zero(&subreq->wb_kref)) {
+			if (subreq->wb_offset == head->wb_offset + total_bytes)
+				total_bytes += subreq->wb_bytes;
 			continue;
+		}
+
 		while (!nfs_lock_request(subreq)) {
 			/*
 			 * Unlock page to allow nfs_page_group_sync_on_bit()
-- 
cgit v1.2.3-59-g8ed1b