From 73a3d07c1082145a3b78407bb5252df290470c4c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:47 -0400
Subject: NFS: Clean up inode metadata updates

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_xdr.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7fafc4c546b7..c483e239f993 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -57,8 +57,7 @@ struct nfs_fattr {
 #define NFS_ATTR_WCC		0x0001		/* pre-op WCC data    */
 #define NFS_ATTR_FATTR		0x0002		/* post-op attributes */
 #define NFS_ATTR_FATTR_V3	0x0004		/* NFSv3 attributes */
-#define NFS_ATTR_FATTR_V4	0x0008
-#define NFS_ATTR_PRE_CHANGE	0x0010
+#define NFS_ATTR_FATTR_V4	0x0008		/* NFSv4 change attribute */
 
 /*
  * Info on the file system
-- 
cgit v1.2.3-59-g8ed1b


From 0d0b5cb36faf7002a11736032313f06d6f3d881c Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Thu, 25 May 2006 01:40:53 -0400
Subject: NFS: Optimize allocation of nfs_read/write_data structures

Clean up use of page_array, and fix an off-by-one error noticed by Tom
Talpey which causes kmalloc calls in cases where using the page_array
is sufficient.

Test plan:
Normal client functional testing with r/wsize=32768.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/read.c           | 11 ++++-------
 fs/nfs/write.c          | 18 +++++++-----------
 include/linux/nfs_xdr.h |  4 ++--
 3 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4b5f58da5650..fd9018c692bb 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -51,14 +51,11 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kmalloc(size, GFP_NOFS);
-			if (p->pagevec) {
-				memset(p->pagevec, 0, size);
-			} else {
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			if (!p->pagevec) {
 				mempool_free(p, nfs_rdata_mempool);
 				p = NULL;
 			}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4cfada2cc09f..a515ec714bb6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -98,11 +98,10 @@ struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kzalloc(size, GFP_NOFS);
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
 			if (!p->pagevec) {
 				mempool_free(p, nfs_commit_mempool);
 				p = NULL;
@@ -126,14 +125,11 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kmalloc(size, GFP_NOFS);
-			if (p->pagevec) {
-				memset(p->pagevec, 0, size);
-			} else {
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			if (!p->pagevec) {
 				mempool_free(p, nfs_wdata_mempool);
 				p = NULL;
 			}
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index c483e239f993..e206c07080fe 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -694,7 +694,7 @@ struct nfs_read_data {
 #ifdef CONFIG_NFS_V4
 	unsigned long		timestamp;	/* For lease renewal */
 #endif
-	struct page		*page_array[NFS_PAGEVEC_SIZE + 1];
+	struct page		*page_array[NFS_PAGEVEC_SIZE];
 };
 
 struct nfs_write_data {
@@ -712,7 +712,7 @@ struct nfs_write_data {
 #ifdef CONFIG_NFS_V4
 	unsigned long		timestamp;	/* For lease renewal */
 #endif
-	struct page		*page_array[NFS_PAGEVEC_SIZE + 1];
+	struct page		*page_array[NFS_PAGEVEC_SIZE];
 };
 
 struct nfs_access_entry;
-- 
cgit v1.2.3-59-g8ed1b


From 38478b24e37587f1c4fedf8ac070ca54f052ed28 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:57 -0400
Subject: NFS: More page cache revalidation fixups

Whenever the directory changes, we want to make sure that we always
invalidate its page cache. Fix up update_changeattr() and
nfs_mark_for_revalidate() so that they do so.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c      | 10 +++++-----
 include/linux/nfs_fs.h |  6 +++++-
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e38a84874492..ef4c6cccf958 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -185,15 +185,15 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
 	spin_unlock(&clp->cl_lock);
 }
 
-static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinfo)
+static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_inode *nfsi = NFS_I(dir);
 
-	spin_lock(&inode->i_lock);
-	nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+	spin_lock(&dir->i_lock);
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
 	if (cinfo->before == nfsi->change_attr && cinfo->atomic)
 		nfsi->change_attr = cinfo->after;
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&dir->i_lock);
 }
 
 struct nfs4_opendata {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c71227dd4389..1d81e7d82970 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -234,8 +234,12 @@ static inline int nfs_caches_unstable(struct inode *inode)
 
 static inline void nfs_mark_for_revalidate(struct inode *inode)
 {
+	struct nfs_inode *nfsi = NFS_I(inode);
+
 	spin_lock(&inode->i_lock);
-	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS;
+	if (S_ISDIR(inode->i_mode))
+		nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
 	spin_unlock(&inode->i_lock);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 44b11874ff583b6e766a05856b04f3c492c32b84 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:59 -0400
Subject: NFS: Separate metadata and page cache revalidation mechanisms

Separate out the function of revalidating the inode metadata, and
revalidating the mapping. The former may be called by lookup(),
and only really needs to check that permissions, ctime, etc. haven't changed,
whereas the latter only needs to be done when we want to read data from the
page cache, and may need to sync and then invalidate the mapping.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           |  2 +-
 fs/nfs/file.c          | 24 +++---------------------
 fs/nfs/inode.c         | 16 +++++++++++-----
 fs/nfs/symlink.c       |  2 +-
 include/linux/nfs_fs.h |  2 +-
 5 files changed, 17 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cae74dd4c7f5..1d3d8922a663 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -528,7 +528,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	lock_kernel();
 
-	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (res < 0) {
 		unlock_kernel();
 		return res;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index fade02c15e6e..63154070145a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -126,23 +126,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
 	return NFS_PROTO(inode)->file_release(inode, filp);
 }
 
-/**
- * nfs_revalidate_file - Revalidate the page cache & related metadata
- * @inode - pointer to inode struct
- * @file - pointer to file
- */
-static int nfs_revalidate_file(struct inode *inode, struct file *filp)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	int retval = 0;
-
-	if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR))
-			|| nfs_attribute_timeout(inode))
-		retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	nfs_revalidate_mapping(inode, filp->f_mapping);
-	return 0;
-}
-
 /**
  * nfs_revalidate_size - Revalidate the file size
  * @inode - pointer to inode struct
@@ -228,7 +211,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long) pos);
 
-	result = nfs_revalidate_file(inode, iocb->ki_filp);
+	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 	nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
 	if (!result)
 		result = generic_file_aio_read(iocb, buf, count, pos);
@@ -247,7 +230,7 @@ nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count,
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long long) *ppos);
 
-	res = nfs_revalidate_file(inode, filp);
+	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (!res)
 		res = generic_file_sendfile(filp, ppos, count, actor, target);
 	return res;
@@ -263,7 +246,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 	dfprintk(VFS, "nfs: mmap(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	status = nfs_revalidate_file(inode, file);
+	status = nfs_revalidate_mapping(inode, file->f_mapping);
 	if (!status)
 		status = generic_file_mmap(file, vma);
 	return status;
@@ -373,7 +356,6 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
 		if (result)
 			goto out;
 	}
-	nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 
 	result = count;
 	if (!count)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index eddd0e982d23..69036ef39866 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1220,7 +1220,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		status = -ESTALE;
 		/* Do we trust the cached ESTALE? */
 		if (NFS_ATTRTIMEO(inode) != 0) {
-			if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) {
+			if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) {
 				/* no */
 			} else
 				goto out;
@@ -1251,8 +1251,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	}
 	spin_unlock(&inode->i_lock);
 
-	nfs_revalidate_mapping(inode, inode->i_mapping);
-
 	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
 		nfs_zap_acl_cache(inode);
 
@@ -1287,7 +1285,7 @@ int nfs_attribute_timeout(struct inode *inode)
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
 	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
-	if (!(NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+	if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
 			&& !nfs_attribute_timeout(inode))
 		return NFS_STALE(inode) ? -ESTALE : 0;
 	return __nfs_revalidate_inode(server, inode);
@@ -1298,9 +1296,16 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
  * @inode - pointer to host inode
  * @mapping - pointer to mapping
  */
-void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int ret = 0;
+
+	if (NFS_STALE(inode))
+		ret = -ESTALE;
+	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+			|| nfs_attribute_timeout(inode))
+		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 
 	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
 		nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
@@ -1321,6 +1326,7 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 				inode->i_sb->s_id,
 				(long long)NFS_FILEID(inode));
 	}
+	return ret;
 }
 
 /**
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 18dc95b0b646..636c479995bc 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -52,7 +52,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
 	struct page *page;
-	void *err = ERR_PTR(nfs_revalidate_inode(NFS_SERVER(inode), inode));
+	void *err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
 	if (err)
 		goto read_failed;
 	page = read_cache_page(&inode->i_data, 0,
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1d81e7d82970..1b524b9f982a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -301,7 +301,7 @@ extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
-extern void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
+extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
 extern void nfs_begin_attr_update(struct inode *);
-- 
cgit v1.2.3-59-g8ed1b


From d2ccddf042c403b146159beea438c6bfc4a445e2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 31 May 2006 01:13:38 -0400
Subject: NFS: Flesh out nfs_invalidate_page()

In the case of a call to truncate_inode_pages(), we should really try to
cancel any pending writes on the page.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c            |  6 +++++-
 fs/nfs/pagelist.c        | 47 ++++++++++++++++++++++++++++-------------------
 fs/nfs/write.c           | 27 ++++++++++++++++++++++++---
 include/linux/nfs_fs.h   |  1 +
 include/linux/nfs_page.h |  4 ++--
 5 files changed, 60 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 63154070145a..106ef0dec04d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -303,7 +303,11 @@ static int nfs_commit_write(struct file *file, struct page *page, unsigned offse
 
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
 {
-	/* FIXME: we really should cancel any unstarted writes on this page */
+	struct inode *inode = page->mapping->host;
+
+	/* Cancel any unstarted writes on this page */
+	if (offset == 0)
+		nfs_sync_inode_wait(inode, page->index, 1, FLUSH_INVALIDATE);
 }
 
 static int nfs_release_page(struct page *page, gfp_t gfp)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 106aca388ebc..656481c0daa3 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -325,6 +325,7 @@ out:
 
 /**
  * nfs_scan_list - Scan a list for matching requests
+ * @nfsi: NFS inode
  * @head: One of the NFS inode request lists
  * @dst: Destination list
  * @idx_start: lower bound of page->index to scan
@@ -336,14 +337,15 @@ out:
  * The requests are *not* checked to ensure that they form a contiguous set.
  * You must be holding the inode's req_lock when calling this function
  */
-int
-nfs_scan_list(struct list_head *head, struct list_head *dst,
-	      unsigned long idx_start, unsigned int npages)
+int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
+		struct list_head *dst, unsigned long idx_start,
+		unsigned int npages)
 {
-	struct list_head	*pos, *tmp;
-	struct nfs_page		*req;
-	unsigned long		idx_end;
-	int			res;
+	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
+	struct nfs_page *req;
+	unsigned long idx_end;
+	int found, i;
+	int res;
 
 	res = 0;
 	if (npages == 0)
@@ -351,21 +353,28 @@ nfs_scan_list(struct list_head *head, struct list_head *dst,
 	else
 		idx_end = idx_start + npages - 1;
 
-	list_for_each_safe(pos, tmp, head) {
-
-		req = nfs_list_entry(pos);
-
-		if (req->wb_index < idx_start)
-			continue;
-		if (req->wb_index > idx_end)
+	for (;;) {
+		found = radix_tree_gang_lookup(&nfsi->nfs_page_tree,
+				(void **)&pgvec[0], idx_start,
+				NFS_SCAN_MAXENTRIES);
+		if (found <= 0)
 			break;
+		for (i = 0; i < found; i++) {
+			req = pgvec[i];
+			if (req->wb_index > idx_end)
+				goto out;
+			idx_start = req->wb_index + 1;
+			if (req->wb_list_head != head)
+				continue;
+			if (nfs_set_page_writeback_locked(req)) {
+				nfs_list_remove_request(req);
+				nfs_list_add_request(req, dst);
+				res++;
+			}
+		}
 
-		if (!nfs_set_page_writeback_locked(req))
-			continue;
-		nfs_list_remove_request(req);
-		nfs_list_add_request(req, dst);
-		res++;
 	}
+out:
 	return res;
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a515ec714bb6..e03abbd8302e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -579,6 +579,17 @@ static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, un
 	return ret;
 }
 
+static void nfs_cancel_requests(struct list_head *head)
+{
+	struct nfs_page *req;
+	while(!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_inode_remove_request(req);
+		nfs_clear_page_writeback(req);
+	}
+}
+
 /*
  * nfs_scan_dirty - Scan an inode for dirty requests
  * @inode: NFS inode to scan
@@ -623,7 +634,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
 	int res = 0;
 
 	if (nfsi->ncommit != 0) {
-		res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages);
+		res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages);
 		nfsi->ncommit -= res;
 		if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 			printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -1491,15 +1502,25 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
 		pages = nfs_scan_dirty(inode, &head, idx_start, npages);
 		if (pages != 0) {
 			spin_unlock(&nfsi->req_lock);
-			ret = nfs_flush_list(inode, &head, pages, how);
+			if (how & FLUSH_INVALIDATE)
+				nfs_cancel_requests(&head);
+			else
+				ret = nfs_flush_list(inode, &head, pages, how);
 			spin_lock(&nfsi->req_lock);
 			continue;
 		}
 		if (nocommit)
 			break;
-		pages = nfs_scan_commit(inode, &head, 0, 0);
+		pages = nfs_scan_commit(inode, &head, idx_start, npages);
 		if (pages == 0)
 			break;
+		if (how & FLUSH_INVALIDATE) {
+			spin_unlock(&nfsi->req_lock);
+			nfs_cancel_requests(&head);
+			spin_lock(&nfsi->req_lock);
+			continue;
+		}
+		pages += nfs_scan_commit(inode, &head, 0, 0);
 		spin_unlock(&nfsi->req_lock);
 		ret = nfs_commit_list(inode, &head, how);
 		spin_lock(&nfsi->req_lock);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1b524b9f982a..fc48135621ed 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -61,6 +61,7 @@
 #define FLUSH_LOWPRI		8	/* low priority background flush */
 #define FLUSH_HIGHPRI		16	/* high priority memory reclaim flush */
 #define FLUSH_NOCOMMIT		32	/* Don't send the NFSv3/v4 COMMIT */
+#define FLUSH_INVALIDATE	64	/* Invalidate the page cache */
 
 #ifdef __KERNEL__
 
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 66e2ed658527..8cadb0a77a7a 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -63,8 +63,8 @@ extern	void nfs_release_request(struct nfs_page *req);
 
 extern  int nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst,
 				unsigned long idx_start, unsigned int npages);
-extern	int nfs_scan_list(struct list_head *, struct list_head *,
-			  unsigned long, unsigned int);
+extern	int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, struct list_head *dst,
+			  unsigned long idx_start, unsigned int npages);
 extern	int nfs_coalesce_requests(struct list_head *, struct list_head *,
 				  unsigned int);
 extern  int nfs_wait_on_request(struct nfs_page *);
-- 
cgit v1.2.3-59-g8ed1b


From bb4a58bf46473e3e83d84054bbc110db3a0f85e4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:15 -0400
Subject: VFS: Add GPL_EXPORTED function vfs_kern_mount()

do_kern_mount() does not allow the kernel to use private mount interfaces
without exposing the same interfaces to userland. The problem is that the
filesystem is referenced by name, which means that both it and its mount
interface must be registered in the global filesystem list.

vfs_kern_mount() passes the struct file_system_type as an explicit
parameter in order to overcome this limitation.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/super.c            | 22 +++++++++++++++-------
 include/linux/mount.h |  5 +++++
 2 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/super.c b/fs/super.c
index a66f66bb8049..848be4fc67a2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -800,17 +800,13 @@ struct super_block *get_sb_single(struct file_system_type *fs_type,
 EXPORT_SYMBOL(get_sb_single);
 
 struct vfsmount *
-do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
-	struct file_system_type *type = get_fs_type(fstype);
 	struct super_block *sb = ERR_PTR(-ENOMEM);
 	struct vfsmount *mnt;
 	int error;
 	char *secdata = NULL;
 
-	if (!type)
-		return ERR_PTR(-ENODEV);
-
 	mnt = alloc_vfsmnt(name);
 	if (!mnt)
 		goto out;
@@ -841,7 +837,6 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
 	mnt->mnt_parent = mnt;
 	up_write(&sb->s_umount);
 	free_secdata(secdata);
-	put_filesystem(type);
 	return mnt;
 out_sb:
 	up_write(&sb->s_umount);
@@ -852,10 +847,23 @@ out_free_secdata:
 out_mnt:
 	free_vfsmnt(mnt);
 out:
-	put_filesystem(type);
 	return (struct vfsmount *)sb;
 }
 
+EXPORT_SYMBOL_GPL(vfs_kern_mount);
+
+struct vfsmount *
+do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+{
+	struct file_system_type *type = get_fs_type(fstype);
+	struct vfsmount *mnt;
+	if (!type)
+		return ERR_PTR(-ENODEV);
+	mnt = vfs_kern_mount(type, flags, name, data);
+	put_filesystem(type);
+	return mnt;
+}
+
 EXPORT_SYMBOL_GPL(do_kern_mount);
 
 struct vfsmount *kern_mount(struct file_system_type *type)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index b7472ae91fa4..aff68c3660f5 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -73,6 +73,11 @@ extern struct vfsmount *alloc_vfsmnt(const char *name);
 extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
 				      const char *name, void *data);
 
+struct file_system_type;
+extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
+				      int flags, const char *name,
+				      void *data);
+
 struct nameidata;
 
 extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
-- 
cgit v1.2.3-59-g8ed1b


From 1f5ce9e93aa96a867f195ed45f6f77935175f12e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:16 -0400
Subject: VFS: Unexport do_kern_mount() and clean up simple_pin_fs()

Replace all module uses with the new vfs_kern_mount() interface, and fix up
simple_pin_fs().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 Documentation/filesystems/automount-support.txt | 2 +-
 drivers/usb/core/inode.c                        | 2 +-
 fs/afs/mntpt.c                                  | 2 +-
 fs/afs/super.c                                  | 2 +-
 fs/afs/super.h                                  | 2 ++
 fs/binfmt_misc.c                                | 3 ++-
 fs/configfs/mount.c                             | 2 +-
 fs/debugfs/inode.c                              | 2 +-
 fs/libfs.c                                      | 4 ++--
 fs/super.c                                      | 4 +---
 include/linux/fs.h                              | 2 +-
 mm/shmem.c                                      | 2 +-
 net/sunrpc/rpc_pipe.c                           | 2 +-
 security/inode.c                                | 2 +-
 14 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/automount-support.txt b/Documentation/filesystems/automount-support.txt
index 58c65a1713e5..7cac200e2a85 100644
--- a/Documentation/filesystems/automount-support.txt
+++ b/Documentation/filesystems/automount-support.txt
@@ -19,7 +19,7 @@ following procedure:
 
  (2) Have the follow_link() op do the following steps:
 
-     (a) Call do_kern_mount() to call the appropriate filesystem to set up a
+     (a) Call vfs_kern_mount() to call the appropriate filesystem to set up a
          superblock and gain a vfsmount structure representing it.
 
      (b) Copy the nameidata provided as an argument and substitute the dentry
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 3cf945cc5b9a..695b90a17a68 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -569,7 +569,7 @@ static int create_special_files (void)
 	ignore_mount = 1;
 
 	/* create the devices special file */
-	retval = simple_pin_fs("usbfs", &usbfs_mount, &usbfs_mount_count);
+	retval = simple_pin_fs(&usb_fs_type, &usbfs_mount, &usbfs_mount_count);
 	if (retval) {
 		err ("Unable to get usbfs mount");
 		goto exit;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 4e6eeb59b83c..7b6dc03caf44 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -210,7 +210,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 
 	/* try and do the mount */
 	kdebug("--- attempting mount %s -o %s ---", devname, options);
-	mnt = do_kern_mount("afs", 0, devname, options);
+	mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
 	kdebug("--- mount result %p ---", mnt);
 
 	free_page((unsigned long) devname);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 53c56e7231ab..93a7821db0d7 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,7 @@ static void afs_put_super(struct super_block *sb);
 
 static void afs_destroy_inode(struct inode *inode);
 
-static struct file_system_type afs_fs_type = {
+struct file_system_type afs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "afs",
 	.get_sb		= afs_get_sb,
diff --git a/fs/afs/super.h b/fs/afs/super.h
index ac11362f4e95..32de8cc6fae8 100644
--- a/fs/afs/super.h
+++ b/fs/afs/super.h
@@ -38,6 +38,8 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+extern struct file_system_type afs_fs_type;
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index d73d75591a39..c0a909e1d290 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -55,6 +55,7 @@ typedef struct {
 } Node;
 
 static DEFINE_RWLOCK(entries_lock);
+static struct file_system_type bm_fs_type;
 static struct vfsmount *bm_mnt;
 static int entry_count;
 
@@ -638,7 +639,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	if (!inode)
 		goto out2;
 
-	err = simple_pin_fs("binfmt_misc", &bm_mnt, &entry_count);
+	err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
 	if (err) {
 		iput(inode);
 		inode = NULL;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f920d30478e5..be5d86ae56f0 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -118,7 +118,7 @@ static struct file_system_type configfs_fs_type = {
 
 int configfs_pin_fs(void)
 {
-	return simple_pin_fs("configfs", &configfs_mount,
+	return simple_pin_fs(&configfs_fs_type, &configfs_mount,
 			     &configfs_mnt_count);
 }
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b55b4ea9a676..90f9417181fd 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -199,7 +199,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
 
 	pr_debug("debugfs: creating file '%s'\n",name);
 
-	error = simple_pin_fs("debugfs", &debugfs_mount, &debugfs_mount_count);
+	error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
 	if (error)
 		goto exit;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 7145ba7a48d0..4a3ec9ad8bed 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -424,13 +424,13 @@ out:
 
 static DEFINE_SPINLOCK(pin_fs_lock);
 
-int simple_pin_fs(char *name, struct vfsmount **mount, int *count)
+int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
 {
 	struct vfsmount *mnt = NULL;
 	spin_lock(&pin_fs_lock);
 	if (unlikely(!*mount)) {
 		spin_unlock(&pin_fs_lock);
-		mnt = do_kern_mount(name, 0, name, NULL);
+		mnt = vfs_kern_mount(type, 0, type->name, NULL);
 		if (IS_ERR(mnt))
 			return PTR_ERR(mnt);
 		spin_lock(&pin_fs_lock);
diff --git a/fs/super.c b/fs/super.c
index 848be4fc67a2..15f2afdbf82e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -864,11 +864,9 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
 	return mnt;
 }
 
-EXPORT_SYMBOL_GPL(do_kern_mount);
-
 struct vfsmount *kern_mount(struct file_system_type *type)
 {
-	return do_kern_mount(type->name, 0, type->name, NULL);
+	return vfs_kern_mount(type, 0, type->name, NULL);
 }
 
 EXPORT_SYMBOL(kern_mount);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f813bc8266aa..eca70cfe5b85 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1763,7 +1763,7 @@ extern struct inode_operations simple_dir_inode_operations;
 struct tree_descr { char *name; const struct file_operations *ops; int mode; };
 struct dentry *d_alloc_name(struct dentry *, const char *);
 extern int simple_fill_super(struct super_block *, int, struct tree_descr *);
-extern int simple_pin_fs(char *name, struct vfsmount **mount, int *count);
+extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
 extern void simple_release_fs(struct vfsmount **mount, int *count);
 
 extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t);
diff --git a/mm/shmem.c b/mm/shmem.c
index 4c5e68e4e9ae..8184342440f0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2261,7 +2261,7 @@ static int __init init_tmpfs(void)
 #ifdef CONFIG_TMPFS
 	devfs_mk_dir("shm");
 #endif
-	shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER,
+	shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
 				tmpfs_fs_type.name, NULL);
 	if (IS_ERR(shm_mnt)) {
 		error = PTR_ERR(shm_mnt);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index cc673dd8433f..a5226df8ac03 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -439,7 +439,7 @@ struct vfsmount *rpc_get_mount(void)
 {
 	int err;
 
-	err = simple_pin_fs("rpc_pipefs", &rpc_mount, &rpc_mount_count);
+	err = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mount, &rpc_mount_count);
 	if (err != 0)
 		return ERR_PTR(err);
 	return rpc_mount;
diff --git a/security/inode.c b/security/inode.c
index 0f77b0223662..8bf40625c670 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -224,7 +224,7 @@ struct dentry *securityfs_create_file(const char *name, mode_t mode,
 
 	pr_debug("securityfs: creating file '%s'\n",name);
 
-	error = simple_pin_fs("securityfs", &mount, &mount_count);
+	error = simple_pin_fs(&fs_type, &mount, &mount_count);
 	if (error) {
 		dentry = ERR_PTR(error);
 		goto exit;
-- 
cgit v1.2.3-59-g8ed1b


From 5528f911b4c43a5de5da34bcbd7e3f2a62503617 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:17 -0400
Subject: VFS: Add shrink_submounts()

Allow a submount to be marked as being 'shrinkable' by means of the
vfsmount->mnt_flags, and then add a function 'shrink_submounts()' which
attempts to recursively unmount these submounts.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/namespace.c        | 124 ++++++++++++++++++++++++++++++++++++++++----------
 include/linux/mount.h |   3 ++
 2 files changed, 102 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index bf478addb852..b22e469ab560 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1162,6 +1162,40 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
 	}
 }
 
+/*
+ * go through the vfsmounts we've just consigned to the graveyard to
+ * - check that they're still dead
+ * - delete the vfsmount from the appropriate namespace under lock
+ * - dispose of the corpse
+ */
+static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts)
+{
+	struct namespace *namespace;
+	struct vfsmount *mnt;
+
+	while (!list_empty(graveyard)) {
+		LIST_HEAD(umounts);
+		mnt = list_entry(graveyard->next, struct vfsmount, mnt_expire);
+		list_del_init(&mnt->mnt_expire);
+
+		/* don't do anything if the namespace is dead - all the
+		 * vfsmounts from it are going away anyway */
+		namespace = mnt->mnt_namespace;
+		if (!namespace || !namespace->root)
+			continue;
+		get_namespace(namespace);
+
+		spin_unlock(&vfsmount_lock);
+		down_write(&namespace_sem);
+		expire_mount(mnt, mounts, &umounts);
+		up_write(&namespace_sem);
+		release_mounts(&umounts);
+		mntput(mnt);
+		put_namespace(namespace);
+		spin_lock(&vfsmount_lock);
+	}
+}
+
 /*
  * process a list of expirable mountpoints with the intent of discarding any
  * mountpoints that aren't in use and haven't been touched since last we came
@@ -1169,7 +1203,6 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
  */
 void mark_mounts_for_expiry(struct list_head *mounts)
 {
-	struct namespace *namespace;
 	struct vfsmount *mnt, *next;
 	LIST_HEAD(graveyard);
 
@@ -1193,38 +1226,79 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		list_move(&mnt->mnt_expire, &graveyard);
 	}
 
-	/*
-	 * go through the vfsmounts we've just consigned to the graveyard to
-	 * - check that they're still dead
-	 * - delete the vfsmount from the appropriate namespace under lock
-	 * - dispose of the corpse
-	 */
-	while (!list_empty(&graveyard)) {
-		LIST_HEAD(umounts);
-		mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire);
-		list_del_init(&mnt->mnt_expire);
+	expire_mount_list(&graveyard, mounts);
 
-		/* don't do anything if the namespace is dead - all the
-		 * vfsmounts from it are going away anyway */
-		namespace = mnt->mnt_namespace;
-		if (!namespace || !namespace->root)
+	spin_unlock(&vfsmount_lock);
+}
+
+EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+
+/*
+ * Ripoff of 'select_parent()'
+ *
+ * search the list of submounts for a given mountpoint, and move any
+ * shrinkable submounts to the 'graveyard' list.
+ */
+static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
+{
+	struct vfsmount *this_parent = parent;
+	struct list_head *next;
+	int found = 0;
+
+repeat:
+	next = this_parent->mnt_mounts.next;
+resume:
+	while (next != &this_parent->mnt_mounts) {
+		struct list_head *tmp = next;
+		struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);
+
+		next = tmp->next;
+		if (!(mnt->mnt_flags & MNT_SHRINKABLE))
 			continue;
-		get_namespace(namespace);
+		/*
+		 * Descend a level if the mnt_mounts list is non-empty.
+		 */
+		if (!list_empty(&mnt->mnt_mounts)) {
+			this_parent = mnt;
+			goto repeat;
+		}
 
-		spin_unlock(&vfsmount_lock);
-		down_write(&namespace_sem);
-		expire_mount(mnt, mounts, &umounts);
-		up_write(&namespace_sem);
-		release_mounts(&umounts);
-		mntput(mnt);
-		put_namespace(namespace);
-		spin_lock(&vfsmount_lock);
+		if (!propagate_mount_busy(mnt, 1)) {
+			mntget(mnt);
+			list_move_tail(&mnt->mnt_expire, graveyard);
+			found++;
+		}
 	}
+	/*
+	 * All done at this level ... ascend and resume the search
+	 */
+	if (this_parent != parent) {
+		next = this_parent->mnt_child.next;
+		this_parent = this_parent->mnt_parent;
+		goto resume;
+	}
+	return found;
+}
+
+/*
+ * process a list of expirable mountpoints with the intent of discarding any
+ * submounts of a specific parent mountpoint
+ */
+void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts)
+{
+	LIST_HEAD(graveyard);
+	int found;
+
+	spin_lock(&vfsmount_lock);
+
+	/* extract submounts of 'mountpoint' from the expiration list */
+	while ((found = select_submounts(mountpoint, &graveyard)) != 0)
+		expire_mount_list(&graveyard, mounts);
 
 	spin_unlock(&vfsmount_lock);
 }
 
-EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+EXPORT_SYMBOL_GPL(shrink_submounts);
 
 /*
  * Some copy_from_user() implementations do not return the exact number of
diff --git a/include/linux/mount.h b/include/linux/mount.h
index aff68c3660f5..9b4e0071b92e 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -23,6 +23,8 @@
 #define MNT_NOATIME	0x08
 #define MNT_NODIRATIME	0x10
 
+#define MNT_SHRINKABLE	0x100
+
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
 #define MNT_PNODE_MASK	0x3000	/* propogation flag mask */
@@ -84,6 +86,7 @@ extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
 			int mnt_flags, struct list_head *fslist);
 
 extern void mark_mounts_for_expiry(struct list_head *mounts);
+extern void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts);
 
 extern spinlock_t vfsmount_lock;
 extern dev_t name_to_dev_t(char *name);
-- 
cgit v1.2.3-59-g8ed1b


From 8b512d9a88875affe584bb3d2a7a235f84343b9e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:18 -0400
Subject: VFS: Remove dependency of ->umount_begin() call on MNT_FORCE

Allow filesystems to decide to perform pre-umount processing whether or not
MNT_FORCE is set.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/9p/vfs_super.c  |  7 ++++---
 fs/cifs/cifsfs.c   |  6 ++++--
 fs/fuse/inode.c    |  5 +++--
 fs/namespace.c     |  4 ++--
 fs/nfs/inode.c     | 14 +++++++++-----
 include/linux/fs.h |  2 +-
 6 files changed, 23 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 61c599b4a1e3..00c1f6baf870 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -253,11 +253,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 }
 
 static void
-v9fs_umount_begin(struct super_block *sb)
+v9fs_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	struct v9fs_session_info *v9ses = vfsmnt->mnt_sb->s_fs_info;
 
-	v9fs_session_cancel(v9ses);
+	if (flags & MNT_FORCE)
+		v9fs_session_cancel(v9ses);
 }
 
 static struct super_operations v9fs_super_ops = {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c262d8874ce9..3fdc2258f447 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -402,12 +402,14 @@ static struct quotactl_ops cifs_quotactl_ops = {
 #endif
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-static void cifs_umount_begin(struct super_block * sblock)
+static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 {
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo * tcon;
 
-	cifs_sb = CIFS_SB(sblock);
+	if (!(flags & MNT_FORCE))
+		return;
+	cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
 	if(cifs_sb == NULL)
 		return;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7627022446b2..13ebe5780c93 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -195,9 +195,10 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
 	return inode;
 }
 
-static void fuse_umount_begin(struct super_block *sb)
+static void fuse_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	fuse_abort_conn(get_fuse_conn_super(sb));
+	if (flags & MNT_FORCE)
+		fuse_abort_conn(get_fuse_conn_super(vfsmnt->mnt_sb));
 }
 
 static void fuse_put_super(struct super_block *sb)
diff --git a/fs/namespace.c b/fs/namespace.c
index b22e469ab560..6bb0b85293e7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -576,8 +576,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	 */
 
 	lock_kernel();
-	if ((flags & MNT_FORCE) && sb->s_op->umount_begin)
-		sb->s_op->umount_begin(sb);
+	if (sb->s_op->umount_begin)
+		sb->s_op->umount_begin(mnt, flags);
 	unlock_kernel();
 
 	/*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9ff039f9a836..fda2b4966179 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -63,7 +63,7 @@ static struct inode *nfs_alloc_inode(struct super_block *sb);
 static void nfs_destroy_inode(struct inode *);
 static int nfs_write_inode(struct inode *,int);
 static void nfs_clear_inode(struct inode *);
-static void nfs_umount_begin(struct super_block *);
+static void nfs_umount_begin(struct vfsmount *, int);
 static int  nfs_statfs(struct super_block *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
@@ -162,15 +162,19 @@ nfs_clear_inode(struct inode *inode)
 	BUG_ON(atomic_read(&nfsi->data_updates) != 0);
 }
 
-void
-nfs_umount_begin(struct super_block *sb)
+static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	struct rpc_clnt	*rpc = NFS_SB(sb)->client;
+	struct nfs_server *server;
+	struct rpc_clnt	*rpc;
 
+	if (!(flags & MNT_FORCE))
+		return;
 	/* -EIO all pending I/O */
+	server = NFS_SB(vfsmnt->mnt_sb);
+	rpc = server->client;
 	if (!IS_ERR(rpc))
 		rpc_killall_tasks(rpc);
-	rpc = NFS_SB(sb)->client_acl;
+	rpc = server->client_acl;
 	if (!IS_ERR(rpc))
 		rpc_killall_tasks(rpc);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index eca70cfe5b85..1d80ba747484 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1101,7 +1101,7 @@ struct super_operations {
 	int (*statfs) (struct super_block *, struct kstatfs *);
 	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*clear_inode) (struct inode *);
-	void (*umount_begin) (struct super_block *);
+	void (*umount_begin) (struct vfsmount *, int);
 
 	int (*show_options)(struct seq_file *, struct vfsmount *);
 	int (*show_stats)(struct seq_file *, struct vfsmount *);
-- 
cgit v1.2.3-59-g8ed1b


From 8b4bdcf8995dd92b23d2ec22b32aee8fbbb50e1c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:19 -0400
Subject: NFS: Store the file system "fsid" value in the NFS super block.

This should enable us to detect if we are crossing a mountpoint in the
case where the server is exporting "nohide" mounts.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c            |  1 -
 fs/nfs/inode.c            |  8 ++++++++
 fs/nfs/nfs2xdr.c          |  3 ++-
 fs/nfs/nfs3xdr.c          |  3 ++-
 fs/nfs/nfs4xdr.c          |  4 ++--
 include/linux/nfs_fs.h    |  5 +++--
 include/linux/nfs_fs_sb.h |  1 +
 include/linux/nfs_page.h  |  1 -
 include/linux/nfs_xdr.h   | 19 ++++++++++++-------
 9 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 3fab5b0cfc5a..b81e7ed3c902 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -47,7 +47,6 @@
 #include <linux/workqueue.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 
-#include <linux/nfs_fs_sb.h>
 #include <linux/nfs_fs.h>
 
 #include <linux/nfs_idmap.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index fda2b4966179..1a809f6f8989 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -236,6 +236,7 @@ nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *f
 		return ERR_PTR(error);
 	}
 
+	server->fsid = fsinfo->fattr->fsid;
 	return nfs_fhget(sb, rootfh, fsinfo->fattr);
 }
 
@@ -1493,6 +1494,7 @@ out:
  */
 static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
+	struct nfs_server *server;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t cur_isize, new_isize;
 	unsigned int	invalid = 0;
@@ -1511,6 +1513,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		goto out_changed;
 
+	server = NFS_SERVER(inode);
+	/* Update the fsid if and only if this is the root directory */
+	if (inode == inode->i_sb->s_root->d_inode
+			&& !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		server->fsid = fattr->fsid;
+
 	/*
 	 * Update the read time so we don't revalidate too often.
 	 */
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index f0015fa876e1..a7ed88f97a11 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -131,7 +131,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	fattr->du.nfs2.blocksize = ntohl(*p++);
 	rdev = ntohl(*p++);
 	fattr->du.nfs2.blocks = ntohl(*p++);
-	fattr->fsid_u.nfs3 = ntohl(*p++);
+	fattr->fsid.major = ntohl(*p++);
+	fattr->fsid.minor = 0;
 	fattr->fileid = ntohl(*p++);
 	p = xdr_decode_time(p, &fattr->atime);
 	p = xdr_decode_time(p, &fattr->mtime);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index ec233619687e..f70eee2cac05 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -166,7 +166,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
 		fattr->rdev = 0;
 
-	p = xdr_decode_hyper(p, &fattr->fsid_u.nfs3);
+	p = xdr_decode_hyper(p, &fattr->fsid.major);
+	fattr->fsid.minor = 0;
 	p = xdr_decode_hyper(p, &fattr->fileid);
 	p = xdr_decode_time3(p, &fattr->atime);
 	p = xdr_decode_time3(p, &fattr->mtime);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7e9a840057f2..0d5794675944 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2217,7 +2217,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 	return 0;
 }
 
-static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fsid *fsid)
+static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
 {
 	uint32_t *p;
 
@@ -2863,7 +2863,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
 		goto xdr_error;
-	if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid_u.nfs4)) != 0)
+	if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
 		goto xdr_error;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index fc48135621ed..6763a0089ee4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -16,8 +16,6 @@
 #include <linux/rwsem.h>
 #include <linux/wait.h>
 
-#include <linux/nfs_fs_sb.h>
-
 #include <linux/sunrpc/debug.h>
 #include <linux/sunrpc/auth.h>
 #include <linux/sunrpc/clnt.h>
@@ -27,6 +25,9 @@
 #include <linux/nfs3.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_xdr.h>
+
+#include <linux/nfs_fs_sb.h>
+
 #include <linux/rwsem.h>
 #include <linux/mempool.h>
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 65dec21af774..6b4a13c79474 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -35,6 +35,7 @@ struct nfs_server {
 	char *			hostname;	/* remote hostname */
 	struct nfs_fh		fh;
 	struct sockaddr_in	addr;
+	struct nfs_fsid		fsid;
 	unsigned long		mount_time;	/* when this fs was mounted */
 #ifdef CONFIG_NFS_V4
 	/* Our own IP address, as a null-terminated string.
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 8cadb0a77a7a..1f7bd287c230 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -13,7 +13,6 @@
 #include <linux/list.h>
 #include <linux/pagemap.h>
 #include <linux/wait.h>
-#include <linux/nfs_fs_sb.h>
 #include <linux/sunrpc/auth.h>
 #include <linux/nfs_xdr.h>
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e206c07080fe..95682f7d738a 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -14,11 +14,19 @@
 #define NFS_DEF_FILE_IO_SIZE	(4096U)
 #define NFS_MIN_FILE_IO_SIZE	(1024U)
 
-struct nfs4_fsid {
-	__u64 major;
-	__u64 minor;
+struct nfs_fsid {
+	uint64_t		major;
+	uint64_t		minor;
 };
 
+/*
+ * Helper for checking equality between 2 fsids.
+ */
+static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid *b)
+{
+	return a->major == b->major && a->minor == b->minor;
+}
+
 struct nfs_fattr {
 	unsigned short		valid;		/* which fields are valid */
 	__u64			pre_size;	/* pre_op_attr.size	  */
@@ -40,10 +48,7 @@ struct nfs_fattr {
 		} nfs3;
 	} du;
 	dev_t			rdev;
-	union {
-		__u64		nfs3;		/* also nfs2 */
-		struct nfs4_fsid nfs4;
-	} fsid_u;
+	struct nfs_fsid		fsid;
 	__u64			fileid;
 	struct timespec		atime;
 	struct timespec		mtime;
-- 
cgit v1.2.3-59-g8ed1b


From 55a975937d40cac582e981ddc8ed783b3dcc043c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:19 -0400
Subject: NFS: Ensure the client submounts, when it crosses a server
 mountpoint.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Makefile        |   3 +-
 fs/nfs/dir.c           |  16 +++
 fs/nfs/inode.c         | 303 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/nfs/namespace.c     |  89 +++++++++++++++
 fs/nfs/nfs4_fs.h       |   1 +
 fs/nfs/nfs4proc.c      |   2 +-
 include/linux/nfs_fs.h |   9 ++
 7 files changed, 418 insertions(+), 5 deletions(-)
 create mode 100644 fs/nfs/namespace.c

(limited to 'include')

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ec61fd56a1a9..d9d494cee388 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -5,7 +5,8 @@
 obj-$(CONFIG_NFS_FS) += nfs.o
 
 nfs-y 			:= dir.o file.o inode.o nfs2xdr.o pagelist.o \
-			   proc.o read.o symlink.o unlink.o write.o
+			   proc.o read.o symlink.o unlink.o write.o \
+			   namespace.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o mount_clnt.o      
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1d3d8922a663..3ddda6f7ecc2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -868,6 +868,17 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
 	return (nd->intent.open.flags & O_EXCL) != 0;
 }
 
+static inline int nfs_reval_fsid(struct inode *dir,
+		struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+
+	if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		/* Revalidate fsid on root dir */
+		return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode);
+	return 0;
+}
+
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
 	struct dentry *res;
@@ -900,6 +911,11 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 		res = ERR_PTR(error);
 		goto out_unlock;
 	}
+	error = nfs_reval_fsid(dir, &fhandle, &fattr);
+	if (error < 0) {
+		res = ERR_PTR(error);
+		goto out_unlock;
+	}
 	inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
 	res = (struct dentry *)inode;
 	if (IS_ERR(res))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1a809f6f8989..47167ab64f5b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -221,6 +221,14 @@ nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
 	return nfs_block_bits(bsize, nrbitsp);
 }
 
+static inline void
+nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
+{
+	sb->s_maxbytes = (loff_t)maxfilesize;
+	if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
+		sb->s_maxbytes = MAX_LFS_FILESIZE;
+}
+
 /*
  * Obtain the root inode of the file system.
  */
@@ -331,9 +339,7 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
 	}
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
 
-	sb->s_maxbytes = fsinfo.maxfilesize;
-	if (sb->s_maxbytes > MAX_LFS_FILESIZE) 
-		sb->s_maxbytes = MAX_LFS_FILESIZE; 
+	nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
 
 	server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
 	server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
@@ -877,6 +883,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
 			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+			/* Deal with crossing mountpoints */
+			if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+				inode->i_op = &nfs_mountpoint_inode_operations;
+				inode->i_fop = NULL;
+			}
 		} else if (S_ISLNK(inode->i_mode))
 			inode->i_op = &nfs_symlink_inode_operations;
 		else
@@ -1650,6 +1661,160 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  * File system information
  */
 
+/*
+ * nfs_path - reconstruct the path given an arbitrary dentry
+ * @base - arbitrary string to prepend to the path
+ * @dentry - pointer to dentry
+ * @buffer - result buffer
+ * @buflen - length of buffer
+ *
+ * Helper function for constructing the path from the
+ * root dentry to an arbitrary hashed dentry.
+ *
+ * This is mainly for use in figuring out the path on the
+ * server side when automounting on top of an existing partition.
+ */
+static char *nfs_path(const char *base, const struct dentry *dentry,
+		      char *buffer, ssize_t buflen)
+{
+	char *end = buffer+buflen;
+	int namelen;
+
+	*--end = '\0';
+	buflen--;
+	spin_lock(&dcache_lock);
+	while (!IS_ROOT(dentry)) {
+		namelen = dentry->d_name.len;
+		buflen -= namelen + 1;
+		if (buflen < 0)
+			goto Elong;
+		end -= namelen;
+		memcpy(end, dentry->d_name.name, namelen);
+		*--end = '/';
+		dentry = dentry->d_parent;
+	}
+	spin_unlock(&dcache_lock);
+	namelen = strlen(base);
+	/* Strip off excess slashes in base string */
+	while (namelen > 0 && base[namelen - 1] == '/')
+		namelen--;
+	buflen -= namelen;
+	if (buflen < 0)
+		goto Elong;
+	end -= namelen;
+	memcpy(end, base, namelen);
+	return end;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+struct nfs_clone_mount {
+	const struct super_block *sb;
+	const struct dentry *dentry;
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
+};
+
+/*
+ * nfs_clone_generic_sb - set up the superblock for a submount
+ * @data - parent superblock, mountpoint dentry, fh and fattr of the new root
+ * @clone_client - version-specific callback that obtains the superblock
+ *
+ * Duplicates the parent's nfs_server, clones its RPC clients and
+ * instantiates the root dentry from @data->fh and @data->fattr.
+ * Returns the superblock, or an ERR_PTR on failure. If @clone_client
+ * hands back a superblock that already has a root, it is returned
+ * as-is and the duplicated nfs_server is freed.
+ */
+static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
+		struct super_block *(*clone_client)(struct nfs_server *, struct nfs_clone_mount *))
+{
+	struct nfs_server *server;
+	struct nfs_server *parent = NFS_SB(data->sb);
+	struct super_block *sb = ERR_PTR(-EINVAL);
+	void *err = ERR_PTR(-ENOMEM);
+	struct inode *root_inode;
+	struct nfs_fsinfo fsinfo;
+	int len;
+
+	server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
+	if (server == NULL)
+		goto out_err;
+	memcpy(server, parent, sizeof(*server));
+	len = strlen(parent->hostname) + 1;
+	server->hostname = kmalloc(len, GFP_KERNEL);
+	if (server->hostname == NULL)
+		goto free_server;
+	memcpy(server->hostname, parent->hostname, len);
+	server->fsid = data->fattr->fsid;
+	nfs_copy_fh(&server->fh, data->fh);
+	if (rpciod_up() != 0)
+		goto free_hostname;
+
+	sb = clone_client(server, data);
+	if (IS_ERR((err = sb)) || sb->s_root)
+		goto kill_rpciod;
+
+	sb->s_op = data->sb->s_op;
+	sb->s_blocksize = data->sb->s_blocksize;
+	sb->s_blocksize_bits = data->sb->s_blocksize_bits;
+	sb->s_maxbytes = data->sb->s_maxbytes;
+
+	server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+	err = ERR_PTR(-ENOMEM);
+	server->io_stats = nfs_alloc_iostats();
+	if (server->io_stats == NULL)
+		goto out_deactivate;
+
+	server->client = rpc_clone_client(parent->client);
+	if (IS_ERR((err = server->client)))
+		goto out_deactivate;
+	if (!IS_ERR(parent->client_sys)) {
+		server->client_sys = rpc_clone_client(parent->client_sys);
+		if (IS_ERR((err = server->client_sys)))
+			goto out_deactivate;
+	}
+	if (!IS_ERR(parent->client_acl)) {
+		server->client_acl = rpc_clone_client(parent->client_acl);
+		if (IS_ERR((err = server->client_acl)))
+			goto out_deactivate;
+	}
+	/*
+	 * 'err' still holds the last cloned rpc client here; reload it so
+	 * the failure paths below return an ERR_PTR rather than that pointer.
+	 * nfs_fhget() failures are reported as ERR_PTR values (nfs_lookup()
+	 * tests its result with IS_ERR()); a NULL return is mapped to -ENOMEM.
+	 */
+	root_inode = nfs_fhget(sb, data->fh, data->fattr);
+	err = root_inode ? (void *)root_inode : ERR_PTR(-ENOMEM);
+	if (IS_ERR(err))
+		goto out_deactivate;
+	err = ERR_PTR(-ENOMEM);
+	sb->s_root = d_alloc_root(root_inode);
+	if (!sb->s_root)
+		goto out_put_root;
+	fsinfo.fattr = data->fattr;
+	if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
+		nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
+	sb->s_root->d_op = server->rpc_ops->dentry_ops;
+	sb->s_flags |= MS_ACTIVE;
+	return sb;
+out_put_root:
+	iput(root_inode);
+out_deactivate:
+	up_write(&sb->s_umount);
+	deactivate_super(sb);
+	return (struct super_block *)err;
+kill_rpciod:
+	rpciod_down();
+free_hostname:
+	kfree(server->hostname);
+free_server:
+	kfree(server);
+out_err:
+	return (struct super_block *)err;
+}
+
 static int nfs_set_super(struct super_block *s, void *data)
 {
 	s->s_fs_info = data;
@@ -1807,6 +1953,31 @@ static struct file_system_type nfs_fs_type = {
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
+static struct super_block *nfs_clone_client(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	struct super_block *sb;
+
+	sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+	if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM))
+		lockd_up();
+	return sb;
+}
+
+static struct super_block *nfs_clone_nfs_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs_clone_client);
+}
+
+static struct file_system_type clone_nfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.get_sb		= nfs_clone_nfs_sb,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
 #ifdef CONFIG_NFS_V4
 
 static void nfs4_clear_inode(struct inode *);
@@ -2156,6 +2327,82 @@ static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
 module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
 		 &nfs_idmap_cache_timeout, 0644);
 
+/* Constructs the SERVER-side path */
+static inline char *nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
+{
+	return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
+}
+
+/*
+ * Returns a kmalloc()ed copy of the server-side path to @dentry,
+ * or an ERR_PTR on failure. The caller owns the returned string.
+ */
+static inline char *nfs4_dup_path(const struct dentry *dentry)
+{
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *path;
+
+	/* nfs4_path() writes through the buffer, so it must not be NULL */
+	if (page == NULL)
+		return ERR_PTR(-ENOMEM);
+	path = nfs4_path(dentry, page, PAGE_SIZE);
+	if (!IS_ERR(path)) {
+		int len = PAGE_SIZE + page - path;
+		char *tmp = path;
+
+		path = kmalloc(len, GFP_KERNEL);
+		if (path)
+			memcpy(path, tmp, len);
+		else
+			path = ERR_PTR(-ENOMEM);
+	}
+	free_page((unsigned long)page);
+	return path;
+}
+
+static struct super_block *nfs4_clone_client(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	const struct dentry *dentry = data->dentry;
+	struct nfs4_client *clp = server->nfs4_state;
+	struct super_block *sb;
+
+	server->mnt_path = nfs4_dup_path(dentry);
+	if (IS_ERR(server->mnt_path)) {
+		sb = (struct super_block *)server->mnt_path;
+		goto err;
+	}
+	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+	if (IS_ERR(sb) || sb->s_root)
+		goto free_path;
+	nfs4_server_capabilities(server, &server->fh);
+
+	down_write(&clp->cl_sem);
+	atomic_inc(&clp->cl_count);
+	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+	up_write(&clp->cl_sem);
+	return sb;
+free_path:
+	kfree(server->mnt_path);
+err:
+	server->mnt_path = NULL;
+	return sb;
+}
+
+static struct super_block *nfs_clone_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs4_clone_client);
+}
+
+static struct file_system_type clone_nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.get_sb		= nfs_clone_nfs4_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
 #define nfs4_init_once(nfsi) \
 	do { \
 		INIT_LIST_HEAD(&(nfsi)->open_states); \
@@ -2183,12 +2423,73 @@ static inline void unregister_nfs4fs(void)
 	nfs_unregister_sysctl();
 }
 #else
+#define nfs4_clone_client(a,b) ERR_PTR(-EINVAL)
 #define nfs4_init_once(nfsi) \
 	do { } while (0)
 #define register_nfs4fs() (0)
 #define unregister_nfs4fs()
 #endif
 
+static inline char *nfs_devname(const struct vfsmount *mnt_parent,
+			 const struct dentry *dentry,
+			 char *buffer, ssize_t buflen)
+{
+	return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen);
+}
+
+/**
+ * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - dentry in the parent filesystem that the submount goes on
+ * @fh - filehandle for new root dentry
+ * @fattr - attributes for new root inode
+ *
+ * Returns the new vfsmount, or an ERR_PTR on failure.
+ */
+struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
+		const struct dentry *dentry, struct nfs_fh *fh,
+		struct nfs_fattr *fattr)
+{
+	struct nfs_clone_mount mountdata = {
+		.sb = mnt_parent->mnt_sb,
+		.dentry = dentry,
+		.fh = fh,
+		.fattr = fattr,
+	};
+	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *devname;
+
+	dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
+			dentry->d_parent->d_name.name,
+			dentry->d_name.name);
+	if (page == NULL)
+		goto out;
+	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+	mnt = (struct vfsmount *)devname;
+	if (IS_ERR(devname))
+		goto free_page;
+	switch (NFS_SB(mnt_parent->mnt_sb)->rpc_ops->version) {
+		case 2:
+		case 3:
+			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, &mountdata);
+			break;
+#ifdef CONFIG_NFS_V4
+		/* clone_nfs4_fs_type lives inside the CONFIG_NFS_V4 section above */
+		case 4:
+			mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, &mountdata);
+			break;
+#endif
+		default:
+			BUG();
+	}
+free_page:
+	free_page((unsigned long)page);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
+
 extern int nfs_init_nfspagecache(void);
 extern void nfs_destroy_nfspagecache(void);
 extern int nfs_init_readpagecache(void);
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
new file mode 100644
index 000000000000..a155505c36f1
--- /dev/null
+++ b/fs/nfs/namespace.c
@@ -0,0 +1,89 @@
+/*
+ * linux/fs/nfs/namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * NFS namespace
+ */
+
+#include <linux/config.h>
+
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+/*
+ * nfs_follow_mountpoint - handle crossing a mountpoint on the server
+ * @dentry - dentry of mountpoint
+ * @nd - nameidata info
+ *
+ * When we encounter a mountpoint on the server, we want to set up
+ * a mountpoint on the client too, to prevent inode numbers from
+ * colliding, and to allow "df" to work properly.
+ * On NFSv4, we also want to allow for the fact that different
+ * filesystems may be migrated to different servers in a failover
+ * situation, and that different filesystems may want to use
+ * different security flavours.
+ */
+static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+{
+	struct vfsmount *mnt;
+	struct nfs_server *server = NFS_SERVER(dentry->d_inode);
+	struct dentry *parent;
+	struct nfs_fh fh;
+	struct nfs_fattr fattr;
+	int err;
+
+	BUG_ON(IS_ROOT(dentry));
+	dprintk("%s: enter\n", __FUNCTION__);
+	dput(nd->dentry);
+	nd->dentry = dget(dentry);
+	if (d_mountpoint(nd->dentry))
+		goto out_follow;
+	/* Look it up again */
+	parent = dget_parent(nd->dentry);
+	err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr);
+	dput(parent);
+	if (err != 0)
+		goto out_err;
+
+	mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
+	err = PTR_ERR(mnt);
+	if (IS_ERR(mnt))
+		goto out_err;
+
+	mntget(mnt);
+	err = do_add_mount(mnt, nd, nd->mnt->mnt_flags, NULL);
+	if (err < 0) {
+		mntput(mnt);
+		if (err == -EBUSY)
+			goto out_follow;
+		goto out_err;
+	}
+	mntput(nd->mnt);
+	dput(nd->dentry);
+	nd->mnt = mnt;
+	nd->dentry = dget(mnt->mnt_root);
+out:
+	dprintk("%s: done, returned %d\n", __FUNCTION__, err);
+	return ERR_PTR(err);
+out_err:
+	path_release(nd);
+	goto out;
+out_follow:
+	while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
+		;
+	err = 0;
+	goto out;
+}
+
+struct inode_operations nfs_mountpoint_inode_operations = {
+	.follow_link	= nfs_follow_mountpoint,
+	.getattr	= nfs_getattr,
+};
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0f5e4e7cddec..307832fd1a49 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -217,6 +217,7 @@ extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
+extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ef4c6cccf958..308407205e6c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1331,7 +1331,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 	return status;
 }
 
-static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
 	struct nfs4_exception exception = { };
 	int err;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 6763a0089ee4..0ce8704732c2 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -313,6 +313,10 @@ extern void nfs_end_data_update(struct inode *);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
+extern struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
+					const struct dentry *dentry,
+					struct nfs_fh *fh,
+					struct nfs_fattr *fattr);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern u32 root_nfs_parse_addr(char *name); /*__init*/
@@ -398,6 +402,11 @@ extern void nfs_unregister_sysctl(void);
 #define nfs_unregister_sysctl() do { } while(0)
 #endif
 
+/*
+ * linux/fs/nfs/namespace.c
+ */
+extern struct inode_operations nfs_mountpoint_inode_operations;
+
 /*
  * linux/fs/nfs/unlink.c
  */
-- 
cgit v1.2.3-59-g8ed1b


From 51d8fa6a109589d522c18a8e9bf3fb167a91b1bc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:20 -0400
Subject: NFS: Add timeout to submounts

Make automounted partitions expire using the mark_mounts_for_expiry()
function. The timeout is controlled via a sysctl.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         |  3 +++
 fs/nfs/namespace.c     | 25 ++++++++++++++++++++++++-
 fs/nfs/sysctl.c        | 10 ++++++++++
 include/linux/nfs_fs.h |  3 +++
 4 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 47167ab64f5b..3eea556d8f59 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -167,6 +167,7 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 	struct nfs_server *server;
 	struct rpc_clnt	*rpc;
 
+	shrink_submounts(vfsmnt, &nfs_automount_list);
 	if (!(flags & MNT_FORCE))
 		return;
 	/* -EIO all pending I/O */
@@ -1943,6 +1944,7 @@ static void nfs_kill_super(struct super_block *s)
 	nfs_free_iostats(server->io_stats);
 	kfree(server->hostname);
 	kfree(server);
+	nfs_release_automount_timer();
 }
 
 static struct file_system_type nfs_fs_type = {
@@ -2288,6 +2290,7 @@ static void nfs4_kill_super(struct super_block *sb)
 	nfs_free_iostats(server->io_stats);
 	kfree(server->hostname);
 	kfree(server);
+	nfs_release_automount_timer();
 }
 
 static struct file_system_type nfs4_fs_type = {
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index a155505c36f1..e426516c1116 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -18,6 +18,11 @@
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
+LIST_HEAD(nfs_automount_list);
+static void nfs_expire_automounts(void *list);
+static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list);
+int nfs_mountpoint_expiry_timeout = 500 * HZ;
+
 /*
  * nfs_follow_mountpoint - handle crossing a mountpoint on the server
  * @dentry - dentry of mountpoint
@@ -59,7 +64,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 		goto out_err;
 
 	mntget(mnt);
-	err = do_add_mount(mnt, nd, nd->mnt->mnt_flags, NULL);
+	err = do_add_mount(mnt, nd, nd->mnt->mnt_flags|MNT_SHRINKABLE, &nfs_automount_list);
 	if (err < 0) {
 		mntput(mnt);
 		if (err == -EBUSY)
@@ -70,6 +75,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	dput(nd->dentry);
 	nd->mnt = mnt;
 	nd->dentry = dget(mnt->mnt_root);
+	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
 out:
 	dprintk("%s: done, returned %d\n", __FUNCTION__, err);
 	return ERR_PTR(err);
@@ -87,3 +93,20 @@ struct inode_operations nfs_mountpoint_inode_operations = {
 	.follow_link	= nfs_follow_mountpoint,
 	.getattr	= nfs_getattr,
 };
+
+static void nfs_expire_automounts(void *data)
+{
+	struct list_head *list = (struct list_head *)data;
+
+	mark_mounts_for_expiry(list);
+	if (!list_empty(list))
+		schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+}
+
+void nfs_release_automount_timer(void)
+{
+	if (list_empty(&nfs_automount_list)) {
+		cancel_delayed_work(&nfs_automount_task);
+		flush_scheduled_work();
+	}
+}
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 4c486eb867ca..db61e51bb154 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_idmap.h>
+#include <linux/nfs_fs.h>
 
 #include "callback.h"
 
@@ -46,6 +47,15 @@ static ctl_table nfs_cb_sysctls[] = {
 		.strategy = &sysctl_jiffies,
 	},
 #endif
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nfs_mountpoint_timeout",
+		.data		= &nfs_mountpoint_expiry_timeout,
+		.maxlen		= sizeof(nfs_mountpoint_expiry_timeout),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 0ce8704732c2..a34b3ee443f1 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -405,7 +405,10 @@ extern void nfs_unregister_sysctl(void);
 /*
  * linux/fs/nfs/namespace.c
  */
+extern struct list_head nfs_automount_list;
 extern struct inode_operations nfs_mountpoint_inode_operations;
+extern int nfs_mountpoint_expiry_timeout;
+extern void nfs_release_automount_timer(void);
 
 /*
  * linux/fs/nfs/unlink.c
-- 
cgit v1.2.3-59-g8ed1b


From 8b23ea7bedb8b45a5bb56745fa3ff11018acf04e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:21 -0400
Subject: RPC: Allow struct xdr_stream to read the page section of an xdr_buf

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xdr.h |  1 +
 net/sunrpc/xdr.c           | 28 ++++++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 84c35d42d250..e6d3d349506c 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -194,6 +194,7 @@ extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
 extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p);
 extern uint32_t *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
 extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
+extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
 
 #endif /* __KERNEL__ */
 
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index ca4bfa57e116..49174f0d0a3e 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -568,8 +568,7 @@ EXPORT_SYMBOL(xdr_inline_decode);
  *
  * Moves data beyond the current pointer position from the XDR head[] buffer
  * into the page list. Any data that lies beyond current position + "len"
- * bytes is moved into the XDR tail[]. The current pointer is then
- * repositioned at the beginning of the XDR tail.
+ * bytes is moved into the XDR tail[].
  */
 void xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
 {
@@ -606,6 +605,31 @@ void xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
 }
 EXPORT_SYMBOL(xdr_read_pages);
 
+/**
+ * xdr_enter_page - decode data from the XDR page
+ * @xdr: pointer to xdr_stream struct
+ * @len: number of bytes of page data
+ *
+ * Moves data beyond the current pointer position from the XDR head[] buffer
+ * into the page list. Any data that lies beyond current position + "len"
+ * bytes is moved into the XDR tail[]. The current pointer is then
+ * repositioned at the beginning of the first XDR page.
+ */
+void xdr_enter_page(struct xdr_stream *xdr, unsigned int len)
+{
+	char * kaddr = page_address(xdr->buf->pages[0]);
+	xdr_read_pages(xdr, len);
+	/*
+	 * Position current pointer at beginning of the first page, and
+	 * clamp the decode window so it does not run past that page.
+	 */
+	if (len > PAGE_CACHE_SIZE - xdr->buf->page_base)
+		len = PAGE_CACHE_SIZE - xdr->buf->page_base;
+	xdr->p = (uint32_t *)(kaddr + xdr->buf->page_base);
+	xdr->end = (uint32_t *)((char *)xdr->p + len);
+}
+EXPORT_SYMBOL(xdr_enter_page);
+
 static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
 
 void
-- 
cgit v1.2.3-59-g8ed1b


From 683b57b435326eb512c7305892683b6205669448 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:22 -0400
Subject: NFSv4: Implement the fs_locations function call

NFSv4 allows for the fact that filesystems may be replicated across
several servers or that they may be migrated to a backup server in case of
failure of the primary server.
fs_locations is an NFSv4 operation for retrieving information about the
location of migrated and/or replicated filesystems.

Based on an initial implementation by Jiaying Zhang <jiayingz@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h        |   2 +
 fs/nfs/nfs4proc.c       |  29 +++++++++++++
 fs/nfs/nfs4xdr.c        | 112 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/nfs4.h    |   1 +
 include/linux/nfs_xdr.h |  24 +++++++++++
 5 files changed, 166 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 307832fd1a49..5b7651171215 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -218,6 +218,8 @@ extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
+extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+		struct nfs_fs_locations *fs_locations, struct page *page);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 308407205e6c..768514dc0c4c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3570,6 +3570,35 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
 	return len;
 }
 
+int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+		struct nfs_fs_locations *fs_locations, struct page *page)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	u32 bitmask[2] = {
+		[0] = server->attr_bitmask[0] | FATTR4_WORD0_FS_LOCATIONS,
+		[1] = server->attr_bitmask[1],
+	};
+	struct nfs4_fs_locations_arg args = {
+		.dir_fh = NFS_FH(dir),
+		.name = &dentry->d_name,
+		.page = page,
+		.bitmask = bitmask,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp = &args,
+		.rpc_resp = &fs_locations,
+	};
+	int status;
+
+	dprintk("%s: start\n", __FUNCTION__);
+	fs_locations->fattr.valid = 0;
+	fs_locations->server = server;
+	status = rpc_call_sync(server->client, &msg, 0);
+	dprintk("%s: returned status = %d\n", __FUNCTION__, status);
+	return status;
+}
+
 struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
 	.recover_open	= nfs4_open_reclaim,
 	.recover_lock	= nfs4_lock_reclaim,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0d5794675944..7add3137b6b6 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -411,6 +411,15 @@ static int nfs_stat_to_errno(int);
 #define NFS4_dec_setacl_sz	(compound_decode_hdr_maxsz + \
 				decode_putfh_maxsz + \
 				op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+#define NFS4_enc_fs_locations_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_getattr_maxsz)
+#define NFS4_dec_fs_locations_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_putfh_maxsz + \
+				 op_decode_hdr_maxsz + \
+				 nfs4_fattr_bitmap_maxsz)
 
 static struct {
 	unsigned int	mode;
@@ -2002,6 +2011,38 @@ out:
 	return status;
 }
 
+/*
+ * Encode FS_LOCATIONS request: PUTFH(dir_fh) + LOOKUP(name) + GETATTR(bitmask)
+ */
+static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.nops = 3,
+	};
+	struct rpc_auth *auth = req->rq_task->tk_auth;
+	int replen;
+	int status;
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, &hdr);
+	if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
+		goto out;
+	if ((status = encode_lookup(&xdr, args->name)) != 0)
+		goto out;
+	if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
+		goto out;
+	/* set up reply: toplevel_status + taglen + rescount + OP_PUTFH + status
+	 *   + OP_LOOKUP + status = 7 words. NOTE(review): OP_GETATTR + status are
+	 *   presumably received into args->page (the decoder enters the page first) - confirm
+	 */
+	replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
+	xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
+			0, PAGE_SIZE);
+out:
+	return status;
+}
+
 /*
  * START OF "GENERIC" DECODE ROUTINES.
  *   These may look a little ugly since they are imported from a "generic"
@@ -2036,7 +2077,7 @@ out:
 	} \
 } while (0)
 
-static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string)
+static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
 {
 	uint32_t *p;
 
@@ -2087,7 +2128,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp)
 {
 	uint32_t *p;
-	uint32_t strlen;
+	unsigned int strlen;
 	char *str;
 
 	READ_BUF(12);
@@ -2336,6 +2377,45 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	return status;
 }
 
+static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fs_locations *res)
+{
+	int n;
+	uint32_t *p;
+	int status = -EIO;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
+		goto out;
+	status = 0;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
+		goto out;
+	status = decode_opaque_inline(xdr, &res->fs_pathlen, &res->fs_path);
+	if (unlikely(status != 0))
+		goto out;
+	READ_BUF(4);
+	READ32(n);
+	if (n <= 0)
+		goto out_eio;
+	res->nlocations = 0;
+	while (res->nlocations < n) {
+		struct nfs_fs_location *loc = &res->locations[res->nlocations];
+
+		status = decode_opaque_inline(xdr, &loc->serverlen, &loc->server);
+		if (unlikely(status != 0))
+			goto out_eio;
+		status = decode_opaque_inline(xdr, &loc->rootpathlen, &loc->rootpath);
+		if (unlikely(status != 0))
+			goto out_eio;
+		if (res->nlocations < NFS_FS_LOCATIONS_MAXENTRIES)
+			res->nlocations++;
+	}
+out:
+	dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status);
+	return status;
+out_eio:
+	status = -EIO;
+	goto out;
+}
+
 static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
 	uint32_t *p;
@@ -2867,6 +2947,10 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
 		goto xdr_error;
+	if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
+						struct nfs_fs_locations,
+						fattr))) != 0)
+		goto xdr_error;
 	if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
 		goto xdr_error;
 	fattr->mode |= fmode;
@@ -4210,6 +4294,29 @@ out:
 	return status;
 }
 
+/*
+ * FS_LOCATIONS request
+ */
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs_fs_locations *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status != 0)
+		goto out;
+	if ((status = decode_putfh(&xdr)) != 0)
+		goto out;
+	if ((status = decode_lookup(&xdr)) != 0)
+		goto out;
+	xdr_enter_page(&xdr, PAGE_SIZE);
+	status = decode_getfattr(&xdr, &res->fattr, res->server);
+out:
+	return status;
+}
+
 uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
 {
 	uint32_t bitmap[2] = {0};
@@ -4381,6 +4488,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(DELEGRETURN,	enc_delegreturn, dec_delegreturn),
   PROC(GETACL,		enc_getacl,	dec_getacl),
   PROC(SETACL,		enc_setacl,	dec_setacl),
+  PROC(FS_LOCATIONS,	enc_fs_locations, dec_fs_locations),
 };
 
 struct rpc_version		nfs_version4 = {
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 0c1c306cdaec..1477fc857f6b 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -384,6 +384,7 @@ enum {
 	NFSPROC4_CLNT_DELEGRETURN,
 	NFSPROC4_CLNT_GETACL,
 	NFSPROC4_CLNT_SETACL,
+	NFSPROC4_CLNT_FS_LOCATIONS,
 };
 
 #endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 95682f7d738a..15a20b815302 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -679,6 +679,30 @@ struct nfs4_server_caps_res {
 	u32				has_symlinks;
 };
 
+struct nfs_fs_location {
+	unsigned int serverlen;
+	char * server;
+	unsigned int rootpathlen;
+	char * rootpath;
+};
+
+#define NFS_FS_LOCATIONS_MAXENTRIES 10
+struct nfs_fs_locations {
+	struct nfs_fattr fattr;
+	const struct nfs_server *server;
+	unsigned int fs_pathlen;
+	char * fs_path;
+	int nlocations;
+	struct nfs_fs_location locations[NFS_FS_LOCATIONS_MAXENTRIES];
+};
+
+struct nfs4_fs_locations_arg {
+	const struct nfs_fh *dir_fh;
+	const struct qstr *name;
+	struct page *page;
+	const u32 *bitmask;
+};
+
 #endif /* CONFIG_NFS_V4 */
 
 struct nfs_page;
-- 
cgit v1.2.3-59-g8ed1b


From 7aaa0b3bd4d215d9ce4d62b6c2043a63ba650f93 Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:23 -0400
Subject: NFSv4: convert fs-locations-components to conform to RFC3530

Use component4-style formats for decoding list of servers and pathnames in
fs_locations.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h        |  2 +-
 fs/nfs/nfs4proc.c       |  4 +--
 fs/nfs/nfs4xdr.c        | 80 +++++++++++++++++++++++++++++++++++++++++++------
 include/linux/nfs_xdr.h | 30 ++++++++++++-------
 4 files changed, 94 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 5b7651171215..22a5f838ea58 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -219,7 +219,7 @@ extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct n
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
-		struct nfs_fs_locations *fs_locations, struct page *page);
+		struct nfs4_fs_locations *fs_locations, struct page *page);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 768514dc0c4c..043223a0eda6 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3571,7 +3571,7 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
 }
 
 int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
-		struct nfs_fs_locations *fs_locations, struct page *page)
+		struct nfs4_fs_locations *fs_locations, struct page *page)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	u32 bitmask[2] = {
@@ -3587,7 +3587,7 @@ int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
 		.rpc_argp = &args,
-		.rpc_resp = &fs_locations,
+		.rpc_resp = fs_locations,
 	};
 	int status;
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7add3137b6b6..f6a1ea7df374 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2377,7 +2377,43 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	return status;
 }
 
-static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fs_locations *res)
+/*
+ * Decode a component count followed by that many opaque components into @path.
+ * Counts outside 1..NFS4_PATHNAME_MAXCOMPONENTS and component decode errors return -EIO.
+ */
+static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
+{
+	int n;
+	uint32_t *p;
+	int status = 0;
+
+	READ_BUF(4);
+	READ32(n);
+	if (n <= 0)
+		goto out_eio;
+	dprintk("path ");
+	path->ncomponents = 0;
+	/* Reject oversized paths before the loop so components[] is never indexed past its end */
+	if (n > NFS4_PATHNAME_MAXCOMPONENTS) {
+		dprintk("cannot parse %d components in path\n", n);
+		goto out_eio;
+	}
+	while (path->ncomponents < n) {
+		struct nfs4_string *component = &path->components[path->ncomponents];
+		status = decode_opaque_inline(xdr, &component->len, &component->data);
+		if (unlikely(status != 0))
+			goto out_eio;
+		dprintk("/%s", component->data);
+		path->ncomponents++;
+	}
+out:
+	dprintk("\n");
+	return status;
+out_eio:
+	dprintk(" status %d", status);
+	status = -EIO;
+	goto out;
+}
+
+static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
 {
 	int n;
 	uint32_t *p;
@@ -2388,7 +2424,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	status = 0;
 	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
 		goto out;
-	status = decode_opaque_inline(xdr, &res->fs_pathlen, &res->fs_path);
+	dprintk("%s: fsroot ", __FUNCTION__);
+	status = decode_pathname(xdr, &res->fs_path);
 	if (unlikely(status != 0))
 		goto out;
 	READ_BUF(4);
@@ -2397,15 +2434,40 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		goto out_eio;
 	res->nlocations = 0;
 	while (res->nlocations < n) {
-		struct nfs_fs_location *loc = &res->locations[res->nlocations];
+		int m;
+		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
-		status = decode_opaque_inline(xdr, &loc->serverlen, &loc->server);
-		if (unlikely(status != 0))
+		READ_BUF(4);
+		READ32(m);
+		if (m <= 0)
 			goto out_eio;
-		status = decode_opaque_inline(xdr, &loc->rootpathlen, &loc->rootpath);
+
+		loc->nservers = 0;
+		dprintk("%s: servers ", __FUNCTION__);
+		while (loc->nservers < m) {
+			struct nfs4_string *server = &loc->servers[loc->nservers];
+			status = decode_opaque_inline(xdr, &server->len, &server->data);
+			if (unlikely(status != 0))
+				goto out_eio;
+			dprintk("%s ", server->data);
+			if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS)
+				loc->nservers++;
+			else {
+				int i;
+				dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations);
+				for (i = loc->nservers; i < m; i++) {
+					int len;
+					char *data;
+					status = decode_opaque_inline(xdr, &len, &data);
+					if (unlikely(status != 0))
+						goto out_eio;
+				}
+			}
+		}
+		status = decode_pathname(xdr, &loc->rootpath);
 		if (unlikely(status != 0))
 			goto out_eio;
-		if (res->nlocations < NFS_FS_LOCATIONS_MAXENTRIES)
+		if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
 			res->nlocations++;
 	}
 out:
@@ -2948,7 +3010,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
-						struct nfs_fs_locations,
+						struct nfs4_fs_locations,
 						fattr))) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
@@ -4297,7 +4359,7 @@ out:
 /*
  * FS_LOCATIONS request
  */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs_fs_locations *res)
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 15a20b815302..d6eea8348728 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -679,21 +679,31 @@ struct nfs4_server_caps_res {
 	u32				has_symlinks;
 };
 
-struct nfs_fs_location {
-	unsigned int serverlen;
-	char * server;
-	unsigned int rootpathlen;
-	char * rootpath;
+struct nfs4_string {
+	unsigned int len;
+	char *data;
 };
 
-#define NFS_FS_LOCATIONS_MAXENTRIES 10
-struct nfs_fs_locations {
+#define NFS4_PATHNAME_MAXCOMPONENTS 512
+struct nfs4_pathname {
+	unsigned int ncomponents;
+	struct nfs4_string components[NFS4_PATHNAME_MAXCOMPONENTS];
+};
+
+#define NFS4_FS_LOCATION_MAXSERVERS 10
+struct nfs4_fs_location {
+	unsigned int nservers;
+	struct nfs4_string servers[NFS4_FS_LOCATION_MAXSERVERS];
+	struct nfs4_pathname rootpath;
+};
+
+#define NFS4_FS_LOCATIONS_MAXENTRIES 10
+struct nfs4_fs_locations {
 	struct nfs_fattr fattr;
 	const struct nfs_server *server;
-	unsigned int fs_pathlen;
-	char * fs_path;
+	struct nfs4_pathname fs_path;
 	int nlocations;
-	struct nfs_fs_location locations[NFS_FS_LOCATIONS_MAXENTRIES];
+	struct nfs4_fs_location locations[NFS4_FS_LOCATIONS_MAXENTRIES];
 };
 
 struct nfs4_fs_locations_arg {
-- 
cgit v1.2.3-59-g8ed1b


From 9cdb3883c38f883436a84c2353a4cf964ff890a2 Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:28 -0400
Subject: NFSv4: Ensure client submounts when following a referral

Set up mountpoint when hitting a referral on moved error by getting
fs_locations.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         | 270 ++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/nfs_fs.h |   2 +
 2 files changed, 269 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ebdab885c475..0d8302e59d69 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,8 @@
 #include <linux/mount.h>
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -1714,6 +1716,10 @@ struct nfs_clone_mount {
 	const struct dentry *dentry;
 	struct nfs_fh *fh;
 	struct nfs_fattr *fattr;
+	char *hostname;
+	char *mnt_path;
+	struct sockaddr_in *addr;
+	rpc_authflavor_t authflavor;
 };
 
 static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
@@ -1724,17 +1730,19 @@ static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
 	struct nfs_server *parent = NFS_SB(data->sb);
 	struct super_block *sb = ERR_PTR(-EINVAL);
 	void *err = ERR_PTR(-ENOMEM);
+	char *hostname;
 	int len;
 
 	server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
 	if (server == NULL)
 		goto out_err;
 	memcpy(server, parent, sizeof(*server));
-	len = strlen(parent->hostname) + 1;
+	hostname = (data->hostname != NULL) ? data->hostname : parent->hostname;
+	len = strlen(hostname) + 1;
 	server->hostname = kmalloc(len, GFP_KERNEL);
 	if (server->hostname == NULL)
 		goto free_server;
-	memcpy(server->hostname, parent->hostname, len);
+	memcpy(server->hostname, hostname, len);
 	if (rpciod_up() != 0)
 		goto free_hostname;
 
@@ -2458,7 +2466,8 @@ static inline void unregister_nfs4fs(void)
 	nfs_unregister_sysctl();
 }
 #else
-#define nfs4_clone_client(a,b) ERR_PTR(-EINVAL)
+#define nfs4_fill_sb(a,b)	ERR_PTR(-EINVAL)
+#define nfs4_fill_super(a,b)	ERR_PTR(-EINVAL)
 #define nfs4_init_once(nfsi) \
 	do { } while (0)
 #define register_nfs4fs() (0)
@@ -2521,6 +2530,261 @@ out:
 	return mnt;
 }
 
+/* Render an nfs4_pathname as a '/'-separated string, built backwards from the end of buffer */
+static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname, char *buffer, ssize_t buflen)
+{
+	char *end = buffer + buflen;
+	int n;
+
+	*--end = '\0';
+	buflen--;
+
+	n = pathname->ncomponents;
+	while (--n >= 0) {
+		struct nfs4_string *component = &pathname->components[n];
+		buflen -= component->len + 1;
+		if (buflen < 0)
+			goto Elong;
+		end -= component->len;
+		memcpy(end, component->data, component->len);
+		*--end = '/';
+	}
+	return end;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+/* Check if the string represents a "valid" IPv4 address */
+static inline int valid_ipaddr4(const char *buf)
+{
+	int rc, count, in[4];
+
+	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
+	if (rc != 4)
+		return -EINVAL;
+	for (count = 0; count < 4; count++) {
+		if (in[count] < 0 || in[count] > 255)	/* %d also parses negatives; octet must be 0..255 */
+			return -EINVAL;
+	}
+	return 0;
+}
+
+static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	struct super_block *sb = ERR_PTR(-ENOMEM);
+	int len;
+
+	len = strlen(data->mnt_path) + 1;
+	server->mnt_path = kmalloc(len, GFP_KERNEL);
+	if (server->mnt_path == NULL)
+		goto err;
+	memcpy(server->mnt_path, data->mnt_path, len);
+	memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in));
+
+	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+	if (IS_ERR(sb) || sb->s_root)
+		goto free_path;
+	return sb;
+free_path:
+	kfree(server->mnt_path);
+err:
+	server->mnt_path = NULL;
+	return sb;
+}
+
+static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct rpc_timeout timeparms;
+	int proto, timeo, retrans;
+	void *err;
+
+	proto = IPPROTO_TCP;
+	/* Since we are following a referral and there may be alternatives,
+	   set the timeouts and retries to low values */
+	timeo = 2;
+	retrans = 1;
+	nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
+
+	server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor);
+	if (IS_ERR((err = server->client)))
+		goto out_err;
+
+	sb->s_time_gran = 1;
+	sb->s_op = &nfs4_sops;
+	err = ERR_PTR(nfs_sb_init(sb, data->authflavor));
+	if (!IS_ERR(err))
+		return server;
+out_err:
+	return (struct nfs_server *)err;
+}
+
+static struct super_block *nfs_referral_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs4_referral_sb, nfs4_referral_server);
+}
+
+static struct file_system_type nfs_referral_nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs_referral_nfs4_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+/**
+ * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - dentry of referral
+ * @locations - fs_locations data returned for the referral
+ *
+ * Tries each server of each location in turn until one mount succeeds.
+ * Returns the new vfsmount, or an ERR_PTR() if no location could be mounted.
+ *
+ */
+struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
+	     const struct dentry *dentry, struct nfs4_fs_locations *locations)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct nfs_clone_mount mountdata = {
+		.sb = mnt_parent->mnt_sb,
+		.dentry = dentry,
+		.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
+	};
+	char *page, *page2;
+	char *path, *fs_path;
+	char *devname;
+	int loc, s;
+
+	if (locations == NULL || locations->nlocations <= 0)
+		goto out;
+
+	dprintk("%s: referral at %s/%s\n", __FUNCTION__,
+		dentry->d_parent->d_name.name, dentry->d_name.name);
+
+	/* Ensure fs path is a prefix of current dentry path */
+	page = (char *) __get_free_page(GFP_USER);
+	if (page == NULL)
+		goto out;
+	page2 = (char *) __get_free_page(GFP_USER);
+	if (page2 == NULL)
+		goto out_free;	/* release page; page2 is NULL so free_page() on it is a no-op */
+
+	path = nfs4_path(dentry, page, PAGE_SIZE);
+	if (IS_ERR(path))
+		goto out_free;
+
+	fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
+	if (IS_ERR(fs_path))
+		goto out_free;
+
+	if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
+		dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path);
+		goto out_free;
+	}
+
+	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+	if (IS_ERR(devname)) {
+		mnt = (struct vfsmount *)devname;
+		goto out_free;
+	}
+
+	loc = 0;
+	while (loc < locations->nlocations && IS_ERR(mnt)) {
+		struct nfs4_fs_location *location = &locations->locations[loc];
+		char *mnt_path;
+
+		if (location == NULL || location->nservers <= 0 ||
+		    location->rootpath.ncomponents == 0) {
+			loc++;
+			continue;
+		}
+
+		mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
+		if (IS_ERR(mnt_path)) {
+			loc++;
+			continue;
+		}
+		mountdata.mnt_path = mnt_path;
+
+		s = 0;
+		while (s < location->nservers) {
+			struct sockaddr_in addr = {};
+
+			if (location->servers[s].len <= 0 ||
+			    valid_ipaddr4(location->servers[s].data) < 0) {
+				s++;
+				continue;
+			}
+
+			mountdata.hostname = location->servers[s].data;
+			addr.sin_addr.s_addr = in_aton(mountdata.hostname);
+			addr.sin_family = AF_INET;
+			addr.sin_port = htons(NFS_PORT);
+			mountdata.addr = &addr;
+
+			mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata);
+			if (!IS_ERR(mnt)) {
+				break;
+			}
+			s++;
+		}
+		loc++;
+	}
+
+out_free:
+	free_page((unsigned long)page);
+	free_page((unsigned long)page2);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
+
+/*
+ * nfs_do_refmount - handle crossing a referral on server
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - dentry of referral
+ *
+ */
+struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct dentry *parent;
+	struct nfs4_fs_locations *fs_locations = NULL;
+	struct page *page;
+	int err;
+
+	/* BUG_ON(IS_ROOT(dentry)); */
+	dprintk("%s: enter\n", __FUNCTION__);
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out;
+
+	fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (fs_locations == NULL)
+		goto out_free;
+
+	/* Get locations */
+	parent = dget_parent(dentry);
+	dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name);
+	err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page);
+	dput(parent);
+	if (err != 0 || fs_locations->nlocations <= 0 ||
+	    fs_locations->fs_path.ncomponents <= 0)
+		goto out_free;
+
+	mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations);
+out_free:
+	__free_page(page);
+	kfree(fs_locations);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
+
 extern int nfs_init_nfspagecache(void);
 extern void nfs_destroy_nfspagecache(void);
 extern int nfs_init_readpagecache(void);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a34b3ee443f1..09271b10f9a8 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -317,6 +317,8 @@ extern struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
 					const struct dentry *dentry,
 					struct nfs_fh *fh,
 					struct nfs_fattr *fattr);
+extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent,
+					struct dentry *dentry);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern u32 root_nfs_parse_addr(char *name); /*__init*/
-- 
cgit v1.2.3-59-g8ed1b


From 6b97fd3da1eab2cc490cfe884c7d4956522eaf8b Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:29 -0400
Subject: NFSv4: Follow a referral

Respond to a moved error on NFS lookup by setting up the referral.
Note: We don't actually follow the referral during lookup/getattr, but
later when we detect fsid mismatch in inode revalidation (similar to the
processing done for cloning submounts). Referrals will have fake attributes
until they are actually followed or traversed.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c          |  5 ++++-
 fs/nfs/namespace.c      |  9 ++++++++-
 fs/nfs/nfs4proc.c       | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs_fs.h  |  1 +
 include/linux/nfs_xdr.h |  1 +
 5 files changed, 60 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0d8302e59d69..ee13cb01b56e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -888,7 +888,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
 			/* Deal with crossing mountpoints */
 			if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
-				inode->i_op = &nfs_mountpoint_inode_operations;
+				if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+					inode->i_op = &nfs_referral_inode_operations;
+				else
+					inode->i_op = &nfs_mountpoint_inode_operations;
 				inode->i_fop = NULL;
 			}
 		} else if (S_ISLNK(inode->i_mode))
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index e426516c1116..8ca44b7b25c3 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -58,7 +58,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	if (err != 0)
 		goto out_err;
 
-	mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
+	if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL)
+		mnt = nfs_do_refmount(nd->mnt, nd->dentry);
+	else
+		mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
 	err = PTR_ERR(mnt);
 	if (IS_ERR(mnt))
 		goto out_err;
@@ -94,6 +97,10 @@ struct inode_operations nfs_mountpoint_inode_operations = {
 	.getattr	= nfs_getattr,
 };
 
+struct inode_operations nfs_referral_inode_operations = {
+	.follow_link	= nfs_follow_mountpoint,
+};
+
 static void nfs_expire_automounts(void *data)
 {
 	struct list_head *list = (struct list_head *)data;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 90ee21a07b3e..3300e35d74ad 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1462,6 +1462,50 @@ out:
 	return nfs4_map_errors(status);
 }
 
+/*
+ * Get locations and (maybe) other attributes of a referral.
+ * Note that we'll actually follow the referral later when
+ * we detect fsid mismatch in inode revalidation
+ */
+static int nfs4_get_referral(struct inode *dir, struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle)
+{
+	int status = -ENOMEM;
+	struct page *page = NULL;
+	struct nfs4_fs_locations *locations = NULL;
+	struct dentry dentry = {};
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out;
+	locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (locations == NULL)
+		goto out;
+
+	dentry.d_name.name = name->name;
+	dentry.d_name.len = name->len;
+	status = nfs4_proc_fs_locations(dir, &dentry, locations, page);
+	if (status != 0)
+		goto out;
+	/* Make sure server returned a different fsid for the referral */
+	if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
+		dprintk("%s: server did not return a different fsid for a referral at %s\n", __FUNCTION__, name->name);
+		status = -EIO;
+		goto out;
+	}
+
+	memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr));
+	fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL;
+	if (!fattr->mode)
+		fattr->mode = S_IFDIR;
+	memset(fhandle, 0, sizeof(struct nfs_fh));
+out:
+	if (page)
+		__free_page(page);
+	if (locations)
+		kfree(locations);
+	return status;
+}
+
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	struct nfs4_getattr_arg args = {
@@ -1566,6 +1610,8 @@ static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
 	
 	dprintk("NFS call  lookup %s\n", name->name);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	if (status == -NFS4ERR_MOVED)
+		status = nfs4_get_referral(dir, name, fattr, fhandle);
 	dprintk("NFS reply lookup: %d\n", status);
 	return status;
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 09271b10f9a8..152798949113 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -409,6 +409,7 @@ extern void nfs_unregister_sysctl(void);
  */
 extern struct list_head nfs_automount_list;
 extern struct inode_operations nfs_mountpoint_inode_operations;
+extern struct inode_operations nfs_referral_inode_operations;
 extern int nfs_mountpoint_expiry_timeout;
 extern void nfs_release_automount_timer(void);
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index d6eea8348728..7c7320fa51aa 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -63,6 +63,7 @@ struct nfs_fattr {
 #define NFS_ATTR_FATTR		0x0002		/* post-op attributes */
 #define NFS_ATTR_FATTR_V3	0x0004		/* NFSv3 attributes */
 #define NFS_ATTR_FATTR_V4	0x0008		/* NFSv4 change attribute */
+#define NFS_ATTR_FATTR_V4_REFERRAL	0x0010		/* NFSv4 referral */
 
 /*
  * Info on the file system
-- 
cgit v1.2.3-59-g8ed1b


From 3134cbec5e172c3a86e2c3ef4af34b6cfd380bfa Mon Sep 17 00:00:00 2001
From: Marc Eshel <eshel@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:40:20 -0400
Subject: locks.c: add the fl_owner to nlm_compare_locks

Add the fl_owner to NLM compare locks. Since two different clients can
present the same pid to the server, the pid alone is not enough to
distinguish locks from different clients. The fl_owner field is a pointer
to the struct nlm_host, which is unique for each client.

Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/lockd/lockd.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 995f89dc8c04..112936fcda80 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -220,6 +220,7 @@ static __inline__ int
 nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2)
 {
 	return	fl1->fl_pid   == fl2->fl_pid
+	     && fl1->fl_owner == fl2->fl_owner
 	     && fl1->fl_start == fl2->fl_start
 	     && fl1->fl_end   == fl2->fl_end
 	     &&(fl1->fl_type  == fl2->fl_type || fl2->fl_type == F_UNLCK);
-- 
cgit v1.2.3-59-g8ed1b


From 5046791417dcac1ba126b77b8062af15a2f0b8e1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:40:24 -0400
Subject: NLM: sem to mutex conversion

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c             | 8 ++++----
 include/linux/lockd/lockd.h | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 729ac427d359..5242743c9403 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -112,7 +112,7 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	host->h_version    = version;
 	host->h_proto      = proto;
 	host->h_rpcclnt    = NULL;
-	init_MUTEX(&host->h_sema);
+	mutex_init(&host->h_mutex);
 	host->h_nextrebind = jiffies + NLM_HOST_REBIND;
 	host->h_expires    = jiffies + NLM_HOST_EXPIRE;
 	atomic_set(&host->h_count, 1);
@@ -172,7 +172,7 @@ nlm_bind_host(struct nlm_host *host)
 			(unsigned)ntohl(host->h_addr.sin_addr.s_addr));
 
 	/* Lock host handle */
-	down(&host->h_sema);
+	mutex_lock(&host->h_mutex);
 
 	/* If we've already created an RPC client, check whether
 	 * RPC rebind is required
@@ -204,12 +204,12 @@ nlm_bind_host(struct nlm_host *host)
 		host->h_rpcclnt = clnt;
 	}
 
-	up(&host->h_sema);
+	mutex_unlock(&host->h_mutex);
 	return clnt;
 
 forgetit:
 	printk("lockd: couldn't create RPC handle for %s\n", host->h_name);
-	up(&host->h_sema);
+	mutex_unlock(&host->h_mutex);
 	return NULL;
 }
 
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 112936fcda80..a6c1a33e5ae3 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -54,7 +54,7 @@ struct nlm_host {
 	u32			h_nsmstate;	/* true remote NSM state */
 	u32			h_pidcount;	/* Pseudopids */
 	atomic_t		h_count;	/* reference count */
-	struct semaphore	h_sema;		/* mutex for pmap binding */
+	struct mutex		h_mutex;	/* mutex for pmap binding */
 	unsigned long		h_nextrebind;	/* next portmap call */
 	unsigned long		h_expires;	/* eligible for GC */
 	struct list_head	h_lockowners;	/* Lockowners for the client */
-- 
cgit v1.2.3-59-g8ed1b


From 28df955a2ad484d602314b30183ea8496a9aa34a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:40:27 -0400
Subject: NLM: Fix reclaim races

Currently it is possible for a task to remove its locks at the same time as
the NLM recovery thread is trying to recover them. This quickly leads to an
Oops.
Protect the locks using an rw semaphore while they are being recovered.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntlock.c         | 39 +++++++++++++++++++++++++--------------
 fs/lockd/clntproc.c         | 14 +++++++++++++-
 fs/lockd/host.c             |  1 +
 include/linux/lockd/lockd.h |  1 +
 4 files changed, 40 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index bce744468708..52774feab93f 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -147,11 +147,10 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
  * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
  * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
  */
-static inline
-void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
+static void nlmclnt_prepare_reclaim(struct nlm_host *host)
 {
+	down_write(&host->h_rwsem);
 	host->h_monitored = 0;
-	host->h_nsmstate = newstate;
 	host->h_state++;
 	host->h_nextrebind = 0;
 	nlm_rebind_host(host);
@@ -164,6 +163,13 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
 	dprintk("NLM: reclaiming locks for host %s", host->h_name);
 }
 
+static void nlmclnt_finish_reclaim(struct nlm_host *host)
+{
+	host->h_reclaiming = 0;
+	up_write(&host->h_rwsem);
+	dprintk("NLM: done reclaiming locks for host %s", host->h_name);
+}
+
 /*
  * Reclaim all locks on server host. We do this by spawning a separate
  * reclaimer thread.
@@ -171,12 +177,10 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
 void
 nlmclnt_recovery(struct nlm_host *host, u32 newstate)
 {
-	if (host->h_reclaiming++) {
-		if (host->h_nsmstate == newstate)
-			return;
-		nlmclnt_prepare_reclaim(host, newstate);
-	} else {
-		nlmclnt_prepare_reclaim(host, newstate);
+	if (host->h_nsmstate == newstate)
+		return;
+	host->h_nsmstate = newstate;
+	if (!host->h_reclaiming++) {
 		nlm_get_host(host);
 		__module_get(THIS_MODULE);
 		if (kernel_thread(reclaimer, host, CLONE_KERNEL) < 0)
@@ -190,6 +194,7 @@ reclaimer(void *ptr)
 	struct nlm_host	  *host = (struct nlm_host *) ptr;
 	struct nlm_wait	  *block;
 	struct file_lock *fl, *next;
+	u32 nsmstate;
 
 	daemonize("%s-reclaim", host->h_name);
 	allow_signal(SIGKILL);
@@ -199,19 +204,25 @@ reclaimer(void *ptr)
 	lock_kernel();
 	lockd_up();
 
+	nlmclnt_prepare_reclaim(host);
 	/* First, reclaim all locks that have been marked. */
 restart:
+	nsmstate = host->h_nsmstate;
 	list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
 		list_del_init(&fl->fl_u.nfs_fl.list);
 
 		if (signalled())
 			continue;
-		if (nlmclnt_reclaim(host, fl) == 0)
-			list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
-		goto restart;
+		if (nlmclnt_reclaim(host, fl) != 0)
+			continue;
+		list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
+		if (host->h_nsmstate != nsmstate) {
+			/* Argh! The server rebooted again! */
+			list_splice_init(&host->h_granted, &host->h_reclaim);
+			goto restart;
+		}
 	}
-
-	host->h_reclaiming = 0;
+	nlmclnt_finish_reclaim(host);
 
 	/* Now, wake up all processes that sleep on a blocked lock */
 	list_for_each_entry(block, &nlm_blocked, b_list) {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index f96e38155b5c..4db62098d3f4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -508,7 +508,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	}
 
 	block = nlmclnt_prepare_block(host, fl);
+again:
 	for(;;) {
+		/* Reboot protection */
+		fl->fl_u.nfs_fl.state = host->h_state;
 		status = nlmclnt_call(req, NLMPROC_LOCK);
 		if (status < 0)
 			goto out_unblock;
@@ -531,10 +534,16 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	}
 
 	if (resp->status == NLM_LCK_GRANTED) {
-		fl->fl_u.nfs_fl.state = host->h_state;
+		down_read(&host->h_rwsem);
+		/* Check whether or not the server has rebooted */
+		if (fl->fl_u.nfs_fl.state != host->h_state) {
+			up_read(&host->h_rwsem);
+			goto again;
+		}
 		fl->fl_flags |= FL_SLEEP;
 		/* Ensure the resulting lock will get added to granted list */
 		do_vfs_lock(fl);
+		up_read(&host->h_rwsem);
 	}
 	status = nlm_stat_to_errno(resp->status);
 out_unblock:
@@ -596,6 +605,7 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl)
 static int
 nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 {
+	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
 	int		status;
 
@@ -604,7 +614,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
 	 * case, we want to unlock.
 	 */
+	down_read(&host->h_rwsem);
 	do_vfs_lock(fl);
+	up_read(&host->h_rwsem);
 
 	if (req->a_flags & RPC_TASK_ASYNC)
 		return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 5242743c9403..38b0e8a1aec0 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -117,6 +117,7 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	host->h_expires    = jiffies + NLM_HOST_EXPIRE;
 	atomic_set(&host->h_count, 1);
 	init_waitqueue_head(&host->h_gracewait);
+	init_rwsem(&host->h_rwsem);
 	host->h_state      = 0;			/* pseudo NSM state */
 	host->h_nsmstate   = 0;			/* real NSM state */
 	host->h_server	   = server;
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index a6c1a33e5ae3..6b2684763fc7 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -50,6 +50,7 @@ struct nlm_host {
 				h_killed     : 1,
 				h_monitored  : 1;
 	wait_queue_head_t	h_gracewait;	/* wait while reclaiming */
+	struct rw_semaphore	h_rwsem;	/* Reboot recovery lock */
 	u32			h_state;	/* pseudo-state counter */
 	u32			h_nsmstate;	/* true remote NSM state */
 	u32			h_pidcount;	/* Pseudopids */
-- 
cgit v1.2.3-59-g8ed1b


From 06cf6f2ed0b19629700794727d86ed57b9c0583e Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 20 Jun 2006 12:56:49 -0400
Subject: NFS: Eliminate nfs_get_user_pages()

Neil Brown observed that the kmalloc() in nfs_get_user_pages() is more
likely to fail if the I/O is large enough to require the allocation of more
than a single page to keep track of all the pinned pages in the user's
buffer.

Instead of tracking one large page array per dreq/iocb, track pages per
nfs_read/write_data, just like the cached I/O path does.  An array for
pages is already allocated for us by nfs_readdata_alloc() (and the write
and commit equivalents).

This is also required for adding support for vectored I/O to the NFS direct
I/O path.

The original reason to pin the user buffer and allocate all the NFS data
structures before trying to schedule I/O was to ensure all needed resources
are allocated on the client before starting to send requests.  This reduces
the chance that resource exhaustion on the client will cause a short read
or write.

On the other hand, for an application making very large application I/O
requests, this means that it will be nearly impossible for the application
to make forward progress on a resource-limited client.

Thus, moving the buffer pinning functionality into the I/O scheduling
loops should be good for scalability.  The next patch will do the same for
NFS data structure allocation.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c         | 205 ++++++++++++++++++++++++++----------------------
 include/linux/nfs_xdr.h |   2 +
 2 files changed, 113 insertions(+), 94 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4cb3446220ba..b1630d53fbb1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -73,8 +73,6 @@ struct nfs_direct_req {
 	struct nfs_open_context	*ctx;		/* file open context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
-	struct page **		pages;		/* pages in our buffer */
-	unsigned int		npages;		/* count of pages */
 
 	/* completion state */
 	atomic_t		io_count;	/* i/os we're waiting for */
@@ -104,6 +102,20 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
 	return atomic_dec_and_test(&dreq->io_count);
 }
 
+/*
+ * "size" is never larger than rsize or wsize.
+ */
+static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size)
+{
+	int page_count;
+
+	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	page_count -= user_addr >> PAGE_SHIFT;
+	BUG_ON(page_count < 0);
+
+	return page_count;
+}
+
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * @rw: direction (read or write)
@@ -143,40 +155,6 @@ static void nfs_direct_release_pages(struct page **pages, int npages)
 		page_cache_release(pages[i]);
 }
 
-static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
-{
-	int result = -ENOMEM;
-	unsigned long page_count;
-	size_t array_size;
-
-	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	page_count -= user_addr >> PAGE_SHIFT;
-
-	array_size = (page_count * sizeof(struct page *));
-	*pages = kmalloc(array_size, GFP_KERNEL);
-	if (*pages) {
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
-					page_count, (rw == READ), 0,
-					*pages, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (result != page_count) {
-			/*
-			 * If we got fewer pages than expected from
-			 * get_user_pages(), the user buffer runs off the
-			 * end of a mapping; return EFAULT.
-			 */
-			if (result >= 0) {
-				nfs_direct_release_pages(*pages, result);
-				result = -EFAULT;
-			} else
-				kfree(*pages);
-			*pages = NULL;
-		}
-	}
-	return result;
-}
-
 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 {
 	struct nfs_direct_req *dreq;
@@ -233,13 +211,8 @@ out:
 }
 
 /*
- * We must hold a reference to all the pages in this direct read request
- * until the RPCs complete.  This could be long *after* we are woken up in
- * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
- *
- * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
- * can't trust the iocb is still valid here if this is a synchronous
- * request.  If the waiter is woken prematurely, the iocb is long gone.
+ * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
+ * the iocb is still valid here if this is a synchronous request.
  */
 static void nfs_direct_complete(struct nfs_direct_req *dreq)
 {
@@ -297,6 +270,11 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
 	return dreq;
 }
 
+/*
+ * We must hold a reference to all the pages in this direct read request
+ * until the RPCs complete.  This could be long *after* we are woken up in
+ * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
+ */
 static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
@@ -305,6 +283,9 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 
+	nfs_direct_dirty_pages(data->pagevec, data->npages);
+	nfs_direct_release_pages(data->pagevec, data->npages);
+
 	spin_lock(&dreq->lock);
 
 	if (likely(task->tk_status >= 0))
@@ -314,11 +295,8 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 
 	spin_unlock(&dreq->lock);
 
-	if (put_dreq(dreq)) {
-		nfs_direct_dirty_pages(dreq->pages, dreq->npages);
-		nfs_direct_release_pages(dreq->pages, dreq->npages);
+	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
-	}
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -328,21 +306,23 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 
 /*
  * For each nfs_read_data struct that was allocated on the list, dispatch
- * an NFS READ operation
+ * an NFS READ operation.  If get_user_pages() fails, we stop sending reads.
+ * Read length accounting is handled by nfs_direct_read_result().
+ * Otherwise, if no requests have been sent, just return an error.
  */
-static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	struct list_head *list = &dreq->list;
-	struct page **pages = dreq->pages;
 	size_t rsize = NFS_SERVER(inode)->rsize;
-	unsigned int curpage, pgbase;
+	unsigned int pgbase;
+	int result;
+	ssize_t started = 0;
+	struct nfs_read_data *data;
 
-	curpage = 0;
 	pgbase = user_addr & ~PAGE_MASK;
 	do {
-		struct nfs_read_data *data;
 		size_t bytes;
 
 		bytes = rsize;
@@ -353,13 +333,21 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
 		data = list_entry(list->next, struct nfs_read_data, pages);
 		list_del_init(&data->pages);
 
+		data->npages = nfs_direct_count_pages(user_addr, bytes);
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					data->npages, 1, 0, data->pagevec, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (unlikely(result < data->npages))
+			goto out_err;
+
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = &pages[curpage];
+		data->args.pages = data->pagevec;
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.eof = 0;
@@ -382,17 +370,36 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
 				bytes,
 				(unsigned long long)data->args.offset);
 
+		started += bytes;
+		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
-		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
 	BUG_ON(!list_empty(list));
+	return 0;
+
+out_err:
+	if (result > 0)
+		nfs_direct_release_pages(data->pagevec, result);
+
+	list_add(&data->pages, list);
+	while (!list_empty(list)) {
+		data = list_entry(list->next, struct nfs_read_data, pages);
+		list_del(&data->pages);
+		nfs_readdata_free(data);
+		if (put_dreq(dreq))
+			nfs_direct_complete(dreq);
+	}
+
+	if (started)
+		return 0;
+	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
-static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
+static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
 	ssize_t result;
 	sigset_t oldset;
@@ -404,8 +411,6 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 	if (!dreq)
 		return -ENOMEM;
 
-	dreq->pages = pages;
-	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -413,8 +418,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_read_schedule(dreq, user_addr, count, pos);
-	result = nfs_direct_wait(dreq);
+	result = nfs_direct_read_schedule(dreq, user_addr, count, pos);
+	if (!result)
+		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -426,9 +432,9 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 	while (!list_empty(&dreq->list)) {
 		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
+		nfs_direct_release_pages(data->pagevec, data->npages);
 		nfs_writedata_release(data);
 	}
-	nfs_direct_release_pages(dreq->pages, dreq->npages);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -672,21 +678,23 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
 
 /*
  * For each nfs_write_data struct that was allocated on the list, dispatch
- * an NFS WRITE operation
+ * an NFS WRITE operation.  If get_user_pages() fails, we stop sending writes.
+ * Write length accounting is handled by nfs_direct_write_result().
+ * Otherwise, if no requests have been sent, just return an error.
  */
-static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
+static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	struct list_head *list = &dreq->list;
-	struct page **pages = dreq->pages;
 	size_t wsize = NFS_SERVER(inode)->wsize;
-	unsigned int curpage, pgbase;
+	unsigned int pgbase;
+	int result;
+	ssize_t started = 0;
+	struct nfs_write_data *data;
 
-	curpage = 0;
 	pgbase = user_addr & ~PAGE_MASK;
 	do {
-		struct nfs_write_data *data;
 		size_t bytes;
 
 		bytes = wsize;
@@ -695,6 +703,15 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 
 		BUG_ON(list_empty(list));
 		data = list_entry(list->next, struct nfs_write_data, pages);
+
+		data->npages = nfs_direct_count_pages(user_addr, bytes);
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					data->npages, 0, 0, data->pagevec, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (unlikely(result < data->npages))
+			goto out_err;
+
 		list_move_tail(&data->pages, &dreq->rewrite_list);
 
 		data->inode = inode;
@@ -703,7 +720,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = &pages[curpage];
+		data->args.pages = data->pagevec;
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.count = bytes;
@@ -727,17 +744,36 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 				bytes,
 				(unsigned long long)data->args.offset);
 
+		started += bytes;
+		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
-		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
 	BUG_ON(!list_empty(list));
+	return 0;
+
+out_err:
+	if (result > 0)
+		nfs_direct_release_pages(data->pagevec, result);
+
+	list_add(&data->pages, list);
+	while (!list_empty(list)) {
+		data = list_entry(list->next, struct nfs_write_data, pages);
+		list_del(&data->pages);
+		nfs_writedata_free(data);
+		if (put_dreq(dreq))
+			nfs_direct_write_complete(dreq, inode);
+	}
+
+	if (started)
+		return 0;
+	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
-static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
+static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
 	ssize_t result;
 	sigset_t oldset;
@@ -753,8 +789,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	if (dreq->commit_data == NULL || count < wsize)
 		sync = FLUSH_STABLE;
 
-	dreq->pages = pages;
-	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -765,8 +799,9 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	nfs_begin_data_update(inode);
 
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
-	result = nfs_direct_wait(dreq);
+	result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
+	if (!result)
+		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -796,8 +831,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval = -EINVAL;
-	int page_count;
-	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -819,14 +852,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count,
 	if (retval)
 		goto out;
 
-	retval = nfs_get_user_pages(READ, (unsigned long) buf,
-						count, &pages);
-	if (retval < 0)
-		goto out;
-	page_count = retval;
-
-	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
-						pages, page_count);
+	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos);
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
 
@@ -862,8 +888,6 @@ out:
 ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval;
-	int page_count;
-	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -891,14 +915,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t
 	if (retval)
 		goto out;
 
-	retval = nfs_get_user_pages(WRITE, (unsigned long) buf,
-						count, &pages);
-	if (retval < 0)
-		goto out;
-	page_count = retval;
-
-	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
-					pos, pages, page_count);
+	retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
 
 	/*
 	 * XXX: nfs_end_data_update() already ensures this file's
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7c7320fa51aa..2d3fb6416d91 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -729,6 +729,7 @@ struct nfs_read_data {
 	struct list_head	pages;	/* Coalesced read requests */
 	struct nfs_page		*req;	/* multi ops per nfs_page */
 	struct page		**pagevec;
+	unsigned int		npages;	/* active pages in pagevec */
 	struct nfs_readargs args;
 	struct nfs_readres  res;
 #ifdef CONFIG_NFS_V4
@@ -747,6 +748,7 @@ struct nfs_write_data {
 	struct list_head	pages;		/* Coalesced requests we wish to flush */
 	struct nfs_page		*req;		/* multi ops per nfs_page */
 	struct page		**pagevec;
+	unsigned int		npages;		/* active pages in pagevec */
 	struct nfs_writeargs	args;		/* argument struct */
 	struct nfs_writeres	res;		/* result struct */
 #ifdef CONFIG_NFS_V4
-- 
cgit v1.2.3-59-g8ed1b


From ccf01ef7aa9c6c293a1c64c27331a2ce227916ec Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 25 Jun 2006 06:27:31 -0400
Subject: Merge branch 'odirect'

---
 fs/nfs/direct.c         | 435 ++++++++++++++++++++++++++----------------------
 include/linux/nfs_xdr.h |   2 -
 2 files changed, 234 insertions(+), 203 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e25b7595b7ad..402005c35ab3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -68,19 +68,25 @@ struct nfs_direct_req {
 	struct kref		kref;		/* release manager */
 
 	/* I/O parameters */
+	struct list_head	list,		/* nfs_read/write_data structs */
+				rewrite_list;	/* saved nfs_write_data structs */
 	struct nfs_open_context	*ctx;		/* file open context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
+	unsigned long		user_addr;	/* location of user's buffer */
+	size_t			user_count;	/* total bytes to move */
+	loff_t			pos;		/* starting offset in file */
+	struct page **		pages;		/* pages in our buffer */
+	unsigned int		npages;		/* count of pages */
 
 	/* completion state */
-	atomic_t		io_count;	/* i/os we're waiting for */
 	spinlock_t		lock;		/* protect completion state */
+	int			outstanding;	/* i/os we're waiting for */
 	ssize_t			count,		/* bytes actually processed */
 				error;		/* any reported error */
 	struct completion	completion;	/* wait for i/o completion */
 
 	/* commit state */
-	struct list_head	rewrite_list;	/* saved nfs_write_data structs */
 	struct nfs_write_data *	commit_data;	/* special write_data for commits */
 	int			flags;
 #define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
@@ -88,37 +94,8 @@ struct nfs_direct_req {
 	struct nfs_writeverf	verf;		/* unstable write verifier */
 };
 
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
-static const struct rpc_call_ops nfs_write_direct_ops;
-
-static inline void get_dreq(struct nfs_direct_req *dreq)
-{
-	atomic_inc(&dreq->io_count);
-}
-
-static inline int put_dreq(struct nfs_direct_req *dreq)
-{
-	return atomic_dec_and_test(&dreq->io_count);
-}
-
-/*
- * "size" is never larger than rsize or wsize.
- */
-static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size)
-{
-	int page_count;
-
-	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	page_count -= user_addr >> PAGE_SHIFT;
-	BUG_ON(page_count < 0);
-
-	return page_count;
-}
-
-static inline unsigned int nfs_max_pages(unsigned int size)
-{
-	return (size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
 
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
@@ -142,21 +119,50 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_
 	return -EINVAL;
 }
 
-static void nfs_direct_dirty_pages(struct page **pages, int npages)
+static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
 {
 	int i;
 	for (i = 0; i < npages; i++) {
 		struct page *page = pages[i];
-		if (!PageCompound(page))
+		if (do_dirty && !PageCompound(page))
 			set_page_dirty_lock(page);
+		page_cache_release(page);
 	}
+	kfree(pages);
 }
 
-static void nfs_direct_release_pages(struct page **pages, int npages)
+static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
 {
-	int i;
-	for (i = 0; i < npages; i++)
-		page_cache_release(pages[i]);
+	int result = -ENOMEM;
+	unsigned long page_count;
+	size_t array_size;
+
+	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	page_count -= user_addr >> PAGE_SHIFT;
+
+	array_size = (page_count * sizeof(struct page *));
+	*pages = kmalloc(array_size, GFP_KERNEL);
+	if (*pages) {
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					page_count, (rw == READ), 0,
+					*pages, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (result != page_count) {
+			/*
+			 * If we got fewer pages than expected from
+			 * get_user_pages(), the user buffer runs off the
+			 * end of a mapping; return EFAULT.
+			 */
+			if (result >= 0) {
+				nfs_free_user_pages(*pages, result, 0);
+				result = -EFAULT;
+			} else
+				kfree(*pages);
+			*pages = NULL;
+		}
+	}
+	return result;
 }
 
 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
@@ -168,13 +174,13 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 		return NULL;
 
 	kref_init(&dreq->kref);
-	kref_get(&dreq->kref);
 	init_completion(&dreq->completion);
+	INIT_LIST_HEAD(&dreq->list);
 	INIT_LIST_HEAD(&dreq->rewrite_list);
 	dreq->iocb = NULL;
 	dreq->ctx = NULL;
 	spin_lock_init(&dreq->lock);
-	atomic_set(&dreq->io_count, 0);
+	dreq->outstanding = 0;
 	dreq->count = 0;
 	dreq->error = 0;
 	dreq->flags = 0;
@@ -215,11 +221,18 @@ out:
 }
 
 /*
- * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
- * the iocb is still valid here if this is a synchronous request.
+ * We must hold a reference to all the pages in this direct read request
+ * until the RPCs complete.  This could be long *after* we are woken up in
+ * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
+ *
+ * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
+ * can't trust the iocb is still valid here if this is a synchronous
+ * request.  If the waiter is woken prematurely, the iocb is long gone.
  */
 static void nfs_direct_complete(struct nfs_direct_req *dreq)
 {
+	nfs_free_user_pages(dreq->pages, dreq->npages, 1);
+
 	if (dreq->iocb) {
 		long res = (long) dreq->error;
 		if (!res)
@@ -232,10 +245,48 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 }
 
 /*
- * We must hold a reference to all the pages in this direct read request
- * until the RPCs complete.  This could be long *after* we are woken up in
- * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
+ * Note we also set the number of requests we have in the dreq when we are
+ * done.  This prevents races with I/O completion so we will always wait
+ * until all requests have been dispatched and completed.
  */
+static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
+{
+	struct list_head *list;
+	struct nfs_direct_req *dreq;
+	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		return NULL;
+
+	list = &dreq->list;
+	for(;;) {
+		struct nfs_read_data *data = nfs_readdata_alloc(rpages);
+
+		if (unlikely(!data)) {
+			while (!list_empty(list)) {
+				data = list_entry(list->next,
+						  struct nfs_read_data, pages);
+				list_del(&data->pages);
+				nfs_readdata_free(data);
+			}
+			kref_put(&dreq->kref, nfs_direct_req_release);
+			return NULL;
+		}
+
+		INIT_LIST_HEAD(&data->pages);
+		list_add(&data->pages, list);
+
+		data->req = (struct nfs_page *) dreq;
+		dreq->outstanding++;
+		if (nbytes <= rsize)
+			break;
+		nbytes -= rsize;
+	}
+	kref_get(&dreq->kref);
+	return dreq;
+}
+
 static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
@@ -244,9 +295,6 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 
-	nfs_direct_dirty_pages(data->pagevec, data->npages);
-	nfs_direct_release_pages(data->pagevec, data->npages);
-
 	spin_lock(&dreq->lock);
 
 	if (likely(task->tk_status >= 0))
@@ -254,10 +302,13 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	else
 		dreq->error = task->tk_status;
 
-	spin_unlock(&dreq->lock);
+	if (--dreq->outstanding) {
+		spin_unlock(&dreq->lock);
+		return;
+	}
 
-	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
+	spin_unlock(&dreq->lock);
+	nfs_direct_complete(dreq);
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -266,60 +317,41 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 };
 
 /*
- * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
- * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
- * bail and stop sending more reads.  Read length accounting is
- * handled automatically by nfs_direct_read_result().  Otherwise, if
- * no requests have been sent, just return an error.
+ * For each nfs_read_data struct that was allocated on the list, dispatch
+ * an NFS READ operation
  */
-static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
+	struct list_head *list = &dreq->list;
+	struct page **pages = dreq->pages;
+	size_t count = dreq->user_count;
+	loff_t pos = dreq->pos;
 	size_t rsize = NFS_SERVER(inode)->rsize;
-	unsigned int rpages = nfs_max_pages(rsize);
-	unsigned int pgbase;
-	int result;
-	ssize_t started = 0;
-
-	get_dreq(dreq);
+	unsigned int curpage, pgbase;
 
-	pgbase = user_addr & ~PAGE_MASK;
+	curpage = 0;
+	pgbase = dreq->user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_read_data *data;
 		size_t bytes;
 
-		result = -ENOMEM;
-		data = nfs_readdata_alloc(rpages);
-		if (unlikely(!data))
-			break;
-
 		bytes = rsize;
 		if (count < rsize)
 			bytes = count;
 
-		data->npages = nfs_direct_count_pages(user_addr, bytes);
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
-					data->npages, 1, 0, data->pagevec, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (unlikely(result < data->npages)) {
-			if (result > 0)
-				nfs_direct_release_pages(data->pagevec, result);
-			nfs_readdata_release(data);
-			break;
-		}
-
-		get_dreq(dreq);
+		BUG_ON(list_empty(list));
+		data = list_entry(list->next, struct nfs_read_data, pages);
+		list_del_init(&data->pages);
 
-		data->req = (struct nfs_page *) dreq;
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
+		data->args.pages = &pages[curpage];
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.eof = 0;
@@ -342,35 +374,33 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 				bytes,
 				(unsigned long long)data->args.offset);
 
-		started += bytes;
-		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
+		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
-
-	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
-
-	if (started)
-		return 0;
-	return result < 0 ? (ssize_t) result : -EFAULT;
+	BUG_ON(!list_empty(list));
 }
 
-static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
 {
-	ssize_t result = 0;
+	ssize_t result;
 	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct rpc_clnt *clnt = NFS_CLIENT(inode);
 	struct nfs_direct_req *dreq;
 
-	dreq = nfs_direct_req_alloc();
+	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
 	if (!dreq)
 		return -ENOMEM;
 
+	dreq->user_addr = user_addr;
+	dreq->user_count = count;
+	dreq->pos = pos;
+	dreq->pages = pages;
+	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -378,9 +408,8 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
 	rpc_clnt_sigmask(clnt, &oldset);
-	result = nfs_direct_read_schedule(dreq, user_addr, count, pos);
-	if (!result)
-		result = nfs_direct_wait(dreq);
+	nfs_direct_read_schedule(dreq);
+	result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -388,10 +417,10 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 {
-	while (!list_empty(&dreq->rewrite_list)) {
-		struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
+	list_splice_init(&dreq->rewrite_list, &dreq->list);
+	while (!list_empty(&dreq->list)) {
+		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
-		nfs_direct_release_pages(data->pagevec, data->npages);
 		nfs_writedata_release(data);
 	}
 }
@@ -399,51 +428,14 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
-	struct inode *inode = dreq->inode;
-	struct list_head *p;
-	struct nfs_write_data *data;
+	struct list_head *pos;
 
+	list_splice_init(&dreq->rewrite_list, &dreq->list);
+	list_for_each(pos, &dreq->list)
+		dreq->outstanding++;
 	dreq->count = 0;
-	get_dreq(dreq);
-
-	list_for_each(p, &dreq->rewrite_list) {
-		data = list_entry(p, struct nfs_write_data, pages);
-
-		get_dreq(dreq);
-
-		/*
-		 * Reset data->res.
-		 */
-		nfs_fattr_init(&data->fattr);
-		data->res.count = data->args.count;
-		memset(&data->verf, 0, sizeof(data->verf));
-
-		/*
-		 * Reuse data->task; data->args should not have changed
-		 * since the original request was sent.
-		 */
-		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
-				&nfs_write_direct_ops, data);
-		NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
-
-		data->task.tk_priority = RPC_PRIORITY_NORMAL;
-		data->task.tk_cookie = (unsigned long) inode;
-
-		/*
-		 * We're called via an RPC callback, so BKL is already held.
-		 */
-		rpc_execute(&data->task);
-
-		dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
-				data->task.tk_pid,
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode),
-				data->args.count,
-				(unsigned long long)data->args.offset);
-	}
 
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, inode);
+	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
 }
 
 static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
@@ -480,8 +472,8 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->cred = dreq->ctx->cred;
 
 	data->args.fh = NFS_FH(data->inode);
-	data->args.offset = 0;
-	data->args.count = 0;
+	data->args.offset = dreq->pos;
+	data->args.count = dreq->user_count;
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
@@ -543,6 +535,47 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 }
 #endif
 
+static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
+{
+	struct list_head *list;
+	struct nfs_direct_req *dreq;
+	unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		return NULL;
+
+	list = &dreq->list;
+	for(;;) {
+		struct nfs_write_data *data = nfs_writedata_alloc(wpages);
+
+		if (unlikely(!data)) {
+			while (!list_empty(list)) {
+				data = list_entry(list->next,
+						  struct nfs_write_data, pages);
+				list_del(&data->pages);
+				nfs_writedata_free(data);
+			}
+			kref_put(&dreq->kref, nfs_direct_req_release);
+			return NULL;
+		}
+
+		INIT_LIST_HEAD(&data->pages);
+		list_add(&data->pages, list);
+
+		data->req = (struct nfs_page *) dreq;
+		dreq->outstanding++;
+		if (nbytes <= wsize)
+			break;
+		nbytes -= wsize;
+	}
+
+	nfs_alloc_commit_data(dreq);
+
+	kref_get(&dreq->kref);
+	return dreq;
+}
+
 static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
@@ -572,6 +605,8 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 				}
 		}
 	}
+	/* In case we have to resend */
+	data->args.stable = NFS_FILE_SYNC;
 
 	spin_unlock(&dreq->lock);
 }
@@ -585,8 +620,14 @@ static void nfs_direct_write_release(void *calldata)
 	struct nfs_write_data *data = calldata;
 	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
 
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, data->inode);
+	spin_lock(&dreq->lock);
+	if (--dreq->outstanding) {
+		spin_unlock(&dreq->lock);
+		return;
+	}
+	spin_unlock(&dreq->lock);
+
+	nfs_direct_write_complete(dreq, data->inode);
 }
 
 static const struct rpc_call_ops nfs_write_direct_ops = {
@@ -595,62 +636,41 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
 };
 
 /*
- * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
- * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
- * bail and stop sending more writes.  Write length accounting is
- * handled automatically by nfs_direct_write_result().  Otherwise, if
- * no requests have been sent, just return an error.
+ * For each nfs_write_data struct that was allocated on the list, dispatch
+ * an NFS WRITE operation
  */
-static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
+	struct list_head *list = &dreq->list;
+	struct page **pages = dreq->pages;
+	size_t count = dreq->user_count;
+	loff_t pos = dreq->pos;
 	size_t wsize = NFS_SERVER(inode)->wsize;
-	unsigned int wpages = nfs_max_pages(wsize);
-	unsigned int pgbase;
-	int result;
-	ssize_t started = 0;
+	unsigned int curpage, pgbase;
 
-	get_dreq(dreq);
-
-	pgbase = user_addr & ~PAGE_MASK;
+	curpage = 0;
+	pgbase = dreq->user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_write_data *data;
 		size_t bytes;
 
-		result = -ENOMEM;
-		data = nfs_writedata_alloc(wpages);
-		if (unlikely(!data))
-			break;
-
 		bytes = wsize;
 		if (count < wsize)
 			bytes = count;
 
-		data->npages = nfs_direct_count_pages(user_addr, bytes);
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
-					data->npages, 0, 0, data->pagevec, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (unlikely(result < data->npages)) {
-			if (result > 0)
-				nfs_direct_release_pages(data->pagevec, result);
-			nfs_writedata_release(data);
-			break;
-		}
-
-		get_dreq(dreq);
-
+		BUG_ON(list_empty(list));
+		data = list_entry(list->next, struct nfs_write_data, pages);
 		list_move_tail(&data->pages, &dreq->rewrite_list);
 
-		data->req = (struct nfs_page *) dreq;
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
+		data->args.pages = &pages[curpage];
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.count = bytes;
@@ -674,26 +694,19 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 				bytes,
 				(unsigned long long)data->args.offset);
 
-		started += bytes;
-		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
+		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
-
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, inode);
-
-	if (started)
-		return 0;
-	return result < 0 ? (ssize_t) result : -EFAULT;
+	BUG_ON(!list_empty(list));
 }
 
-static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
 {
-	ssize_t result = 0;
+	ssize_t result;
 	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct rpc_clnt *clnt = NFS_CLIENT(inode);
@@ -701,14 +714,17 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	int sync = 0;
 
-	dreq = nfs_direct_req_alloc();
+	dreq = nfs_direct_write_alloc(count, wsize);
 	if (!dreq)
 		return -ENOMEM;
-	nfs_alloc_commit_data(dreq);
-
 	if (dreq->commit_data == NULL || count < wsize)
 		sync = FLUSH_STABLE;
 
+	dreq->user_addr = user_addr;
+	dreq->user_count = count;
+	dreq->pos = pos;
+	dreq->pages = pages;
+	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -719,9 +735,8 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	nfs_begin_data_update(inode);
 
 	rpc_clnt_sigmask(clnt, &oldset);
-	result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
-	if (!result)
-		result = nfs_direct_wait(dreq);
+	nfs_direct_write_schedule(dreq, sync);
+	result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -751,6 +766,8 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval = -EINVAL;
+	int page_count;
+	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -772,7 +789,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count,
 	if (retval)
 		goto out;
 
-	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos);
+	retval = nfs_get_user_pages(READ, (unsigned long) buf,
+						count, &pages);
+	if (retval < 0)
+		goto out;
+	page_count = retval;
+
+	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
+						pages, page_count);
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
 
@@ -808,6 +832,8 @@ out:
 ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval;
+	int page_count;
+	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -835,7 +861,14 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t
 	if (retval)
 		goto out;
 
-	retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
+	retval = nfs_get_user_pages(WRITE, (unsigned long) buf,
+						count, &pages);
+	if (retval < 0)
+		goto out;
+	page_count = retval;
+
+	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
+					pos, pages, page_count);
 
 	/*
 	 * XXX: nfs_end_data_update() already ensures this file's
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 2d3fb6416d91..7c7320fa51aa 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -729,7 +729,6 @@ struct nfs_read_data {
 	struct list_head	pages;	/* Coalesced read requests */
 	struct nfs_page		*req;	/* multi ops per nfs_page */
 	struct page		**pagevec;
-	unsigned int		npages;	/* active pages in pagevec */
 	struct nfs_readargs args;
 	struct nfs_readres  res;
 #ifdef CONFIG_NFS_V4
@@ -748,7 +747,6 @@ struct nfs_write_data {
 	struct list_head	pages;		/* Coalesced requests we wish to flush */
 	struct nfs_page		*req;		/* multi ops per nfs_page */
 	struct page		**pagevec;
-	unsigned int		npages;		/* active pages in pagevec */
 	struct nfs_writeargs	args;		/* argument struct */
 	struct nfs_writeres	res;		/* result struct */
 #ifdef CONFIG_NFS_V4
-- 
cgit v1.2.3-59-g8ed1b


From d75d54147db9db5194040bd1c5022df6ba36ee48 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 25 Jun 2006 02:41:26 -0700
Subject: git-nfs-build-fixes

Fix various build problems that occur when NFSv4 support (CONFIG_NFS_V4) is
disabled, along with a few other things.

In file included from fs/nfs/inode.c:50:
fs/nfs/internal.h:24: error: static declaration of 'nfs_do_refmount' follows non-static declaration
include/linux/nfs_fs.h:320: error: previous declaration of 'nfs_do_refmount' was here
fs/nfs/internal.h:65: warning: 'struct nfs4_fs_locations' declared inside parameter list
fs/nfs/internal.h:65: warning: its scope is only this definition or declaration, which is probably not what you want
fs/nfs/internal.h: In function 'nfs4_path':
fs/nfs/internal.h:97: error: 'struct nfs_server' has no member named 'mnt_path'
fs/nfs/inode.c: In function 'init_once':
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'open_states'
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'delegation'
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'delegation_state'
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'rwsem'
distcc[26452] ERROR: compile fs/nfs/inode.c on g5/64 failed
make[1]: *** [fs/nfs/inode.o] Error 1
make: *** [fs/nfs/inode.o] Error 2
make: *** Waiting for unfinished jobs....
In file included from fs/nfs/nfs3xdr.c:26:
fs/nfs/internal.h:24: error: static declaration of 'nfs_do_refmount' follows non-static declaration
include/linux/nfs_fs.h:320: error: previous declaration of 'nfs_do_refmount' was here
fs/nfs/internal.h:65: warning: 'struct nfs4_fs_locations' declared inside parameter list
fs/nfs/internal.h:65: warning: its scope is only this definition or declaration, which is probably not what you want
fs/nfs/internal.h: In function 'nfs4_path':
fs/nfs/internal.h:97: error: 'struct nfs_server' has no member named 'mnt_path'
distcc[26486] ERROR: compile fs/nfs/nfs3xdr.c on g5/64 failed
make[1]: *** [fs/nfs/nfs3xdr.o] Error 1
make: *** [fs/nfs/nfs3xdr.o] Error 2
In file included from fs/nfs/nfs3proc.c:24:
fs/nfs/internal.h:24: error: static declaration of 'nfs_do_refmount' follows non-static declaration
include/linux/nfs_fs.h:320: error: previous declaration of 'nfs_do_refmount' was here
fs/nfs/internal.h:65: warning: 'struct nfs4_fs_locations' declared inside parameter list
fs/nfs/internal.h:65: warning: its scope is only this definition or declaration, which is probably not what you want
fs/nfs/internal.h: In function 'nfs4_path':
fs/nfs/internal.h:97: error: 'struct nfs_server' has no member named 'mnt_path'
distcc[26469] ERROR: compile fs/nfs/nfs3proc.c on bix/32 failed
make[1]: *** [fs/nfs/nfs3proc.o] Error 1
make: *** [fs/nfs/nfs3proc.o] Error 2
**FAILED**

Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andreas Gruenbacher <agruen@suse.de>
Cc: Andy Adamson <andros@citi.umich.edu>
Cc: Chuck Lever <cel@netapp.com>
Cc: David Howells <dhowells@redhat.com>
Cc: J. Bruce Fields <bfields@fieldses.org>
Cc: Manoj Naik <manoj@almaden.ibm.com>
Cc: Marc Eshel <eshel@almaden.ibm.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         | 16 +++++++++-------
 fs/nfs/internal.h      |  9 ++++++++-
 fs/nfs/nfs2xdr.c       |  2 ++
 include/linux/nfs_fs.h |  2 --
 4 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 24a7139d3449..51bc88b662fe 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1089,13 +1089,15 @@ void nfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
 
-#define nfs4_init_once(nfsi) \
-	do { \
-		INIT_LIST_HEAD(&(nfsi)->open_states); \
-		nfsi->delegation = NULL; \
-		nfsi->delegation_state = 0; \
-		init_rwsem(&nfsi->rwsem); \
-	} while(0)
+static inline void nfs4_init_once(struct nfs_inode *nfsi)
+{
+#ifdef CONFIG_NFS_V4
+	INIT_LIST_HEAD(&nfsi->open_states);
+	nfsi->delegation = NULL;
+	nfsi->delegation_state = 0;
+	init_rwsem(&nfsi->rwsem);
+#endif
+}
 
 static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5e51c4535b6f..bd2815e2dec1 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -58,11 +58,13 @@ extern int nfs_stat_to_errno(int);
 extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
 
 /* nfs4proc.c */
+#ifdef CONFIG_NFS_V4
 extern struct rpc_procinfo nfs4_procedures[];
 
 extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
 				  struct nfs4_fs_locations *fs_locations,
 				  struct page *page);
+#endif
 
 /* inode.c */
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
@@ -92,9 +94,14 @@ extern char *nfs_path(const char *base, const struct dentry *dentry,
 /*
  * Determine the mount path as a string
  */
-static inline char *nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
+static inline char *
+nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
 {
+#ifdef CONFIG_NFS_V4
 	return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
+#else
+	return NULL;
+#endif
 }
 
 /*
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 67391eef6b93..3b939e055a08 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -25,6 +25,8 @@
 #include <linux/nfs_fs.h>
 #include "internal.h"
 
+#include "internal.h"
+
 #define NFSDBG_FACILITY		NFSDBG_XDR
 /* #define NFS_PARANOIA 1 */
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 152798949113..0a1740b2532e 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -317,8 +317,6 @@ extern struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
 					const struct dentry *dentry,
 					struct nfs_fh *fh,
 					struct nfs_fattr *fattr);
-extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent,
-					struct dentry *dentry);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern u32 root_nfs_parse_addr(char *name); /*__init*/
-- 
cgit v1.2.3-59-g8ed1b