From 474a00ee1306eb7e82329fdc28b6471a99facba1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Mar 2011 21:31:32 -0400
Subject: kill simple_set_mnt()

not needed anymore, since all users (->get_sb() instances) are gone.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c     | 8 --------
 include/linux/fs.h | 1 -
 2 files changed, 9 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index d7513485c1f3..a2a01a104ab0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -466,14 +466,6 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
 	br_write_unlock(vfsmount_lock);
 }
 
-void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
-{
-	mnt->mnt_sb = sb;
-	mnt->mnt_root = dget(sb->s_root);
-}
-
-EXPORT_SYMBOL(simple_set_mnt);
-
 void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 92f7e04aea11..7061a8587ee3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1839,7 +1839,6 @@ extern struct dentry *mount_pseudo(struct file_system_type *, char *,
 	const struct super_operations *ops,
 	const struct dentry_operations *dops,
 	unsigned long);
-extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
 
 static inline void sb_mark_dirty(struct super_block *sb)
 {
-- 
cgit v1.2.3-59-g8ed1b


From fbe0aa1f3d16fac5b641c0c1697371dcbe45b569 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@gmail.com>
Date: Thu, 17 Mar 2011 16:29:15 -0700
Subject: Some fixes for pstore

1) Change from ->get_sb() to ->mount()
2) Use mount_single() instead of mount_nodev()
3) Pulled in ramfs_get_inode() & trimmed to what I need for pstore
4) Drop the ugly pstore_writefile() Just save data using kmalloc() and
   provide a pstore_file_read() that uses simple_read_from_buffer().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pstore/inode.c | 116 ++++++++++++++++++++++++++----------------------------
 1 file changed, 56 insertions(+), 60 deletions(-)

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 549d245d0b42..08342232cb1c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -40,9 +40,29 @@
 struct pstore_private {
 	u64	id;
 	int	(*erase)(u64);
+	ssize_t	size;
+	char	data[];
 };
 
-#define pstore_get_inode ramfs_get_inode
+static int pstore_file_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
+						size_t count, loff_t *ppos)
+{
+	struct pstore_private *ps = file->private_data;
+
+	return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
+}
+
+static const struct file_operations pstore_file_operations = {
+	.open	= pstore_file_open,
+	.read	= pstore_file_read,
+	.llseek	= default_llseek,
+};
 
 /*
  * When a file is unlinked from our file system we call the
@@ -63,6 +83,30 @@ static const struct inode_operations pstore_dir_inode_operations = {
 	.unlink		= pstore_unlink,
 };
 
+static struct inode *pstore_get_inode(struct super_block *sb,
+					const struct inode *dir, int mode, dev_t dev)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode->i_uid = inode->i_gid = 0;
+		inode->i_mode = mode;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		switch (mode & S_IFMT) {
+		case S_IFREG:
+			inode->i_fop = &pstore_file_operations;
+			break;
+		case S_IFDIR:
+			inode->i_op = &pstore_dir_inode_operations;
+			inode->i_fop = &simple_dir_operations;
+			inc_nlink(inode);
+			break;
+		}
+	}
+	return inode;
+}
+
 static const struct super_operations pstore_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
@@ -70,37 +114,10 @@ static const struct super_operations pstore_ops = {
 };
 
 static struct super_block *pstore_sb;
-static struct vfsmount *pstore_mnt;
 
 int pstore_is_mounted(void)
 {
-	return pstore_mnt != NULL;
-}
-
-/*
- * Set up a file structure as if we had opened this file and
- * write our data to it.
- */
-static int pstore_writefile(struct inode *inode, struct dentry *dentry,
-	char *data, size_t size)
-{
-	struct file f;
-	ssize_t n;
-	mm_segment_t old_fs = get_fs();
-
-	memset(&f, '0', sizeof f);
-	f.f_mapping = inode->i_mapping;
-	f.f_path.dentry = dentry;
-	f.f_path.mnt = pstore_mnt;
-	f.f_pos = 0;
-	f.f_op = inode->i_fop;
-	set_fs(KERNEL_DS);
-	n = do_sync_write(&f, data, size, &f.f_pos);
-	set_fs(old_fs);
-
-	fsnotify_modify(&f);
-
-	return n == size;
+	return pstore_sb != NULL;
 }
 
 /*
@@ -123,8 +140,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
 	inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
 	if (!inode)
 		goto fail;
-	inode->i_uid = inode->i_gid = 0;
-	private = kmalloc(sizeof *private, GFP_KERNEL);
+	private = kmalloc(sizeof *private + size, GFP_KERNEL);
 	if (!private)
 		goto fail_alloc;
 	private->id = id;
@@ -152,28 +168,19 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
 	if (IS_ERR(dentry))
 		goto fail_lockedalloc;
 
-	d_add(dentry, inode);
-
-	mutex_unlock(&root->d_inode->i_mutex);
-
-	if (!pstore_writefile(inode, dentry, data, size))
-		goto fail_write;
+	memcpy(private->data, data, size);
+	inode->i_size = private->size = size;
 
 	inode->i_private = private;
 
 	if (time.tv_sec)
 		inode->i_mtime = inode->i_ctime = time;
 
-	return 0;
+	d_add(dentry, inode);
 
-fail_write:
-	kfree(private);
-	inode->i_nlink--;
-	mutex_lock(&root->d_inode->i_mutex);
-	d_delete(dentry);
-	dput(dentry);
 	mutex_unlock(&root->d_inode->i_mutex);
-	goto fail;
+
+	return 0;
 
 fail_lockedalloc:
 	mutex_unlock(&root->d_inode->i_mutex);
@@ -225,32 +232,21 @@ fail:
 	return err;
 }
 
-static int pstore_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *pstore_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	struct dentry *root;
-
-	root = mount_nodev(fs_type, flags, data, pstore_fill_super);
-	if (IS_ERR(root))
-		return -ENOMEM;
-
-	mnt->mnt_root = root;
-	mnt->mnt_sb = root->d_sb;
-	pstore_mnt = mnt;
-
-	return 0;
+	return mount_single(fs_type, flags, data, pstore_fill_super);
 }
 
 static void pstore_kill_sb(struct super_block *sb)
 {
 	kill_litter_super(sb);
 	pstore_sb = NULL;
-	pstore_mnt = NULL;
 }
 
 static struct file_system_type pstore_fs_type = {
 	.name		= "pstore",
-	.get_sb		= pstore_get_sb,
+	.mount		= pstore_mount,
 	.kill_sb	= pstore_kill_sb,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 9d412a43c3b26e1e549319e5eec26f0829f9f74d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Mar 2011 22:08:28 -0400
Subject: vfs: split off vfsmount-related parts of vfs_kern_mount()

new function: mount_fs().  Does all work done by vfs_kern_mount()
except the allocation and filling of vfsmount; returns root dentry
or ERR_PTR().

vfs_kern_mount() switched to using it and taken to fs/namespace.c,
along with its wrappers.

alloc_vfsmnt()/free_vfsmnt() made static.

functions in namespace.c slightly reordered.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h  |   5 +-
 fs/namespace.c | 153 ++++++++++++++++++++++++++++++++++++++++++---------------
 fs/super.c     |  96 ++++++------------------------------
 3 files changed, 132 insertions(+), 122 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index f3d15de44b15..17191546d527 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
 #include <linux/lglock.h>
 
 struct super_block;
+struct file_system_type;
 struct linux_binprm;
 struct path;
 
@@ -61,8 +62,6 @@ extern int check_unsafe_exec(struct linux_binprm *);
 extern int copy_mount_options(const void __user *, unsigned long *);
 extern int copy_mount_string(const void __user *, char **);
 
-extern void free_vfsmnt(struct vfsmount *);
-extern struct vfsmount *alloc_vfsmnt(const char *);
 extern unsigned int mnt_get_count(struct vfsmount *mnt);
 extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
@@ -99,6 +98,8 @@ extern struct file *get_empty_filp(void);
 extern int do_remount_sb(struct super_block *, int, void *, int);
 extern void __put_super(struct super_block *sb);
 extern void put_super(struct super_block *sb);
+extern struct dentry *mount_fs(struct file_system_type *,
+			       int, const char *, void *);
 
 /*
  * open.c
diff --git a/fs/namespace.c b/fs/namespace.c
index a2a01a104ab0..453529f72dff 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -196,7 +196,7 @@ unsigned int mnt_get_count(struct vfsmount *mnt)
 #endif
 }
 
-struct vfsmount *alloc_vfsmnt(const char *name)
+static struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
 	if (mnt) {
@@ -466,7 +466,7 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
 	br_write_unlock(vfsmount_lock);
 }
 
-void free_vfsmnt(struct vfsmount *mnt)
+static void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
@@ -670,6 +670,36 @@ static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
 	return p;
 }
 
+struct vfsmount *
+vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
+{
+	struct vfsmount *mnt;
+	struct dentry *root;
+
+	if (!type)
+		return ERR_PTR(-ENODEV);
+
+	mnt = alloc_vfsmnt(name);
+	if (!mnt)
+		return ERR_PTR(-ENOMEM);
+
+	if (flags & MS_KERNMOUNT)
+		mnt->mnt_flags = MNT_INTERNAL;
+
+	root = mount_fs(type, flags, name, data);
+	if (IS_ERR(root)) {
+		free_vfsmnt(mnt);
+		return ERR_CAST(root);
+	}
+
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
+	mnt->mnt_mountpoint = mnt->mnt_root;
+	mnt->mnt_parent = mnt;
+	return mnt;
+}
+EXPORT_SYMBOL_GPL(vfs_kern_mount);
+
 static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 					int flag)
 {
@@ -1905,7 +1935,81 @@ out:
 	return err;
 }
 
-static int do_add_mount(struct vfsmount *, struct path *, int);
+static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
+{
+	int err;
+	const char *subtype = strchr(fstype, '.');
+	if (subtype) {
+		subtype++;
+		err = -EINVAL;
+		if (!subtype[0])
+			goto err;
+	} else
+		subtype = "";
+
+	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
+	err = -ENOMEM;
+	if (!mnt->mnt_sb->s_subtype)
+		goto err;
+	return mnt;
+
+ err:
+	mntput(mnt);
+	return ERR_PTR(err);
+}
+
+struct vfsmount *
+do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+{
+	struct file_system_type *type = get_fs_type(fstype);
+	struct vfsmount *mnt;
+	if (!type)
+		return ERR_PTR(-ENODEV);
+	mnt = vfs_kern_mount(type, flags, name, data);
+	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
+	    !mnt->mnt_sb->s_subtype)
+		mnt = fs_set_subtype(mnt, fstype);
+	put_filesystem(type);
+	return mnt;
+}
+EXPORT_SYMBOL_GPL(do_kern_mount);
+
+/*
+ * add a mount into a namespace's mount tree
+ */
+static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
+{
+	int err;
+
+	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+
+	down_write(&namespace_sem);
+	/* Something was mounted here while we slept */
+	err = follow_down(path, true);
+	if (err < 0)
+		goto unlock;
+
+	err = -EINVAL;
+	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
+		goto unlock;
+
+	/* Refuse the same filesystem on the same mount point */
+	err = -EBUSY;
+	if (path->mnt->mnt_sb == newmnt->mnt_sb &&
+	    path->mnt->mnt_root == path->dentry)
+		goto unlock;
+
+	err = -EINVAL;
+	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
+		goto unlock;
+
+	newmnt->mnt_flags = mnt_flags;
+	err = graft_tree(newmnt, path);
+
+unlock:
+	up_write(&namespace_sem);
+	return err;
+}
 
 /*
  * create a new mount for userspace and request it to be added into the
@@ -1965,43 +2069,6 @@ fail:
 	return err;
 }
 
-/*
- * add a mount into a namespace's mount tree
- */
-static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
-{
-	int err;
-
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
-
-	down_write(&namespace_sem);
-	/* Something was mounted here while we slept */
-	err = follow_down(path, true);
-	if (err < 0)
-		goto unlock;
-
-	err = -EINVAL;
-	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
-		goto unlock;
-
-	/* Refuse the same filesystem on the same mount point */
-	err = -EBUSY;
-	if (path->mnt->mnt_sb == newmnt->mnt_sb &&
-	    path->mnt->mnt_root == path->dentry)
-		goto unlock;
-
-	err = -EINVAL;
-	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
-		goto unlock;
-
-	newmnt->mnt_flags = mnt_flags;
-	err = graft_tree(newmnt, path);
-
-unlock:
-	up_write(&namespace_sem);
-	return err;
-}
-
 /**
  * mnt_set_expiry - Put a mount on an expiration list
  * @mnt: The mount to list.
@@ -2660,3 +2727,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
 	kfree(ns);
 }
 EXPORT_SYMBOL(put_mnt_ns);
+
+struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
+{
+	return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
+}
+EXPORT_SYMBOL_GPL(kern_mount_data);
diff --git a/fs/super.c b/fs/super.c
index 4bae0ef6110e..e84864908264 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -910,29 +910,18 @@ struct dentry *mount_single(struct file_system_type *fs_type,
 }
 EXPORT_SYMBOL(mount_single);
 
-struct vfsmount *
-vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
+struct dentry *
+mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 {
-	struct vfsmount *mnt;
 	struct dentry *root;
+	struct super_block *sb;
 	char *secdata = NULL;
-	int error;
-
-	if (!type)
-		return ERR_PTR(-ENODEV);
-
-	error = -ENOMEM;
-	mnt = alloc_vfsmnt(name);
-	if (!mnt)
-		goto out;
-
-	if (flags & MS_KERNMOUNT)
-		mnt->mnt_flags = MNT_INTERNAL;
+	int error = -ENOMEM;
 
 	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
 		secdata = alloc_secdata();
 		if (!secdata)
-			goto out_mnt;
+			goto out;
 
 		error = security_sb_copy_data(data, secdata);
 		if (error)
@@ -944,13 +933,12 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 		error = PTR_ERR(root);
 		goto out_free_secdata;
 	}
-	mnt->mnt_root = root;
-	mnt->mnt_sb = root->d_sb;
-	BUG_ON(!mnt->mnt_sb);
-	WARN_ON(!mnt->mnt_sb->s_bdi);
-	mnt->mnt_sb->s_flags |= MS_BORN;
+	sb = root->d_sb;
+	BUG_ON(!sb);
+	WARN_ON(!sb->s_bdi);
+	sb->s_flags |= MS_BORN;
 
-	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
+	error = security_sb_kern_mount(sb, flags, secdata);
 	if (error)
 		goto out_sb;
 
@@ -961,27 +949,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	 * violate this rule. This warning should be either removed or
 	 * converted to a BUG() in 2.6.34.
 	 */
-	WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
-		"negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
+	WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
+		"negative value (%lld)\n", type->name, sb->s_maxbytes);
 
-	mnt->mnt_mountpoint = mnt->mnt_root;
-	mnt->mnt_parent = mnt;
-	up_write(&mnt->mnt_sb->s_umount);
+	up_write(&sb->s_umount);
 	free_secdata(secdata);
-	return mnt;
+	return root;
 out_sb:
-	dput(mnt->mnt_root);
-	deactivate_locked_super(mnt->mnt_sb);
+	dput(root);
+	deactivate_locked_super(sb);
 out_free_secdata:
 	free_secdata(secdata);
-out_mnt:
-	free_vfsmnt(mnt);
 out:
 	return ERR_PTR(error);
 }
 
-EXPORT_SYMBOL_GPL(vfs_kern_mount);
-
 /**
  * freeze_super - lock the filesystem and force it into a consistent state
  * @sb: the super to lock
@@ -1071,49 +1053,3 @@ out:
 	return 0;
 }
 EXPORT_SYMBOL(thaw_super);
-
-static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
-{
-	int err;
-	const char *subtype = strchr(fstype, '.');
-	if (subtype) {
-		subtype++;
-		err = -EINVAL;
-		if (!subtype[0])
-			goto err;
-	} else
-		subtype = "";
-
-	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
-	err = -ENOMEM;
-	if (!mnt->mnt_sb->s_subtype)
-		goto err;
-	return mnt;
-
- err:
-	mntput(mnt);
-	return ERR_PTR(err);
-}
-
-struct vfsmount *
-do_kern_mount(const char *fstype, int flags, const char *name, void *data)
-{
-	struct file_system_type *type = get_fs_type(fstype);
-	struct vfsmount *mnt;
-	if (!type)
-		return ERR_PTR(-ENODEV);
-	mnt = vfs_kern_mount(type, flags, name, data);
-	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
-	    !mnt->mnt_sb->s_subtype)
-		mnt = fs_set_subtype(mnt, fstype);
-	put_filesystem(type);
-	return mnt;
-}
-EXPORT_SYMBOL_GPL(do_kern_mount);
-
-struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
-{
-	return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
-}
-
-EXPORT_SYMBOL_GPL(kern_mount_data);
-- 
cgit v1.2.3-59-g8ed1b


From 27cb1572e3e6bb1f8cf6bb3d74c914a87b131792 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 18 Mar 2011 08:29:36 -0400
Subject: fix deadlock in pivot_root()

Don't hold vfsmount_lock over the loop traversing ->mnt_parent;
do check_mnt(new.mnt) under namespace_sem instead; combined with
namespace_sem held over all that code it'll guarantee the stability
of ->mnt_parent chain all the way to the root.

Doing check_mnt() outside of namespace_sem in case of pivot_root()
is wrong anyway.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 453529f72dff..46cc26b5aaf2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2569,9 +2569,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = user_path_dir(new_root, &new);
 	if (error)
 		goto out0;
-	error = -EINVAL;
-	if (!check_mnt(new.mnt))
-		goto out1;
 
 	error = user_path_dir(put_old, &old);
 	if (error)
@@ -2591,7 +2588,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		IS_MNT_SHARED(new.mnt->mnt_parent) ||
 		IS_MNT_SHARED(root.mnt->mnt_parent))
 		goto out2;
-	if (!check_mnt(root.mnt))
+	if (!check_mnt(root.mnt) || !check_mnt(new.mnt))
 		goto out2;
 	error = -ENOENT;
 	if (cant_mount(old.dentry))
@@ -2615,19 +2612,19 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out2; /* not attached */
 	/* make sure we can reach put_old from new_root */
 	tmp = old.mnt;
-	br_write_lock(vfsmount_lock);
 	if (tmp != new.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
-				goto out3; /* already mounted on put_old */
+				goto out2; /* already mounted on put_old */
 			if (tmp->mnt_parent == new.mnt)
 				break;
 			tmp = tmp->mnt_parent;
 		}
 		if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
-			goto out3;
+			goto out2;
 	} else if (!is_subdir(old.dentry, new.dentry))
-		goto out3;
+		goto out2;
+	br_write_lock(vfsmount_lock);
 	detach_mnt(new.mnt, &parent_path);
 	detach_mnt(root.mnt, &root_parent);
 	/* mount old root on put_old */
@@ -2650,9 +2647,6 @@ out1:
 	path_put(&new);
 out0:
 	return error;
-out3:
-	br_write_unlock(vfsmount_lock);
-	goto out2;
 }
 
 static void __init init_mount_tree(void)
-- 
cgit v1.2.3-59-g8ed1b


From b12cea9198fa99ffd3de1776c323bc7464d26b44 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 18 Mar 2011 08:55:38 -0400
Subject: change the locking order for namespace_sem

Have it nested inside ->i_mutex.  Instead of using follow_down()
under namespace_sem, followed by grabbing i_mutex and checking that
mountpoint to be is not dead, do the following:
	grab i_mutex
	check that it's not dead
	grab namespace_sem
	see if anything is mounted there
	if not, we've won
	otherwise
		drop locks
		put_path on what we had
		replace with what's mounted
		retry everything with new mountpoint to be

New helper (lock_mount()) does that.  do_add_mount(), do_move_mount(),
do_loopback() and pivot_root() switched to it; in case of the last
two that eliminates a race we used to have - original code didn't
do follow_down().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 133 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 73 insertions(+), 60 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 46cc26b5aaf2..9263995bf6a1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1663,9 +1663,35 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	return err;
 }
 
+static int lock_mount(struct path *path)
+{
+	struct vfsmount *mnt;
+retry:
+	mutex_lock(&path->dentry->d_inode->i_mutex);
+	if (unlikely(cant_mount(path->dentry))) {
+		mutex_unlock(&path->dentry->d_inode->i_mutex);
+		return -ENOENT;
+	}
+	down_write(&namespace_sem);
+	mnt = lookup_mnt(path);
+	if (likely(!mnt))
+		return 0;
+	up_write(&namespace_sem);
+	mutex_unlock(&path->dentry->d_inode->i_mutex);
+	path_put(path);
+	path->mnt = mnt;
+	path->dentry = dget(mnt->mnt_root);
+	goto retry;
+}
+
+static void unlock_mount(struct path *path)
+{
+	up_write(&namespace_sem);
+	mutex_unlock(&path->dentry->d_inode->i_mutex);
+}
+
 static int graft_tree(struct vfsmount *mnt, struct path *path)
 {
-	int err;
 	if (mnt->mnt_sb->s_flags & MS_NOUSER)
 		return -EINVAL;
 
@@ -1673,16 +1699,10 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 	      S_ISDIR(mnt->mnt_root->d_inode->i_mode))
 		return -ENOTDIR;
 
-	err = -ENOENT;
-	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (cant_mount(path->dentry))
-		goto out_unlock;
+	if (d_unlinked(path->dentry))
+		return -ENOENT;
 
-	if (!d_unlinked(path->dentry))
-		err = attach_recursive_mnt(mnt, path, NULL);
-out_unlock:
-	mutex_unlock(&path->dentry->d_inode->i_mutex);
-	return err;
+	return attach_recursive_mnt(mnt, path, NULL);
 }
 
 /*
@@ -1745,6 +1765,7 @@ static int do_change_type(struct path *path, int flag)
 static int do_loopback(struct path *path, char *old_name,
 				int recurse)
 {
+	LIST_HEAD(umount_list);
 	struct path old_path;
 	struct vfsmount *mnt = NULL;
 	int err = mount_is_safe(path);
@@ -1756,13 +1777,16 @@ static int do_loopback(struct path *path, char *old_name,
 	if (err)
 		return err;
 
-	down_write(&namespace_sem);
+	err = lock_mount(path);
+	if (err)
+		goto out;
+
 	err = -EINVAL;
 	if (IS_MNT_UNBINDABLE(old_path.mnt))
-		goto out;
+		goto out2;
 
 	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
-		goto out;
+		goto out2;
 
 	err = -ENOMEM;
 	if (recurse)
@@ -1771,20 +1795,18 @@ static int do_loopback(struct path *path, char *old_name,
 		mnt = clone_mnt(old_path.mnt, old_path.dentry, 0);
 
 	if (!mnt)
-		goto out;
+		goto out2;
 
 	err = graft_tree(mnt, path);
 	if (err) {
-		LIST_HEAD(umount_list);
-
 		br_write_lock(vfsmount_lock);
 		umount_tree(mnt, 0, &umount_list);
 		br_write_unlock(vfsmount_lock);
-		release_mounts(&umount_list);
 	}
-
+out2:
+	unlock_mount(path);
+	release_mounts(&umount_list);
 out:
-	up_write(&namespace_sem);
 	path_put(&old_path);
 	return err;
 }
@@ -1873,18 +1895,12 @@ static int do_move_mount(struct path *path, char *old_name)
 	if (err)
 		return err;
 
-	down_write(&namespace_sem);
-	err = follow_down(path, true);
+	err = lock_mount(path);
 	if (err < 0)
 		goto out;
 
 	err = -EINVAL;
 	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
-		goto out;
-
-	err = -ENOENT;
-	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (cant_mount(path->dentry))
 		goto out1;
 
 	if (d_unlinked(path->dentry))
@@ -1926,9 +1942,8 @@ static int do_move_mount(struct path *path, char *old_name)
 	 * automatically */
 	list_del_init(&old_path.mnt->mnt_expire);
 out1:
-	mutex_unlock(&path->dentry->d_inode->i_mutex);
+	unlock_mount(path);
 out:
-	up_write(&namespace_sem);
 	if (!err)
 		path_put(&parent_path);
 	path_put(&old_path);
@@ -1983,11 +1998,9 @@ static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flag
 
 	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
 
-	down_write(&namespace_sem);
-	/* Something was mounted here while we slept */
-	err = follow_down(path, true);
-	if (err < 0)
-		goto unlock;
+	err = lock_mount(path);
+	if (err)
+		return err;
 
 	err = -EINVAL;
 	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
@@ -2007,7 +2020,7 @@ static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flag
 	err = graft_tree(newmnt, path);
 
 unlock:
-	up_write(&namespace_sem);
+	unlock_mount(path);
 	return err;
 }
 
@@ -2575,55 +2588,53 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out1;
 
 	error = security_sb_pivotroot(&old, &new);
-	if (error) {
-		path_put(&old);
-		goto out1;
-	}
+	if (error)
+		goto out2;
 
 	get_fs_root(current->fs, &root);
-	down_write(&namespace_sem);
-	mutex_lock(&old.dentry->d_inode->i_mutex);
+	error = lock_mount(&old);
+	if (error)
+		goto out3;
+
 	error = -EINVAL;
 	if (IS_MNT_SHARED(old.mnt) ||
 		IS_MNT_SHARED(new.mnt->mnt_parent) ||
 		IS_MNT_SHARED(root.mnt->mnt_parent))
-		goto out2;
+		goto out4;
 	if (!check_mnt(root.mnt) || !check_mnt(new.mnt))
-		goto out2;
+		goto out4;
 	error = -ENOENT;
-	if (cant_mount(old.dentry))
-		goto out2;
 	if (d_unlinked(new.dentry))
-		goto out2;
+		goto out4;
 	if (d_unlinked(old.dentry))
-		goto out2;
+		goto out4;
 	error = -EBUSY;
 	if (new.mnt == root.mnt ||
 	    old.mnt == root.mnt)
-		goto out2; /* loop, on the same file system  */
+		goto out4; /* loop, on the same file system  */
 	error = -EINVAL;
 	if (root.mnt->mnt_root != root.dentry)
-		goto out2; /* not a mountpoint */
+		goto out4; /* not a mountpoint */
 	if (root.mnt->mnt_parent == root.mnt)
-		goto out2; /* not attached */
+		goto out4; /* not attached */
 	if (new.mnt->mnt_root != new.dentry)
-		goto out2; /* not a mountpoint */
+		goto out4; /* not a mountpoint */
 	if (new.mnt->mnt_parent == new.mnt)
-		goto out2; /* not attached */
+		goto out4; /* not attached */
 	/* make sure we can reach put_old from new_root */
 	tmp = old.mnt;
 	if (tmp != new.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
-				goto out2; /* already mounted on put_old */
+				goto out4; /* already mounted on put_old */
 			if (tmp->mnt_parent == new.mnt)
 				break;
 			tmp = tmp->mnt_parent;
 		}
 		if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
-			goto out2;
+			goto out4;
 	} else if (!is_subdir(old.dentry, new.dentry))
-		goto out2;
+		goto out4;
 	br_write_lock(vfsmount_lock);
 	detach_mnt(new.mnt, &parent_path);
 	detach_mnt(root.mnt, &root_parent);
@@ -2634,14 +2645,16 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	br_write_unlock(vfsmount_lock);
 	chroot_fs_refs(&root, &new);
-
 	error = 0;
-	path_put(&root_parent);
-	path_put(&parent_path);
-out2:
-	mutex_unlock(&old.dentry->d_inode->i_mutex);
-	up_write(&namespace_sem);
+out4:
+	unlock_mount(&old);
+	if (!error) {
+		path_put(&root_parent);
+		path_put(&parent_path);
+	}
+out3:
 	path_put(&root);
+out2:
 	path_put(&old);
 out1:
 	path_put(&new);
-- 
cgit v1.2.3-59-g8ed1b


From 7cc90cc3ffe22a0d81b8d605b20a82ec7911012d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 18 Mar 2011 09:04:20 -0400
Subject: don't pass 'mounting_here' flag to follow_down()

it's always false now

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 4 ++--
 fs/nfsd/vfs.c         | 2 +-
 include/linux/namei.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index b912b7abe747..e092648a068c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1065,7 +1065,7 @@ failed:
  * Care must be taken as namespace_sem may be held (indicated by mounting_here
  * being true).
  */
-int follow_down(struct path *path, bool mounting_here)
+int follow_down(struct path *path)
 {
 	unsigned managed;
 	int ret;
@@ -1086,7 +1086,7 @@ int follow_down(struct path *path, bool mounting_here)
 			BUG_ON(!path->dentry->d_op);
 			BUG_ON(!path->dentry->d_op->d_manage);
 			ret = path->dentry->d_op->d_manage(
-				path->dentry, mounting_here, false);
+				path->dentry, false, false);
 			if (ret < 0)
 				return ret == -EISDIR ? 0 : ret;
 		}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index da1d9701f8e4..ff93025ae2f7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -87,7 +87,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 			    .dentry = dget(dentry)};
 	int err = 0;
 
-	err = follow_down(&path, false);
+	err = follow_down(&path);
 	if (err < 0)
 		goto out;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 9c8603872c36..eba45ea10298 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -85,7 +85,7 @@ extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
-extern int follow_down(struct path *, bool);
+extern int follow_down(struct path *);
 extern int follow_up(struct path *);
 
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
-- 
cgit v1.2.3-59-g8ed1b


From 1aed3e4204dd787d53b3cd6363eb63bb4900c38e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 18 Mar 2011 09:09:02 -0400
Subject: lose 'mounting_here' argument in ->d_manage()

it's always false...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/vfs.txt | 6 +-----
 fs/autofs4/root.c                 | 6 +++---
 fs/namei.c                        | 7 +++----
 include/linux/dcache.h            | 2 +-
 4 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index ef0714aa8e40..306f0ae8df09 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -873,7 +873,7 @@ struct dentry_operations {
 	void (*d_iput)(struct dentry *, struct inode *);
 	char *(*d_dname)(struct dentry *, char *, int);
 	struct vfsmount *(*d_automount)(struct path *);
-	int (*d_manage)(struct dentry *, bool, bool);
+	int (*d_manage)(struct dentry *, bool);
 };
 
   d_revalidate: called when the VFS needs to revalidate a dentry. This
@@ -969,10 +969,6 @@ struct dentry_operations {
 	mounted on it and not to check the automount flag.  Any other error
 	code will abort pathwalk completely.
 
-	If the 'mounting_here' parameter is true, then namespace_sem is being
-	held by the caller and the function should not initiate any mounts or
-	unmounts that it will then wait for.
-
 	If the 'rcu_walk' parameter is true, then the caller is doing a
 	pathwalk in RCU-walk mode.  Sleeping is not permitted in this mode,
 	and the caller can be asked to leave it and call again by returing
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 014e7aba3b08..e6f84d26f4cf 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -36,7 +36,7 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static struct vfsmount *autofs4_d_automount(struct path *);
-static int autofs4_d_manage(struct dentry *, bool, bool);
+static int autofs4_d_manage(struct dentry *, bool);
 static void autofs4_dentry_release(struct dentry *);
 
 const struct file_operations autofs4_root_operations = {
@@ -446,7 +446,7 @@ done:
 	return NULL;
 }
 
-int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
+int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 
@@ -454,7 +454,7 @@ int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
 		dentry, dentry->d_name.len, dentry->d_name.name);
 
 	/* The daemon never waits. */
-	if (autofs4_oz_mode(sbi) || mounting_here) {
+	if (autofs4_oz_mode(sbi)) {
 		if (!d_mountpoint(dentry))
 			return -EISDIR;
 		return 0;
diff --git a/fs/namei.c b/fs/namei.c
index e092648a068c..5a9a6c3094da 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -933,8 +933,7 @@ static int follow_managed(struct path *path, unsigned flags)
 		if (managed & DCACHE_MANAGE_TRANSIT) {
 			BUG_ON(!path->dentry->d_op);
 			BUG_ON(!path->dentry->d_op->d_manage);
-			ret = path->dentry->d_op->d_manage(path->dentry,
-							   false, false);
+			ret = path->dentry->d_op->d_manage(path->dentry, false);
 			if (ret < 0)
 				return ret == -EISDIR ? 0 : ret;
 		}
@@ -999,7 +998,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		struct vfsmount *mounted;
 		if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
 		    !reverse_transit &&
-		    path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
+		    path->dentry->d_op->d_manage(path->dentry, true) < 0)
 			return false;
 		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
 		if (!mounted)
@@ -1086,7 +1085,7 @@ int follow_down(struct path *path)
 			BUG_ON(!path->dentry->d_op);
 			BUG_ON(!path->dentry->d_op->d_manage);
 			ret = path->dentry->d_op->d_manage(
-				path->dentry, false, false);
+				path->dentry, false);
 			if (ret < 0)
 				return ret == -EISDIR ? 0 : ret;
 		}
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f958c19e3ca5..1a87760d6532 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -168,7 +168,7 @@ struct dentry_operations {
 	void (*d_iput)(struct dentry *, struct inode *);
 	char *(*d_dname)(struct dentry *, char *, int);
 	struct vfsmount *(*d_automount)(struct path *);
-	int (*d_manage)(struct dentry *, bool, bool);
+	int (*d_manage)(struct dentry *, bool);
 } ____cacheline_aligned;
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 24ff6663ccfdaf088dfa7acae489cb11ed4f43c4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 18 Nov 2010 20:52:55 -0500
Subject: fs: call security_d_instantiate in d_obtain_alias V2

While trying to track down some NFS problems with BTRFS, I kept noticing I was
getting -EACCESS for no apparent reason.  Eric Paris and printk() helped me
figure out that it was SELinux that was giving me grief, with the following
denial

type=AVC msg=audit(1290013638.413:95): avc:  denied  { 0x800000 } for  pid=1772
comm="nfsd" name="" dev=sda1 ino=256 scontext=system_u:system_r:kernel_t:s0
tcontext=system_u:object_r:unlabeled_t:s0 tclass=file

Turns out this is because in d_obtain_alias if we can't find an alias we create
one and do all the normal instantiation stuff, but we don't do the
security_d_instantiate.

Usually we are protected from getting a hashed dentry that hasn't yet run
security_d_instantiate() by the parent's i_mutex, but obviously this isn't an
option there, so in order to deal with the case that a second thread comes in
and finds our new dentry before we get to run security_d_instantiate(), we go
ahead and call it if we find a dentry already.  Eric assures me that this is ok
as the code checks to see if the dentry has been initialized already so calling
security_d_instantiate() against the same dentry multiple times is ok.  With
this patch I'm no longer getting errant -EACCESS values.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index a39fe47c466f..1baddc1cec48 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1612,10 +1612,13 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	__bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
 	spin_unlock(&tmp->d_lock);
 	spin_unlock(&inode->i_lock);
+	security_d_instantiate(tmp, inode);
 
 	return tmp;
 
  out_iput:
+	if (res && !IS_ERR(res))
+		security_d_instantiate(res, inode);
 	iput(inode);
 	return res;
 }
-- 
cgit v1.2.3-59-g8ed1b