From 13bcc6a2853435bb5dad368bcbaa9d2a5b9c0ac4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 16 Jul 2016 15:22:55 -0500
Subject: sysctl: Stop implicitly passing current into sysctl_table_root.lookup

Passing nsproxy into sysctl_table_root.lookup was a premature
optimization in an attempt to avoid depending on current.  The
directory /proc/self/sys has not appeared, and if and when
it does, this code will need to be reviewed closely and reworked
anyway.  So remove the premature optimization.

Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/proc_sysctl.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1b93650dda2f..a80acdfbe180 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -72,7 +72,7 @@ static DEFINE_SPINLOCK(sysctl_lock);
 
 static void drop_sysctl_table(struct ctl_table_header *header);
 static int sysctl_follow_link(struct ctl_table_header **phead,
-	struct ctl_table **pentry, struct nsproxy *namespaces);
+	struct ctl_table **pentry);
 static int insert_links(struct ctl_table_header *head);
 static void put_links(struct ctl_table_header *header);
 
@@ -319,11 +319,11 @@ static void sysctl_head_finish(struct ctl_table_header *head)
 }
 
 static struct ctl_table_set *
-lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+lookup_header_set(struct ctl_table_root *root)
 {
 	struct ctl_table_set *set = &root->default_set;
 	if (root->lookup)
-		set = root->lookup(root, namespaces);
+		set = root->lookup(root);
 	return set;
 }
 
@@ -491,7 +491,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 		goto out;
 
 	if (S_ISLNK(p->mode)) {
-		ret = sysctl_follow_link(&h, &p, current->nsproxy);
+		ret = sysctl_follow_link(&h, &p);
 		err = ERR_PTR(ret);
 		if (ret)
 			goto out;
@@ -659,7 +659,7 @@ static bool proc_sys_link_fill_cache(struct file *file,
 
 	if (S_ISLNK(table->mode)) {
 		/* It is not an error if we can not follow the link ignore it */
-		int err = sysctl_follow_link(&head, &table, current->nsproxy);
+		int err = sysctl_follow_link(&head, &table);
 		if (err)
 			goto out;
 	}
@@ -976,7 +976,7 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
 }
 
 static int sysctl_follow_link(struct ctl_table_header **phead,
-	struct ctl_table **pentry, struct nsproxy *namespaces)
+	struct ctl_table **pentry)
 {
 	struct ctl_table_header *head;
 	struct ctl_table_root *root;
@@ -988,7 +988,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
 	ret = 0;
 	spin_lock(&sysctl_lock);
 	root = (*pentry)->data;
-	set = lookup_header_set(root, namespaces);
+	set = lookup_header_set(root);
 	dir = xlate_dir(set, (*phead)->parent);
 	if (IS_ERR(dir))
 		ret = PTR_ERR(dir);
-- 
cgit v1.2.3-59-g8ed1b


From 537f7ccb396804c6d0057b93ba8eb104ba44f851 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 8 Aug 2016 14:37:37 -0500
Subject: mntns: Add a limit on the number of mount namespaces.

v2: Fixed the very obvious lack of setting ucounts
    on struct mnt_namespace reported by Andrei Vagin, and the kbuild
    test report.

Reported-by: Andrei Vagin <avagin@openvz.org>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/mount.h                     |  1 +
 fs/namespace.c                 | 22 +++++++++++++++++++++-
 include/linux/user_namespace.h |  1 +
 kernel/ucount.c                |  1 +
 4 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/mount.h b/fs/mount.h
index 14db05d424f7..e037981d8351 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -10,6 +10,7 @@ struct mnt_namespace {
 	struct mount *	root;
 	struct list_head	list;
 	struct user_namespace	*user_ns;
+	struct ucounts		*ucounts;
 	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
 	u64 event;
diff --git a/fs/namespace.c b/fs/namespace.c
index 7bb2cda3bfef..491b8f3e4c9a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2719,9 +2719,20 @@ dput_out:
 	return retval;
 }
 
+static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
+{
+	return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
+}
+
+static void dec_mnt_namespaces(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
+}
+
 static void free_mnt_ns(struct mnt_namespace *ns)
 {
 	ns_free_inum(&ns->ns);
+	dec_mnt_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	kfree(ns);
 }
@@ -2738,14 +2749,22 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 {
 	struct mnt_namespace *new_ns;
+	struct ucounts *ucounts;
 	int ret;
 
+	ucounts = inc_mnt_namespaces(user_ns);
+	if (!ucounts)
+		return ERR_PTR(-ENFILE);
+
 	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
-	if (!new_ns)
+	if (!new_ns) {
+		dec_mnt_namespaces(ucounts);
 		return ERR_PTR(-ENOMEM);
+	}
 	ret = ns_alloc_inum(&new_ns->ns);
 	if (ret) {
 		kfree(new_ns);
+		dec_mnt_namespaces(ucounts);
 		return ERR_PTR(ret);
 	}
 	new_ns->ns.ops = &mntns_operations;
@@ -2756,6 +2775,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 	init_waitqueue_head(&new_ns->poll);
 	new_ns->event = 0;
 	new_ns->user_ns = get_user_ns(user_ns);
+	new_ns->ucounts = ucounts;
 	return new_ns;
 }
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index c6bc980b06a9..30ffe10cda18 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -30,6 +30,7 @@ enum ucount_type {
 	UCOUNT_UTS_NAMESPACES,
 	UCOUNT_IPC_NAMESPACES,
 	UCOUNT_NET_NAMESPACES,
+	UCOUNT_MNT_NAMESPACES,
 	UCOUNT_CGROUP_NAMESPACES,
 	UCOUNT_COUNTS,
 };
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 205f1a07faac..9d20d5dd298a 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -72,6 +72,7 @@ static struct ctl_table user_table[] = {
 	UCOUNT_ENTRY("max_uts_namespaces"),
 	UCOUNT_ENTRY("max_ipc_namespaces"),
 	UCOUNT_ENTRY("max_net_namespaces"),
+	UCOUNT_ENTRY("max_mnt_namespaces"),
 	UCOUNT_ENTRY("max_cgroup_namespaces"),
 	{ }
 };
-- 
cgit v1.2.3-59-g8ed1b


From df75e7748bae1c7098bfa358485389b897f71305 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 22 Sep 2016 13:08:36 -0500
Subject: userns: When the per user per user namespace limit is reached return
 ENOSPC

The current error codes returned when the per user per user
namespace limit is hit (EINVAL, EUSERS, and ENFILE) are wrong.  I
asked for advice on linux-api and it was made clear that those were
the wrong error codes, but a correct error code was not suggested.

The best general error code I have found for hitting a resource limit
is ENOSPC.  It is not perfect but as it is unambiguous it will serve
until someone comes up with a better error code.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c           | 2 +-
 ipc/namespace.c          | 2 +-
 kernel/cgroup.c          | 2 +-
 kernel/pid_namespace.c   | 2 +-
 kernel/user_namespace.c  | 2 +-
 kernel/utsname.c         | 2 +-
 net/core/net_namespace.c | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 491b8f3e4c9a..cf2cc234c8b4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2754,7 +2754,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 
 	ucounts = inc_mnt_namespaces(user_ns);
 	if (!ucounts)
-		return ERR_PTR(-ENFILE);
+		return ERR_PTR(-ENOSPC);
 
 	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
 	if (!new_ns) {
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 730914214135..fab727d9fe09 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -33,7 +33,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	struct ucounts *ucounts;
 	int err;
 
-	err = -ENFILE;
+	err = -ENOSPC;
 	ucounts = inc_ipc_namespaces(user_ns);
 	if (!ucounts)
 		goto fail;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e9e4427fec46..f1dd4b076210 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6354,7 +6354,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
 
 	ucounts = inc_cgroup_namespaces(user_ns);
 	if (!ucounts)
-		return ERR_PTR(-ENFILE);
+		return ERR_PTR(-ENOSPC);
 
 	/* It is not safe to take cgroup_mutex here */
 	spin_lock_irq(&css_set_lock);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 30a7f3351932..7542b28cc929 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -98,7 +98,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	int i;
 	int err;
 
-	err = -EINVAL;
+	err = -ENOSPC;
 	if (level > MAX_PID_NS_LEVEL)
 		goto out;
 	ucounts = inc_pid_namespaces(user_ns);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0edafe305861..f2c5ba5505f1 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -76,7 +76,7 @@ int create_user_ns(struct cred *new)
 	struct ucounts *ucounts;
 	int ret, i;
 
-	ret = -EUSERS;
+	ret = -ENOSPC;
 	if (parent_ns->level > 32)
 		goto fail;
 
diff --git a/kernel/utsname.c b/kernel/utsname.c
index f3b0bb4ac3ba..35587b76faa3 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -49,7 +49,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
 	struct ucounts *ucounts;
 	int err;
 
-	err = -ENFILE;
+	err = -ENOSPC;
 	ucounts = inc_uts_namespaces(user_ns);
 	if (!ucounts)
 		goto fail;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3e2812aeceb7..06af5d6a883c 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -370,7 +370,7 @@ struct net *copy_net_ns(unsigned long flags,
 
 	ucounts = inc_net_namespaces(user_ns);
 	if (!ucounts)
-		return ERR_PTR(-ENFILE);
+		return ERR_PTR(-ENOSPC);
 
 	net = net_alloc();
 	if (!net) {
-- 
cgit v1.2.3-59-g8ed1b


From 208904793abad8892422edad0e712f2d939c496b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 20 Apr 2016 12:01:39 -0500
Subject: devpts: Move parse_mount_options into fill_super

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/devpts/inode.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d116453b0276..f582a4be1a7a 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -395,6 +395,7 @@ static int
 devpts_fill_super(struct super_block *s, void *data, int silent)
 {
 	struct inode *inode;
+	int error;
 
 	s->s_iflags &= ~SB_I_NODEV;
 	s->s_blocksize = 1024;
@@ -403,10 +404,16 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	s->s_op = &devpts_sops;
 	s->s_time_gran = 1;
 
+	error = -ENOMEM;
 	s->s_fs_info = new_pts_fs_info(s);
 	if (!s->s_fs_info)
 		goto fail;
 
+	error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts);
+	if (error)
+		goto fail;
+
+	error = -ENOMEM;
 	inode = new_inode(s);
 	if (!inode)
 		goto fail;
@@ -424,7 +431,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	pr_err("get root dentry failed\n");
 
 fail:
-	return -ENOMEM;
+	return error;
 }
 
 /*
@@ -437,13 +444,8 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
 	int error;
-	struct pts_mount_opts opts;
 	struct super_block *s;
 
-	error = parse_mount_options(data, PARSE_MOUNT, &opts);
-	if (error)
-		return ERR_PTR(error);
-
 	s = sget(fs_type, NULL, set_anon_super, flags, NULL);
 	if (IS_ERR(s))
 		return ERR_CAST(s);
@@ -455,8 +457,6 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
 		s->s_flags |= MS_ACTIVE;
 	}
 
-	memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
-
 	error = mknod_ptmx(s);
 	if (error)
 		goto out_undo_sget;
-- 
cgit v1.2.3-59-g8ed1b


From 7dd17f713474504fa6d61d666e27b02e4a608abe Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 19 Apr 2016 17:51:04 -0500
Subject: devpts: Move the creation of /dev/pts/ptmx into fill_super

The code makes more sense here and things are just clearer.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/devpts/inode.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index f582a4be1a7a..f3277f711b25 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -425,11 +425,19 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	set_nlink(inode, 2);
 
 	s->s_root = d_make_root(inode);
-	if (s->s_root)
-		return 0;
+	if (!s->s_root) {
+		pr_err("get root dentry failed\n");
+		goto fail;
+	}
 
-	pr_err("get root dentry failed\n");
+	error = mknod_ptmx(s);
+	if (error)
+		goto fail_dput;
 
+	return 0;
+fail_dput:
+	dput(s->s_root);
+	s->s_root = NULL;
 fail:
 	return error;
 }
@@ -456,11 +464,6 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
 			goto out_undo_sget;
 		s->s_flags |= MS_ACTIVE;
 	}
-
-	error = mknod_ptmx(s);
-	if (error)
-		goto out_undo_sget;
-
 	return dget(s->s_root);
 
 out_undo_sget:
-- 
cgit v1.2.3-59-g8ed1b


From ec0a9ba6f201bbb4801344aa11c5d13c1ca27675 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 19 Apr 2016 17:52:53 -0500
Subject: devpts: Simplify devpts_mount by using mount_nodev

Now that all of the work of setting up a superblock has been moved to
devpts_fill_super simplify devpts_mount by calling mount_nodev instead
of rolling mount_nodev by hand.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/devpts/inode.c | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index f3277f711b25..5e216749bd96 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -451,24 +451,7 @@ fail:
 static struct dentry *devpts_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	int error;
-	struct super_block *s;
-
-	s = sget(fs_type, NULL, set_anon_super, flags, NULL);
-	if (IS_ERR(s))
-		return ERR_CAST(s);
-
-	if (!s->s_root) {
-		error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-		if (error)
-			goto out_undo_sget;
-		s->s_flags |= MS_ACTIVE;
-	}
-	return dget(s->s_root);
-
-out_undo_sget:
-	deactivate_locked_super(s);
-	return ERR_PTR(error);
+	return mount_nodev(fs_type, flags, data, devpts_fill_super);
 }
 
 static void devpts_kill_sb(struct super_block *sb)
-- 
cgit v1.2.3-59-g8ed1b


From 0d126a7ff77a02e88b6bf37a726abf8990226bf4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 22 Dec 2015 17:39:18 -0600
Subject: devpts: Make devpts_kill_sb safe if fsi is NULL

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/devpts/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5e216749bd96..2b0f24cb7d54 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -458,7 +458,8 @@ static void devpts_kill_sb(struct super_block *sb)
 {
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 
-	ida_destroy(&fsi->allocated_ptys);
+	if (fsi)
+		ida_destroy(&fsi->allocated_ptys);
 	kfree(fsi);
 	kill_litter_super(sb);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 985e5d856cbcfc17a6646740f2200eb625c76e89 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 20 Apr 2016 12:02:09 -0500
Subject: devpts: Remove sync_filesystems

devpts does not and never will have anything to sync,
so don't bother calling sync_filesystem on remount.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/devpts/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 2b0f24cb7d54..d08971e8eaae 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -336,7 +336,6 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
 
-	sync_filesystem(sb);
 	err = parse_mount_options(data, PARSE_REMOUNT, opts);
 
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From 93f0a88bd4ad99a515f500a09f4a489ff03073eb Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 8 Dec 2015 00:36:51 -0600
Subject: devpts: Change the owner of /dev/pts/ptmx to the mounter of /dev/pts

In 99.99% of the cases only root in a user namespace can mount /dev/pts
and in those cases the owner of /dev/pts/ptmx will remain root.root

In the oddball case where someone else has CAP_SYS_ADMIN, this code
modifies the /dev/pts mount code to use current_fsuid and current_fsgid
as the values to use when creating the /dev/pts/ptmx inode, as is done
when any other file is created.

This is a code simplification, and it allows running without a root
user entirely.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/devpts/inode.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d08971e8eaae..154cc45c19e8 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -272,13 +272,8 @@ static int mknod_ptmx(struct super_block *sb)
 	struct dentry *root = sb->s_root;
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
-	kuid_t root_uid;
-	kgid_t root_gid;
-
-	root_uid = make_kuid(current_user_ns(), 0);
-	root_gid = make_kgid(current_user_ns(), 0);
-	if (!uid_valid(root_uid) || !gid_valid(root_gid))
-		return -EINVAL;
+	kuid_t ptmx_uid = current_fsuid();
+	kgid_t ptmx_gid = current_fsgid();
 
 	inode_lock(d_inode(root));
 
@@ -309,8 +304,8 @@ static int mknod_ptmx(struct super_block *sb)
 
 	mode = S_IFCHR|opts->ptmxmode;
 	init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
-	inode->i_uid = root_uid;
-	inode->i_gid = root_gid;
+	inode->i_uid = ptmx_uid;
+	inode->i_gid = ptmx_gid;
 
 	d_add(dentry, inode);
 
-- 
cgit v1.2.3-59-g8ed1b


From bcac25a58bfc6bd79191ac5d7afb49bea96da8c9 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Tue, 6 Sep 2016 00:47:13 -0700
Subject: kernel: add a helper to get an owning user namespace for a namespace

Return -EPERM if the owning user namespace is outside of the calling
process's current user namespace.

v2: In the first version ns_get_owner returned ENOENT for init_user_ns.
    This special case was removed from this version. There is nothing
    outside of init_user_ns, so we can return EPERM.
v3: rename ns->get_owner() to ns->owner(). get_* usually means that it
    grabs a reference.

Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/namespace.c                 |  6 ++++++
 include/linux/proc_ns.h        |  1 +
 include/linux/user_namespace.h |  7 +++++++
 ipc/namespace.c                |  6 ++++++
 kernel/cgroup.c                |  6 ++++++
 kernel/pid_namespace.c         |  6 ++++++
 kernel/user_namespace.c        | 24 ++++++++++++++++++++++++
 kernel/utsname.c               |  6 ++++++
 net/core/net_namespace.c       |  6 ++++++
 9 files changed, 68 insertions(+)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 7bb2cda3bfef..fea56f310547 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3348,10 +3348,16 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
 	return 0;
 }
 
+static struct user_namespace *mntns_owner(struct ns_common *ns)
+{
+	return to_mnt_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations mntns_operations = {
 	.name		= "mnt",
 	.type		= CLONE_NEWNS,
 	.get		= mntns_get,
 	.put		= mntns_put,
 	.install	= mntns_install,
+	.owner		= mntns_owner,
 };
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index de0e7719d4c5..ca85a4348ffc 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -18,6 +18,7 @@ struct proc_ns_operations {
 	struct ns_common *(*get)(struct task_struct *task);
 	void (*put)(struct ns_common *ns);
 	int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
+	struct user_namespace *(*owner)(struct ns_common *ns);
 };
 
 extern const struct proc_ns_operations netns_operations;
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 9217169c64cb..190cf0760815 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -73,6 +73,8 @@ extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t,
 extern int proc_setgroups_show(struct seq_file *m, void *v);
 extern bool userns_may_setgroups(const struct user_namespace *ns);
 extern bool current_in_userns(const struct user_namespace *target_ns);
+
+struct ns_common *ns_get_owner(struct ns_common *ns);
 #else
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -106,6 +108,11 @@ static inline bool current_in_userns(const struct user_namespace *target_ns)
 {
 	return true;
 }
+
+static inline struct ns_common *ns_get_owner(struct ns_common *ns)
+{
+	return ERR_PTR(-EPERM);
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/ipc/namespace.c b/ipc/namespace.c
index d87e6baa1323..578d93be619d 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -165,10 +165,16 @@ static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
 	return 0;
 }
 
+static struct user_namespace *ipcns_owner(struct ns_common *ns)
+{
+	return to_ipc_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations ipcns_operations = {
 	.name		= "ipc",
 	.type		= CLONE_NEWIPC,
 	.get		= ipcns_get,
 	.put		= ipcns_put,
 	.install	= ipcns_install,
+	.owner		= ipcns_owner,
 };
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1c51b7f5221..86b0e8b16426 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6403,12 +6403,18 @@ static void cgroupns_put(struct ns_common *ns)
 	put_cgroup_ns(to_cg_ns(ns));
 }
 
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+	return to_cg_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations cgroupns_operations = {
 	.name		= "cgroup",
 	.type		= CLONE_NEWCGROUP,
 	.get		= cgroupns_get,
 	.put		= cgroupns_put,
 	.install	= cgroupns_install,
+	.owner		= cgroupns_owner,
 };
 
 static __init int cgroup_namespaces_init(void)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a65ba137fd15..c02d744225e1 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -388,12 +388,18 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
 	return 0;
 }
 
+static struct user_namespace *pidns_owner(struct ns_common *ns)
+{
+	return to_pid_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations pidns_operations = {
 	.name		= "pid",
 	.type		= CLONE_NEWPID,
 	.get		= pidns_get,
 	.put		= pidns_put,
 	.install	= pidns_install,
+	.owner		= pidns_owner,
 };
 
 static __init int pid_namespaces_init(void)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 68f594212759..0ef683a03c20 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1004,12 +1004,36 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
 	return commit_creds(cred);
 }
 
+struct ns_common *ns_get_owner(struct ns_common *ns)
+{
+	struct user_namespace *my_user_ns = current_user_ns();
+	struct user_namespace *owner, *p;
+
+	/* See if the owner is in the current user namespace */
+	owner = p = ns->ops->owner(ns);
+	for (;;) {
+		if (!p)
+			return ERR_PTR(-EPERM);
+		if (p == my_user_ns)
+			break;
+		p = p->parent;
+	}
+
+	return &get_user_ns(owner)->ns;
+}
+
+static struct user_namespace *userns_owner(struct ns_common *ns)
+{
+	return to_user_ns(ns)->parent;
+}
+
 const struct proc_ns_operations userns_operations = {
 	.name		= "user",
 	.type		= CLONE_NEWUSER,
 	.get		= userns_get,
 	.put		= userns_put,
 	.install	= userns_install,
+	.owner		= userns_owner,
 };
 
 static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 831ea7108232..e1211a8a5c18 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -130,10 +130,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
 	return 0;
 }
 
+static struct user_namespace *utsns_owner(struct ns_common *ns)
+{
+	return to_uts_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations utsns_operations = {
 	.name		= "uts",
 	.type		= CLONE_NEWUTS,
 	.get		= utsns_get,
 	.put		= utsns_put,
 	.install	= utsns_install,
+	.owner		= utsns_owner,
 };
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..861efa34f08c 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -996,11 +996,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
 	return 0;
 }
 
+static struct user_namespace *netns_owner(struct ns_common *ns)
+{
+	return to_net_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations netns_operations = {
 	.name		= "net",
 	.type		= CLONE_NEWNET,
 	.get		= netns_get,
 	.put		= netns_put,
 	.install	= netns_install,
+	.owner		= netns_owner,
 };
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 6786741dbf99e44fb0c0ed85a37582b8a26f1c3b Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Tue, 6 Sep 2016 00:47:14 -0700
Subject: nsfs: add ioctl to get an owning user namespace for ns file
 descriptor

Each namespace has an owning user namespace, and currently there is no
way to discover these relationships.

Understanding namespace relationships allows one to answer the question:
what capability does process X have to perform operations on a resource
governed by namespace Y?

After a long discussion, Eric W. Biederman proposed to use ioctl-s for
this purpose.

The NS_GET_USERNS ioctl returns a file descriptor to an owning user
namespace.
It returns EPERM if the target namespace is outside of the current user
namespace.

v2: rename parent to relative

v3: Add a missing mntput when returning -EAGAIN --EWB

Acked-by: Serge Hallyn <serge@hallyn.com>
Link: https://lkml.org/lkml/2016/7/6/158
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/nsfs.c                 | 96 ++++++++++++++++++++++++++++++++++++++++-------
 include/uapi/linux/nsfs.h | 11 ++++++
 2 files changed, 94 insertions(+), 13 deletions(-)
 create mode 100644 include/uapi/linux/nsfs.h

(limited to 'fs')

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8f20d6016e20..3887da470f7e 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -5,11 +5,16 @@
 #include <linux/magic.h>
 #include <linux/ktime.h>
 #include <linux/seq_file.h>
+#include <linux/user_namespace.h>
+#include <linux/nsfs.h>
 
 static struct vfsmount *nsfs_mnt;
 
+static long ns_ioctl(struct file *filp, unsigned int ioctl,
+			unsigned long arg);
 static const struct file_operations ns_file_operations = {
 	.llseek		= no_llseek,
+	.unlocked_ioctl = ns_ioctl,
 };
 
 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -44,22 +49,14 @@ static void nsfs_evict(struct inode *inode)
 	ns->ops->put(ns);
 }
 
-void *ns_get_path(struct path *path, struct task_struct *task,
-			const struct proc_ns_operations *ns_ops)
+static void *__ns_get_path(struct path *path, struct ns_common *ns)
 {
 	struct vfsmount *mnt = mntget(nsfs_mnt);
 	struct qstr qname = { .name = "", };
 	struct dentry *dentry;
 	struct inode *inode;
-	struct ns_common *ns;
 	unsigned long d;
 
-again:
-	ns = ns_ops->get(task);
-	if (!ns) {
-		mntput(mnt);
-		return ERR_PTR(-ENOENT);
-	}
 	rcu_read_lock();
 	d = atomic_long_read(&ns->stashed);
 	if (!d)
@@ -68,7 +65,7 @@ again:
 	if (!lockref_get_not_dead(&dentry->d_lockref))
 		goto slow;
 	rcu_read_unlock();
-	ns_ops->put(ns);
+	ns->ops->put(ns);
 got_it:
 	path->mnt = mnt;
 	path->dentry = dentry;
@@ -77,7 +74,7 @@ slow:
 	rcu_read_unlock();
 	inode = new_inode_pseudo(mnt->mnt_sb);
 	if (!inode) {
-		ns_ops->put(ns);
+		ns->ops->put(ns);
 		mntput(mnt);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -95,17 +92,90 @@ slow:
 		return ERR_PTR(-ENOMEM);
 	}
 	d_instantiate(dentry, inode);
-	dentry->d_fsdata = (void *)ns_ops;
+	dentry->d_fsdata = (void *)ns->ops;
 	d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
 	if (d) {
 		d_delete(dentry);	/* make sure ->d_prune() does nothing */
 		dput(dentry);
+		mntput(mnt);
 		cpu_relax();
-		goto again;
+		return ERR_PTR(-EAGAIN);
 	}
 	goto got_it;
 }
 
+void *ns_get_path(struct path *path, struct task_struct *task,
+			const struct proc_ns_operations *ns_ops)
+{
+	struct ns_common *ns;
+	void *ret;
+
+again:
+	ns = ns_ops->get(task);
+	if (!ns)
+		return ERR_PTR(-ENOENT);
+
+	ret = __ns_get_path(path, ns);
+	if (IS_ERR(ret) && PTR_ERR(ret) == -EAGAIN)
+		goto again;
+	return ret;
+}
+
+static int open_related_ns(struct ns_common *ns,
+		   struct ns_common *(*get_ns)(struct ns_common *ns))
+{
+	struct path path = {};
+	struct file *f;
+	void *err;
+	int fd;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	while (1) {
+		struct ns_common *relative;
+
+		relative = get_ns(ns);
+		if (IS_ERR(relative)) {
+			put_unused_fd(fd);
+			return PTR_ERR(relative);
+		}
+
+		err = __ns_get_path(&path, relative);
+		if (IS_ERR(err) && PTR_ERR(err) == -EAGAIN)
+			continue;
+		break;
+	}
+	if (IS_ERR(err)) {
+		put_unused_fd(fd);
+		return PTR_ERR(err);
+	}
+
+	f = dentry_open(&path, O_RDONLY, current_cred());
+	path_put(&path);
+	if (IS_ERR(f)) {
+		put_unused_fd(fd);
+		fd = PTR_ERR(f);
+	} else
+		fd_install(fd, f);
+
+	return fd;
+}
+
+static long ns_ioctl(struct file *filp, unsigned int ioctl,
+			unsigned long arg)
+{
+	struct ns_common *ns = get_proc_ns(file_inode(filp));
+
+	switch (ioctl) {
+	case NS_GET_USERNS:
+		return open_related_ns(ns, ns_get_owner);
+	default:
+		return -ENOTTY;
+	}
+}
+
 int ns_get_name(char *buf, size_t size, struct task_struct *task,
 			const struct proc_ns_operations *ns_ops)
 {
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
new file mode 100644
index 000000000000..5cacd5c1b5d7
--- /dev/null
+++ b/include/uapi/linux/nsfs.h
@@ -0,0 +1,11 @@
+#ifndef __LINUX_NSFS_H
+#define __LINUX_NSFS_H
+
+#include <linux/ioctl.h>
+
+#define NSIO	0xb7
+
+/* Returns a file descriptor that refers to an owning user namespace */
+#define NS_GET_USERNS	_IO(NSIO, 0x1)
+
+#endif /* __LINUX_NSFS_H */
-- 
cgit v1.2.3-59-g8ed1b


From a7306ed8d94af729ecef8b6e37506a1c6fc14788 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Tue, 6 Sep 2016 00:47:15 -0700
Subject: nsfs: add ioctl to get a parent namespace

Pid and user namespaces are hierarchical. There is no way to discover
parent-child relationships.

In the future we will use this interface to dump and restore nested
namespaces.

Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/nsfs.c                 |  4 ++++
 include/linux/proc_ns.h   |  1 +
 include/uapi/linux/nsfs.h |  2 ++
 kernel/pid_namespace.c    | 19 +++++++++++++++++++
 kernel/user_namespace.c   |  1 +
 5 files changed, 27 insertions(+)

(limited to 'fs')

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 3887da470f7e..fb7b397a1297 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -171,6 +171,10 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
 	switch (ioctl) {
 	case NS_GET_USERNS:
 		return open_related_ns(ns, ns_get_owner);
+	case NS_GET_PARENT:
+		if (!ns->ops->get_parent)
+			return -EINVAL;
+		return open_related_ns(ns, ns->ops->get_parent);
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index ca85a4348ffc..12cb8bd81d2d 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -19,6 +19,7 @@ struct proc_ns_operations {
 	void (*put)(struct ns_common *ns);
 	int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
 	struct user_namespace *(*owner)(struct ns_common *ns);
+	struct ns_common *(*get_parent)(struct ns_common *ns);
 };
 
 extern const struct proc_ns_operations netns_operations;
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 5cacd5c1b5d7..3af617230d1b 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -7,5 +7,7 @@
 
 /* Returns a file descriptor that refers to an owning user namespace */
 #define NS_GET_USERNS	_IO(NSIO, 0x1)
+/* Returns a file descriptor that refers to a parent namespace */
+#define NS_GET_PARENT	_IO(NSIO, 0x2)
 
 #endif /* __LINUX_NSFS_H */
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index c02d744225e1..4fa2d56a936c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -388,6 +388,24 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
 	return 0;
 }
 
+static struct ns_common *pidns_get_parent(struct ns_common *ns)
+{
+	struct pid_namespace *active = task_active_pid_ns(current);
+	struct pid_namespace *pid_ns, *p;
+
+	/* See if the parent is in the current namespace */
+	pid_ns = p = to_pid_ns(ns)->parent;
+	for (;;) {
+		if (!p)
+			return ERR_PTR(-EPERM);
+		if (p == active)
+			break;
+		p = p->parent;
+	}
+
+	return &get_pid_ns(pid_ns)->ns;
+}
+
 static struct user_namespace *pidns_owner(struct ns_common *ns)
 {
 	return to_pid_ns(ns)->user_ns;
@@ -400,6 +418,7 @@ const struct proc_ns_operations pidns_operations = {
 	.put		= pidns_put,
 	.install	= pidns_install,
 	.owner		= pidns_owner,
+	.get_parent	= pidns_get_parent,
 };
 
 static __init int pid_namespaces_init(void)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0ef683a03c20..a58a219b99c6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1034,6 +1034,7 @@ const struct proc_ns_operations userns_operations = {
 	.put		= userns_put,
 	.install	= userns_install,
 	.owner		= userns_owner,
+	.get_parent	= ns_get_owner,
 };
 
 static __init int user_namespaces_init(void)
-- 
cgit v1.2.3-59-g8ed1b


From 213b067ce314f9d7e72307c7036ba3cd285b80da Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 22 Sep 2016 19:39:20 -0500
Subject: nsfs: Simplify __ns_get_path

Move mntget from the very beginning of __ns_get_path to
the success path of __ns_get_path, and remove the mntput
calls from the error paths.

This removes the possibility that there will be a mntget/mntput
pair if __ns_get_path has to retry, and generally simplifies the code.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/nsfs.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nsfs.c b/fs/nsfs.c
index fb7b397a1297..30bb10034120 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -51,7 +51,7 @@ static void nsfs_evict(struct inode *inode)
 
 static void *__ns_get_path(struct path *path, struct ns_common *ns)
 {
-	struct vfsmount *mnt = mntget(nsfs_mnt);
+	struct vfsmount *mnt = nsfs_mnt;
 	struct qstr qname = { .name = "", };
 	struct dentry *dentry;
 	struct inode *inode;
@@ -67,7 +67,7 @@ static void *__ns_get_path(struct path *path, struct ns_common *ns)
 	rcu_read_unlock();
 	ns->ops->put(ns);
 got_it:
-	path->mnt = mnt;
+	path->mnt = mntget(mnt);
 	path->dentry = dentry;
 	return NULL;
 slow:
@@ -75,7 +75,6 @@ slow:
 	inode = new_inode_pseudo(mnt->mnt_sb);
 	if (!inode) {
 		ns->ops->put(ns);
-		mntput(mnt);
 		return ERR_PTR(-ENOMEM);
 	}
 	inode->i_ino = ns->inum;
@@ -88,7 +87,6 @@ slow:
 	dentry = d_alloc_pseudo(mnt->mnt_sb, &qname);
 	if (!dentry) {
 		iput(inode);
-		mntput(mnt);
 		return ERR_PTR(-ENOMEM);
 	}
 	d_instantiate(dentry, inode);
@@ -97,7 +95,6 @@ slow:
 	if (d) {
 		d_delete(dentry);	/* make sure ->d_prune() does nothing */
 		dput(dentry);
-		mntput(mnt);
 		cpu_relax();
 		return ERR_PTR(-EAGAIN);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From d29216842a85c7970c536108e093963f02714498 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 28 Sep 2016 00:27:17 -0500
Subject: mnt: Add a per mount namespace limit on the number of mounts

CAI Qian <caiqian@redhat.com> pointed out that the semantics
of shared subtrees make it possible to create an exponentially
increasing number of mounts in a mount namespace.

    mkdir /tmp/1 /tmp/2
    mount --make-rshared /
    for i in $(seq 1 20) ; do mount --bind /tmp/1 /tmp/2 ; done

Will create 2^20 or 1048576 mounts, which is a practical problem
as some people have managed to hit this by accident.

As such CVE-2016-6213 was assigned.

Ian Kent <raven@themaw.net> described the situation for autofs users
as follows:

> The number of mounts for direct mount maps is usually not very large because of
> the way they are implemented, large direct mount maps can have performance
> problems. There can be anywhere from a few (likely case a few hundred) to less
> than 10000, plus mounts that have been triggered and not yet expired.
>
> Indirect mounts have one autofs mount at the root plus the number of mounts that
> have been triggered and not yet expired.
>
> The number of autofs indirect map entries can range from a few to the common
> case of several thousand and in rare cases up to between 30000 and 50000. I've
> not heard of people with maps larger than 50000 entries.
>
> The larger the number of map entries the greater the possibility for a large
> number of active mounts so it's not hard to expect cases of a 1000 or somewhat
> more active mounts.

So I am setting the default number of mounts allowed per mount
namespace at 100,000.  This is more than enough for any use case I
know of, but small enough to quickly stop an exponential increase
in mounts.  Which should be perfect to catch misconfigurations and
malfunctioning programs.

For anyone who needs a higher limit this can be changed by writing
to the new /proc/sys/fs/mount-max sysctl.

Tested-by: CAI Qian <caiqian@redhat.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 Documentation/sysctl/fs.txt |  7 +++++++
 fs/mount.h                  |  2 ++
 fs/namespace.c              | 49 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/pnode.c                  |  2 +-
 fs/pnode.h                  |  1 +
 include/linux/mount.h       |  2 ++
 kernel/sysctl.c             |  9 +++++++++
 7 files changed, 70 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 302b5ed616a6..35e17f748ca7 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -265,6 +265,13 @@ aio-nr can grow to.
 
 ==============================================================
 
+mount-max:
+
+This denotes the maximum number of mounts that may exist
+in a mount namespace.
+
+==============================================================
+
 
 2. /proc/sys/fs/binfmt_misc
 ----------------------------------------------------------
diff --git a/fs/mount.h b/fs/mount.h
index e037981d8351..d2e25d7b64b3 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -14,6 +14,8 @@ struct mnt_namespace {
 	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
 	u64 event;
+	unsigned int		mounts; /* # of mounts in the namespace */
+	unsigned int		pending_mounts;
 };
 
 struct mnt_pcp {
diff --git a/fs/namespace.c b/fs/namespace.c
index 8a0e90eb81d3..db1b5a38864e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,9 @@
 #include "pnode.h"
 #include "internal.h"
 
+/* Maximum number of mounts in a mount namespace */
+unsigned int sysctl_mount_max __read_mostly = 100000;
+
 static unsigned int m_hash_mask __read_mostly;
 static unsigned int m_hash_shift __read_mostly;
 static unsigned int mp_hash_mask __read_mostly;
@@ -899,6 +902,9 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
 
 	list_splice(&head, n->list.prev);
 
+	n->mounts += n->pending_mounts;
+	n->pending_mounts = 0;
+
 	attach_shadowed(mnt, parent, shadows);
 	touch_mnt_namespace(n);
 }
@@ -1419,11 +1425,16 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 		propagate_umount(&tmp_list);
 
 	while (!list_empty(&tmp_list)) {
+		struct mnt_namespace *ns;
 		bool disconnect;
 		p = list_first_entry(&tmp_list, struct mount, mnt_list);
 		list_del_init(&p->mnt_expire);
 		list_del_init(&p->mnt_list);
-		__touch_mnt_namespace(p->mnt_ns);
+		ns = p->mnt_ns;
+		if (ns) {
+			ns->mounts--;
+			__touch_mnt_namespace(ns);
+		}
 		p->mnt_ns = NULL;
 		if (how & UMOUNT_SYNC)
 			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
@@ -1840,6 +1851,28 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
 	return 0;
 }
 
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
+{
+	unsigned int max = READ_ONCE(sysctl_mount_max);
+	unsigned int mounts = 0, old, pending, sum;
+	struct mount *p;
+
+	for (p = mnt; p; p = next_mnt(p, mnt))
+		mounts++;
+
+	old = ns->mounts;
+	pending = ns->pending_mounts;
+	sum = old + pending;
+	if ((old > sum) ||
+	    (pending > sum) ||
+	    (max < sum) ||
+	    (mounts > (max - sum)))
+		return -ENOSPC;
+
+	ns->pending_mounts = pending + mounts;
+	return 0;
+}
+
 /*
  *  @source_mnt : mount tree to be attached
  *  @nd         : place the mount tree @source_mnt is attached
@@ -1909,10 +1942,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 			struct path *parent_path)
 {
 	HLIST_HEAD(tree_list);
+	struct mnt_namespace *ns = dest_mnt->mnt_ns;
 	struct mount *child, *p;
 	struct hlist_node *n;
 	int err;
 
+	/* Is there space to add these mounts to the mount namespace? */
+	if (!parent_path) {
+		err = count_mounts(ns, source_mnt);
+		if (err)
+			goto out;
+	}
+
 	if (IS_MNT_SHARED(dest_mnt)) {
 		err = invent_group_ids(source_mnt, true);
 		if (err)
@@ -1949,11 +1990,13 @@ static int attach_recursive_mnt(struct mount *source_mnt,
  out_cleanup_ids:
 	while (!hlist_empty(&tree_list)) {
 		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+		child->mnt_parent->mnt_ns->pending_mounts = 0;
 		umount_tree(child, UMOUNT_SYNC);
 	}
 	unlock_mount_hash();
 	cleanup_group_ids(source_mnt, NULL);
  out:
+	ns->pending_mounts = 0;
 	return err;
 }
 
@@ -2776,6 +2819,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 	new_ns->event = 0;
 	new_ns->user_ns = get_user_ns(user_ns);
 	new_ns->ucounts = ucounts;
+	new_ns->mounts = 0;
+	new_ns->pending_mounts = 0;
 	return new_ns;
 }
 
@@ -2825,6 +2870,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	q = new;
 	while (p) {
 		q->mnt_ns = new_ns;
+		new_ns->mounts++;
 		if (new_fs) {
 			if (&p->mnt == new_fs->root.mnt) {
 				new_fs->root.mnt = mntget(&q->mnt);
@@ -2863,6 +2909,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
 		struct mount *mnt = real_mount(m);
 		mnt->mnt_ns = new_ns;
 		new_ns->root = mnt;
+		new_ns->mounts++;
 		list_add(&mnt->mnt_list, &new_ns->list);
 	} else {
 		mntput(m);
diff --git a/fs/pnode.c b/fs/pnode.c
index 99899705b105..234a9ac49958 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -259,7 +259,7 @@ static int propagate_one(struct mount *m)
 		read_sequnlock_excl(&mount_lock);
 	}
 	hlist_add_head(&child->mnt_hash, list);
-	return 0;
+	return count_mounts(m->mnt_ns, child);
 }
 
 /*
diff --git a/fs/pnode.h b/fs/pnode.h
index 0fcdbe7ca648..550f5a8b4fcf 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -52,4 +52,5 @@ void mnt_set_mountpoint(struct mount *, struct mountpoint *,
 struct mount *copy_tree(struct mount *, struct dentry *, int);
 bool is_path_reachable(struct mount *, struct dentry *,
 			 const struct path *root);
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt);
 #endif /* _LINUX_PNODE_H */
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 54a594d49733..1172cce949a4 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -96,4 +96,6 @@ extern void mark_mounts_for_expiry(struct list_head *mounts);
 
 extern dev_t name_to_dev_t(const char *name);
 
+extern unsigned int sysctl_mount_max;
+
 #endif /* _LINUX_MOUNT_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b43d0b27c1fe..03f18cc15697 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -65,6 +65,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/kexec.h>
 #include <linux/bpf.h>
+#include <linux/mount.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -1838,6 +1839,14 @@ static struct ctl_table fs_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
+	{
+		.procname	= "mount-max",
+		.data		= &sysctl_mount_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+	},
 	{ }
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 069d5ac9ae0d271903cc4607890616418118379a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 30 Sep 2016 11:28:05 -0500
Subject: autofs:  Fix automounts by using current_real_cred()->uid

Seth Forshee reports that in 4.8-rcN some automounts are failing
because the uid reported for the task requesting the automount changed.

The relevant call path is:
follow_automount()
    ->d_automount
    autofs4_d_automount
       autofs4_mount_wait
           autofs4_wait

In autofs4_wait wq_uid and wq_gid are set to current_uid() and
current_gid() respectively.  With follow_automount now overriding creds,
the uid that we export to userspace changes, and that breaks existing
setups.

To remove the regression set wq_uid and wq_gid from
current_real_cred()->uid and current_real_cred()->gid respectively.
This restores the previous behavior, as current->real_cred is identical
to current->cred except when override creds are used.

Cc: stable@vger.kernel.org
Fixes: aeaa4a79ff6a ("fs: Call d_automount with the filesystems creds")
Reported-by: Seth Forshee <seth.forshee@canonical.com>
Tested-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/autofs4/waitq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 431fd7ee3488..e44271dfceb6 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -431,8 +431,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 		memcpy(&wq->name, &qstr, sizeof(struct qstr));
 		wq->dev = autofs4_get_dev(sbi);
 		wq->ino = autofs4_get_ino(sbi);
-		wq->uid = current_uid();
-		wq->gid = current_gid();
+		wq->uid = current_real_cred()->uid;
+		wq->gid = current_real_cred()->gid;
 		wq->pid = pid;
 		wq->tgid = tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
-- 
cgit v1.2.3-59-g8ed1b