From 537f7ccb396804c6d0057b93ba8eb104ba44f851 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 8 Aug 2016 14:37:37 -0500 Subject: mntns: Add a limit on the number of mount namespaces. v2: Fixed the very obvious lack of setting ucounts on struct mnt_ns reported by Andrei Vagin, and the kbuild test report. Reported-by: Andrei Vagin Acked-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 7bb2cda3bfef..491b8f3e4c9a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2719,9 +2719,20 @@ dput_out: return retval; } +static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); +} + +static void dec_mnt_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); +} + static void free_mnt_ns(struct mnt_namespace *ns) { ns_free_inum(&ns->ns); + dec_mnt_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kfree(ns); } @@ -2738,14 +2749,22 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; + struct ucounts *ucounts; int ret; + ucounts = inc_mnt_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENFILE); + new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); - if (!new_ns) + if (!new_ns) { + dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); + } ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); + dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } new_ns->ns.ops = &mntns_operations; @@ -2756,6 +2775,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) init_waitqueue_head(&new_ns->poll); new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; return new_ns; } -- cgit v1.2.3-59-g8ed1b From df75e7748bae1c7098bfa358485389b897f71305 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 22 Sep 2016 13:08:36 -0500 Subject: userns: When the per user per user namespace limit is reached return ENOSPC The current error codes returned when a the per user per user namespace limit are hit (EINVAL, EUSERS, and ENFILE) are wrong. I asked for advice on linux-api and it we made clear that those were the wrong error code, but a correct effor code was not suggested. The best general error code I have found for hitting a resource limit is ENOSPC. It is not perfect but as it is unambiguous it will serve until someone comes up with a better error code. Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 2 +- ipc/namespace.c | 2 +- kernel/cgroup.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 491b8f3e4c9a..cf2cc234c8b4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2754,7 +2754,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) ucounts = inc_mnt_namespaces(user_ns); if (!ucounts) - return ERR_PTR(-ENFILE); + return ERR_PTR(-ENOSPC); new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) { diff --git a/ipc/namespace.c b/ipc/namespace.c index 730914214135..fab727d9fe09 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -33,7 +33,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, struct ucounts *ucounts; int err; - err = -ENFILE; + err = -ENOSPC; ucounts = inc_ipc_namespaces(user_ns); if (!ucounts) goto fail; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e9e4427fec46..f1dd4b076210 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6354,7 +6354,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, ucounts = inc_cgroup_namespaces(user_ns); if (!ucounts) - return ERR_PTR(-ENFILE); + return ERR_PTR(-ENOSPC); /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 30a7f3351932..7542b28cc929 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -98,7 +98,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns int i; int err; - err = -EINVAL; + err = -ENOSPC; if (level > MAX_PID_NS_LEVEL) goto out; ucounts = inc_pid_namespaces(user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0edafe305861..f2c5ba5505f1 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -76,7 +76,7 @@ int create_user_ns(struct cred *new) struct ucounts *ucounts; int ret, i; - ret = -EUSERS; + ret = -ENOSPC; if (parent_ns->level > 32) goto fail; diff --git a/kernel/utsname.c b/kernel/utsname.c index f3b0bb4ac3ba..35587b76faa3 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -49,7 +49,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct ucounts *ucounts; int err; - err = -ENFILE; + err = -ENOSPC; ucounts = inc_uts_namespaces(user_ns); if (!ucounts) goto fail; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 3e2812aeceb7..06af5d6a883c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -370,7 +370,7 @@ struct net *copy_net_ns(unsigned long flags, ucounts = inc_net_namespaces(user_ns); if (!ucounts) - return ERR_PTR(-ENFILE); + return ERR_PTR(-ENOSPC); net = net_alloc(); if (!net) { -- cgit v1.2.3-59-g8ed1b From bcac25a58bfc6bd79191ac5d7afb49bea96da8c9 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 6 Sep 2016 00:47:13 -0700 Subject: kernel: add a helper to get an owning user namespace for a namespace Return -EPERM if an owning user namespace is outside of a process current user namespace. v2: In a first version ns_get_owner returned ENOENT for init_user_ns. This special cases was removed from this version. There is nothing outside of init_user_ns, so we can return EPERM. v3: rename ns->get_owner() to ns->owner(). get_* usually means that it grabs a reference. Acked-by: Serge Hallyn Signed-off-by: Andrei Vagin Signed-off-by: Eric W. Biederman --- fs/namespace.c | 6 ++++++ include/linux/proc_ns.h | 1 + include/linux/user_namespace.h | 7 +++++++ ipc/namespace.c | 6 ++++++ kernel/cgroup.c | 6 ++++++ kernel/pid_namespace.c | 6 ++++++ kernel/user_namespace.c | 24 ++++++++++++++++++++++++ kernel/utsname.c | 6 ++++++ net/core/net_namespace.c | 6 ++++++ 9 files changed, 68 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 7bb2cda3bfef..fea56f310547 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3348,10 +3348,16 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct user_namespace *mntns_owner(struct ns_common *ns) +{ + return to_mnt_ns(ns)->user_ns; +} + const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, + .owner = mntns_owner, }; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index de0e7719d4c5..ca85a4348ffc 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -18,6 +18,7 @@ struct proc_ns_operations { struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); + struct user_namespace *(*owner)(struct ns_common *ns); }; extern const struct proc_ns_operations netns_operations; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 9217169c64cb..190cf0760815 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -73,6 +73,8 @@ extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool current_in_userns(const struct user_namespace *target_ns); + +struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) @@ -106,6 +108,11 @@ static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } + +static inline struct ns_common *ns_get_owner(struct ns_common *ns) +{ + return ERR_PTR(-EPERM); +} #endif #endif /* _LINUX_USER_H */ diff --git a/ipc/namespace.c b/ipc/namespace.c index d87e6baa1323..578d93be619d 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -165,10 +165,16 @@ static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new) return 0; } +static struct user_namespace *ipcns_owner(struct ns_common *ns) +{ + return to_ipc_ns(ns)->user_ns; +} + const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, + .owner = ipcns_owner, }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..86b0e8b16426 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6403,12 +6403,18 @@ static void cgroupns_put(struct ns_common *ns) put_cgroup_ns(to_cg_ns(ns)); } +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, + .owner = cgroupns_owner, }; static __init int cgroup_namespaces_init(void) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba137fd15..c02d744225e1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -388,12 +388,18 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct user_namespace *pidns_owner(struct ns_common *ns) +{ + return to_pid_ns(ns)->user_ns; +} + const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, + .owner = pidns_owner, }; static __init int pid_namespaces_init(void) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 68f594212759..0ef683a03c20 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1004,12 +1004,36 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) return commit_creds(cred); } +struct ns_common *ns_get_owner(struct ns_common *ns) +{ + struct user_namespace *my_user_ns = current_user_ns(); + struct user_namespace *owner, *p; + + /* See if the owner is in the current user namespace */ + owner = p = ns->ops->owner(ns); + for (;;) { + if (!p) + return ERR_PTR(-EPERM); + if (p == my_user_ns) + break; + p = p->parent; + } + + return &get_user_ns(owner)->ns; +} + +static struct user_namespace *userns_owner(struct ns_common *ns) +{ + return to_user_ns(ns)->parent; +} + const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, + .owner = userns_owner, }; static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index 831ea7108232..e1211a8a5c18 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -130,10 +130,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) return 0; } +static struct user_namespace *utsns_owner(struct ns_common *ns) +{ + return to_uts_ns(ns)->user_ns; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .owner = utsns_owner, }; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2c2eb1b629b1..861efa34f08c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -996,11 +996,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct user_namespace *netns_owner(struct ns_common *ns) +{ + return to_net_ns(ns)->user_ns; +} + const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, + .owner = netns_owner, }; #endif -- cgit v1.2.3-59-g8ed1b From d29216842a85c7970c536108e093963f02714498 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 28 Sep 2016 00:27:17 -0500 Subject: mnt: Add a per mount namespace limit on the number of mounts CAI Qian pointed out that the semantics of shared subtrees make it possible to create an exponentially increasing number of mounts in a mount namespace. mkdir /tmp/1 /tmp/2 mount --make-rshared / for i in $(seq 1 20) ; do mount --bind /tmp/1 /tmp/2 ; done Will create create 2^20 or 1048576 mounts, which is a practical problem as some people have managed to hit this by accident. As such CVE-2016-6213 was assigned. Ian Kent described the situation for autofs users as follows: > The number of mounts for direct mount maps is usually not very large because of > the way they are implemented, large direct mount maps can have performance > problems. There can be anywhere from a few (likely case a few hundred) to less > than 10000, plus mounts that have been triggered and not yet expired. > > Indirect mounts have one autofs mount at the root plus the number of mounts that > have been triggered and not yet expired. > > The number of autofs indirect map entries can range from a few to the common > case of several thousand and in rare cases up to between 30000 and 50000. I've > not heard of people with maps larger than 50000 entries. > > The larger the number of map entries the greater the possibility for a large > number of active mounts so it's not hard to expect cases of a 1000 or somewhat > more active mounts. So I am setting the default number of mounts allowed per mount namespace at 100,000. This is more than enough for any use case I know of, but small enough to quickly stop an exponential increase in mounts. Which should be perfect to catch misconfigurations and malfunctioning programs. For anyone who needs a higher limit this can be changed by writing to the new /proc/sys/fs/mount-max sysctl. Tested-by: CAI Qian Signed-off-by: "Eric W. Biederman" --- Documentation/sysctl/fs.txt | 7 +++++++ fs/mount.h | 2 ++ fs/namespace.c | 49 ++++++++++++++++++++++++++++++++++++++++++++- fs/pnode.c | 2 +- fs/pnode.h | 1 + include/linux/mount.h | 2 ++ kernel/sysctl.c | 9 +++++++++ 7 files changed, 70 insertions(+), 2 deletions(-) (limited to 'fs/namespace.c') diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 302b5ed616a6..35e17f748ca7 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -265,6 +265,13 @@ aio-nr can grow to. ============================================================== +mount-max: + +This denotes the maximum number of mounts that may exist +in a mount namespace. + +============================================================== + 2. /proc/sys/fs/binfmt_misc ---------------------------------------------------------- diff --git a/fs/mount.h b/fs/mount.h index e037981d8351..d2e25d7b64b3 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -14,6 +14,8 @@ struct mnt_namespace { u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; u64 event; + unsigned int mounts; /* # of mounts in the namespace */ + unsigned int pending_mounts; }; struct mnt_pcp { diff --git a/fs/namespace.c b/fs/namespace.c index 8a0e90eb81d3..db1b5a38864e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,9 @@ #include "pnode.h" #include "internal.h" +/* Maximum number of mounts in a mount namespace */ +unsigned int sysctl_mount_max __read_mostly = 100000; + static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; static unsigned int mp_hash_mask __read_mostly; @@ -899,6 +902,9 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); + n->mounts += n->pending_mounts; + n->pending_mounts = 0; + attach_shadowed(mnt, parent, shadows); touch_mnt_namespace(n); } @@ -1419,11 +1425,16 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) propagate_umount(&tmp_list); while (!list_empty(&tmp_list)) { + struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); - __touch_mnt_namespace(p->mnt_ns); + ns = p->mnt_ns; + if (ns) { + ns->mounts--; + __touch_mnt_namespace(ns); + } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; @@ -1840,6 +1851,28 @@ static int invent_group_ids(struct mount *mnt, bool recurse) return 0; } +int count_mounts(struct mnt_namespace *ns, struct mount *mnt) +{ + unsigned int max = READ_ONCE(sysctl_mount_max); + unsigned int mounts = 0, old, pending, sum; + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) + mounts++; + + old = ns->mounts; + pending = ns->pending_mounts; + sum = old + pending; + if ((old > sum) || + (pending > sum) || + (max < sum) || + (mounts > (max - sum))) + return -ENOSPC; + + ns->pending_mounts = pending + mounts; + return 0; +} + /* * @source_mnt : mount tree to be attached * @nd : place the mount tree @source_mnt is attached @@ -1909,10 +1942,18 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct path *parent_path) { HLIST_HEAD(tree_list); + struct mnt_namespace *ns = dest_mnt->mnt_ns; struct mount *child, *p; struct hlist_node *n; int err; + /* Is there space to add these mounts to the mount namespace? */ + if (!parent_path) { + err = count_mounts(ns, source_mnt); + if (err) + goto out; + } + if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) @@ -1949,11 +1990,13 @@ static int attach_recursive_mnt(struct mount *source_mnt, out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); + child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: + ns->pending_mounts = 0; return err; } @@ -2776,6 +2819,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; + new_ns->mounts = 0; + new_ns->pending_mounts = 0; return new_ns; } @@ -2825,6 +2870,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, q = new; while (p) { q->mnt_ns = new_ns; + new_ns->mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); @@ -2863,6 +2909,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; new_ns->root = mnt; + new_ns->mounts++; list_add(&mnt->mnt_list, &new_ns->list); } else { mntput(m); diff --git a/fs/pnode.c b/fs/pnode.c index 99899705b105..234a9ac49958 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -259,7 +259,7 @@ static int propagate_one(struct mount *m) read_sequnlock_excl(&mount_lock); } hlist_add_head(&child->mnt_hash, list); - return 0; + return count_mounts(m->mnt_ns, child); } /* diff --git a/fs/pnode.h b/fs/pnode.h index 0fcdbe7ca648..550f5a8b4fcf 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -52,4 +52,5 @@ void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); +int count_mounts(struct mnt_namespace *ns, struct mount *mnt); #endif /* _LINUX_PNODE_H */ diff --git a/include/linux/mount.h b/include/linux/mount.h index 54a594d49733..1172cce949a4 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -96,4 +96,6 @@ extern void mark_mounts_for_expiry(struct list_head *mounts); extern dev_t name_to_dev_t(const char *name); +extern unsigned int sysctl_mount_max; + #endif /* _LINUX_MOUNT_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b43d0b27c1fe..03f18cc15697 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -1838,6 +1839,14 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; -- cgit v1.2.3-59-g8ed1b