From 17cf22c33e1f1b5e435469c84e43872579497653 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 14:51:53 -0800 Subject: pidns: Use task_active_pid_ns where appropriate The expressions tsk->nsproxy->pid_ns and task_active_pid_ns aka ns_of_pid(task_pid(tsk)) should have the same number of cache line misses with the practical difference that ns_of_pid(task_pid(tsk)) is released later in a processes life. Furthermore by using task_active_pid_ns it becomes trivial to write an unshare implementation for the the pid namespace. So I have used task_active_pid_ns everywhere I can. In fork since the pid has not yet been attached to the process I use ns_of_pid, to achieve the same effect. Signed-off-by: Eric W. Biederman --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..7798c247f4b9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1442,7 +1442,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { if (is_child_reaper(pid)) - p->nsproxy->pid_ns->child_reaper = p; + ns_of_pid(pid)->child_reaper = p; p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); -- cgit v1.2.3-59-g8ed1b From 0a01f2cc390e10633a54f72c608cc3fe19a50c3d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2012 10:33:47 -0700 Subject: pidns: Make the pidns proc mount/umount logic obvious. Track the number of pids in the proc hash table. When the number of pids goes to 0 schedule work to unmount the kernel mount of proc. Move the mount of proc into alloc_pid when we allocate the pid for init. Remove the surprising calls of pid_ns_release proc in fork and proc_flush_task. Those code paths really shouldn't know about proc namespace implementation details and people have demonstrated several times that finding and understanding those code paths is difficult and non-obvious. Because of the call path detach pid is alwasy called with the rtnl_lock held free_pid is not allowed to sleep, so the work to unmounting proc is moved to a work queue. This has the side benefit of not blocking the entire world waiting for the unnecessary rcu_barrier in deactivate_locked_super. In the process of making the code clear and obvious this fixes a bug reported by Gao feng where we would leak a mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns succeeded and copy_net_ns failed. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. Biederman" --- fs/proc/base.c | 4 ---- fs/proc/root.c | 5 ----- include/linux/pid_namespace.h | 2 ++ kernel/fork.c | 2 -- kernel/pid.c | 21 +++++++++++++++++---- kernel/pid_namespace.c | 14 +++++++------- 6 files changed, 26 insertions(+), 22 deletions(-) (limited to 'kernel/fork.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 6177fc238fdb..7621dc51cff8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, diff --git a/fs/proc/root.c b/fs/proc/root.c index fc1609321a78..f2f251158d35 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -155,11 +155,6 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; - err = pid_ns_prepare_proc(&init_pid_ns); - if (err) { - unregister_filesystem(&proc_fs_type); - return; - } proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c89c9cfcd247..4c96acdb2489 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -21,6 +21,7 @@ struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; int last_pid; + int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -32,6 +33,7 @@ struct pid_namespace { struct bsd_acct_struct *bacct; #endif struct user_namespace *user_ns; + struct work_struct proc_work; kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ diff --git a/kernel/fork.c b/kernel/fork.c index 7798c247f4b9..666dc8b06606 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1476,8 +1476,6 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - if (unlikely(clone_flags & CLONE_NEWPID)) - pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) diff --git a/kernel/pid.c b/kernel/pid.c index 3a5f238c1ca0..e957f8b09136 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -270,8 +271,12 @@ void free_pid(struct pid *pid) unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); + for (i = 0; i <= pid->level; i++) { + struct upid *upid = pid->numbers + i; + hlist_del_rcu(&upid->pid_chain); + if (--upid->ns->nr_hashed == 0) + schedule_work(&upid->ns->proc_work); + } spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) @@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) goto out; tmp = ns; + pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); if (nr < 0) @@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = tmp->parent; } + if (unlikely(is_child_reaper(pid))) { + if (pid_ns_prepare_proc(ns)) + goto out_free; + } + get_pid_ns(ns); - pid->level = ns->level; atomic_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for ( ; upid >= pid->numbers; --upid) + for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + upid->ns->nr_hashed++; + } spin_unlock_irq(&pidmap_lock); out: @@ -570,6 +582,7 @@ void __init pidmap_init(void) /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); + init_pid_ns.nr_hashed = 1; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b2604950aa50..84591cfeefc1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -72,6 +72,12 @@ err_alloc: return NULL; } +static void proc_cleanup_work(struct work_struct *work) +{ + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); + pid_ns_release_proc(ns); +} + /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 @@ -105,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); + INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -112,15 +119,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - err = pid_ns_prepare_proc(ns); - if (err) - goto out_put_parent_pid_ns; - return ns; -out_put_parent_pid_ns: - put_pid_ns(parent_pid_ns); - put_user_ns(user_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: -- cgit v1.2.3-59-g8ed1b From 1c4042c29bd2e85aac4110552ca8ade763762e84 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 12 Jul 2010 17:10:36 -0700 Subject: pidns: Consolidate initialzation of special init task state Instead of setting child_reaper and SIGNAL_UNKILLABLE one way for the system init process, and another way for pid namespace init processes test pid->nr == 1 and use the same code for both. For the global init this results in SIGNAL_UNKILLABLE being set much earlier in the initialization process. This is a small cleanup and it paves the way for allowing unshare and enter of the pid namespace as that path like our global init also will not set CLONE_NEWPID. Signed-off-by: Eric W. Biederman --- init/main.c | 1 - kernel/fork.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/init/main.c b/init/main.c index 9cf77ab138a6..317750a18f74 100644 --- a/init/main.c +++ b/init/main.c @@ -810,7 +810,6 @@ static int __ref kernel_init(void *unused) system_state = SYSTEM_RUNNING; numa_default_policy(); - current->signal->flags |= SIGNAL_UNKILLABLE; flush_delayed_fput(); if (ramdisk_execute_command) { diff --git a/kernel/fork.c b/kernel/fork.c index 666dc8b06606..0f2bbce311fc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1039,8 +1039,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); - if (clone_flags & CLONE_NEWPID) - sig->flags |= SIGNAL_UNKILLABLE; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -1441,8 +1439,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { - if (is_child_reaper(pid)) + if (is_child_reaper(pid)) { ns_of_pid(pid)->child_reaper = p; + p->signal->flags |= SIGNAL_UNKILLABLE; + } p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); -- cgit v1.2.3-59-g8ed1b From 50804fe3737ca6a5942fdc2057a18a8141d00141 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 15:41:50 -0800 Subject: pidns: Support unsharing the pid namespace. Unsharing of the pid namespace unlike unsharing of other namespaces does not take affect immediately. Instead it affects the children created with fork and clone. The first of these children becomes the init process of the new pid namespace, the rest become oddball children of pid 0. From the point of view of the new pid namespace the process that created it is pid 0, as it's pid does not map. A couple of different semantics were considered but this one was settled on because it is easy to implement and it is usable from pam modules. The core reasons for the existence of unshare. I took a survey of the callers of pam modules and the following appears to be a representative sample of their logic. { setup stuff include pam child = fork(); if (!child) { setuid() exec /bin/bash } waitpid(child); pam and other cleanup } As you can see there is a fork to create the unprivileged user space process. Which means that the unprivileged user space process will appear as pid 1 in the new pid namespace. Further most login processes do not cope with extraneous children which means shifting the duty of reaping extraneous child process to the creator of those extraneous children makes the system more comprehensible. The practical reason for this set of pid namespace semantics is that it is simple to implement and verify they work correctly. Whereas an implementation that requres changing the struct pid on a process comes with a lot more races and pain. Not the least of which is that glibc caches getpid(). These semantics are implemented by having two notions of the pid namespace of a proces. There is task_active_pid_ns which is the pid namspace the process was created with and the pid namespace that all pids are presented to that process in. The task_active_pid_ns is stored in the struct pid of the task. Then there is the pid namespace that will be used for children that pid namespace is stored in task->nsproxy->pid_ns. Signed-off-by: Eric W. Biederman --- kernel/fork.c | 32 +++++++++++++++++++++++++------- kernel/nsproxy.c | 2 +- kernel/pid_namespace.c | 2 -- 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 0f2bbce311fc..811ffbad7889 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1565,9 +1565,11 @@ long do_fork(unsigned long clone_flags, * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ - if (clone_flags & CLONE_NEWUSER) { - if (clone_flags & CLONE_THREAD) + if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { + if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) return -EINVAL; + } + if (clone_flags & CLONE_NEWUSER) { /* hopefully this check will go away when userns support is * complete */ @@ -1692,7 +1694,8 @@ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| + CLONE_NEWPID)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1763,15 +1766,30 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) int do_sysvsem = 0; int err; - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; - + /* + * If unsharing a pid namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWPID) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a thread from a thread group, must also unshare vm. + */ + if (unshare_flags & CLONE_THREAD) + unshare_flags |= CLONE_VM; + /* + * If unsharing vm, must also unshare signal handlers. + */ + if (unshare_flags & CLONE_VM) + unshare_flags |= CLONE_SIGHAND; /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index acc92680381a..b8d4d8709d70 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -188,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET))) + CLONE_NEWNET | CLONE_NEWPID))) return 0; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f78fc48c86bc..68508d330634 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -144,8 +144,6 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & (CLONE_THREAD|CLONE_PARENT)) - return ERR_PTR(-EINVAL); if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); return create_pid_namespace(user_ns, old_ns); -- cgit v1.2.3-59-g8ed1b From 5eaf563e53294d6696e651466697eb9d491f3946 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 21 Nov 2011 17:22:31 -0800 Subject: userns: Allow unprivileged users to create user namespaces. Now that we have been through every permission check in the kernel having uid == 0 and gid == 0 in your local user namespace no longer adds any special privileges. Even having a full set of caps in your local user namespace is safe because capabilies are relative to your local user namespace, and do not confer unexpected privileges. Over the long term this should allow much more of the kernels functionality to be safely used by non-root users. Functionality like unsharing the mount namespace that is only unsafe because it can fool applications whose privileges are raised when they are executed. Since those applications have no privileges in a user namespaces it becomes safe to spoof and confuse those applications all you want. Those capabilities will still need to be enabled carefully because we may still need things like rlimits on the number of unprivileged mounts but that is to avoid DOS attacks not to avoid fooling root owned processes. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/fork.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 811ffbad7889..8c29abb19014 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1569,14 +1569,6 @@ long do_fork(unsigned long clone_flags, if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) return -EINVAL; } - if (clone_flags & CLONE_NEWUSER) { - /* hopefully this check will go away when userns support is - * complete - */ - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || - !capable(CAP_SETGID)) - return -EPERM; - } /* * Determine whether and which event to report to ptracer. When -- cgit v1.2.3-59-g8ed1b From b2e0d98705e60e45bbb3c0032c48824ad7ae0704 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 05:15:35 -0700 Subject: userns: Implement unshare of the user namespace - Add CLONE_THREAD to the unshare flags if CLONE_NEWUSER is selected As changing user namespaces is only valid if all there is only a single thread. - Restore the code to add CLONE_VM if CLONE_THREAD is selected and the code to addCLONE_SIGHAND if CLONE_VM is selected. Making the constraints in the code clear. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- include/linux/nsproxy.h | 2 +- include/linux/user_namespace.h | 9 +++++++++ kernel/fork.c | 25 ++++++++++++++++++++++--- kernel/nsproxy.c | 8 ++++---- kernel/user_namespace.c | 15 +++++++++++++++ 5 files changed, 51 insertions(+), 8 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index cc37a55ad004..10e5947491c7 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -67,7 +67,7 @@ void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, - struct fs_struct *); + struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 95142cae446a..17651f08d67f 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -39,6 +39,7 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns) } extern int create_user_ns(struct cred *new); +extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void free_user_ns(struct kref *kref); static inline void put_user_ns(struct user_namespace *ns) @@ -66,6 +67,14 @@ static inline int create_user_ns(struct cred *new) return -EINVAL; } +static inline int unshare_userns(unsigned long unshare_flags, + struct cred **new_cred) +{ + if (unshare_flags & CLONE_NEWUSER) + return -EINVAL; + return 0; +} + static inline void put_user_ns(struct user_namespace *ns) { } diff --git a/kernel/fork.c b/kernel/fork.c index 8c29abb19014..38e53b87402c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1687,7 +1687,7 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWPID)) + CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1754,10 +1754,16 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; + struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; + /* + * If unsharing a user namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWUSER) + unshare_flags |= CLONE_THREAD; /* * If unsharing a pid namespace must also unshare the thread. */ @@ -1795,11 +1801,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_cred, new_fs); + if (err) + goto bad_unshare_cleanup_cred; - if (new_fs || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1832,11 +1842,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } task_unlock(current); + + if (new_cred) { + /* Install the new user namespace */ + commit_creds(new_cred); + new_cred = NULL; + } } if (new_nsproxy) put_nsproxy(new_nsproxy); +bad_unshare_cleanup_cred: + if (new_cred) + put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 2ddd81657a2a..78e2ecb20165 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -186,7 +186,7 @@ void free_nsproxy(struct nsproxy *ns) * On success, returns the new nsproxy. */ int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct fs_struct *new_fs) + struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { struct user_namespace *user_ns; int err = 0; @@ -195,12 +195,12 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, CLONE_NEWNET | CLONE_NEWPID))) return 0; - if (!nsown_capable(CAP_SYS_ADMIN)) + user_ns = new_cred ? new_cred->user_ns : current_user_ns(); + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - user_ns = current_user_ns(); *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, - new_fs ? new_fs : current->fs); + new_fs ? new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); goto out; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a9460774e77d..ce92f7e6290a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -82,6 +82,21 @@ int create_user_ns(struct cred *new) return 0; } +int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) +{ + struct cred *cred; + + if (!(unshare_flags & CLONE_NEWUSER)) + return 0; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + *new_cred = cred; + return create_user_ns(cred); +} + void free_user_ns(struct kref *kref) { struct user_namespace *parent, *ns = -- cgit v1.2.3-59-g8ed1b