diff options
Diffstat (limited to 'kernel/pid_namespace.c')
-rw-r--r-- | kernel/pid_namespace.c | 65 |
1 files changed, 30 insertions, 35 deletions
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index d40017e79ebe..f4f8cb0435b4 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -51,18 +51,13 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) - *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0); + *pkc = kmem_cache_create(name, len, 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); } -static void proc_cleanup_work(struct work_struct *work) -{ - struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); - pid_ns_release_proc(ns); -} - static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); @@ -108,13 +103,12 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns goto out_free_idr; ns->ns.ops = &pidns_operations; - kref_init(&ns->kref); + refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; - INIT_WORK(&ns->proc_work, proc_cleanup_work); return ns; @@ -155,22 +149,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, return create_pid_namespace(user_ns, old_ns); } -static void free_pid_ns(struct kref *kref) -{ - struct pid_namespace *ns; - - ns = container_of(kref, struct pid_namespace, kref); - destroy_pid_namespace(ns); -} - void put_pid_ns(struct pid_namespace *ns) { struct pid_namespace *parent; while (ns != &init_pid_ns) { parent = ns->parent; - if (!kref_put(&ns->kref, free_pid_ns)) + if (!refcount_dec_and_test(&ns->ns.count)) break; + destroy_pid_namespace(ns); ns = parent; } } @@ -231,20 +218,27 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } while (rc != -ECHILD); /* - * kernel_wait4() above can't reap the EXIT_DEAD children but we do not - * really care, we could reparent them to the global init. We could - * exit and reap ->child_reaper even if it is not the last thread in - * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), - * pid_ns can not go away until proc_kill_sb() drops the reference. + * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE + * process whose parents processes are outside of the pid + * namespace. Such processes are created with setns()+fork(). + * + * If those EXIT_ZOMBIE processes are not reaped by their + * parents before their parents exit, they will be reparented + * to pid_ns->child_reaper. Thus pidns->child_reaper needs to + * stay valid until they all go away. + * + * The code relies on the pid_ns->child_reaper ignoring + * SIGCHILD to cause those EXIT_ZOMBIE processes to be + * autoreaped if reparented. * - * But this ns can also have other tasks injected by setns()+fork(). - * Again, ignoring the user visible semantics we do not really need - * to wait until they are all reaped, but they can be reparented to - * us and thus we need to ensure that pid->child_reaper stays valid - * until they all go away. See free_pid()->wake_up_process(). + * Semantically it is also desirable to wait for EXIT_ZOMBIE + * processes before allowing the child_reaper to be reaped, as + * that gives the invariant that when the init process of a + * pid namespace is reaped all of the processes in the pid + * namespace are gone. * - * We rely on ignored SIGCHLD, an injected zombie must be autoreaped - * if reparented. + * Once all of the other tasks are gone from the pid_namespace + * free_pid() will awaken this task. */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -263,13 +257,13 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; int ret, next; - if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) return -EPERM; /* @@ -378,13 +372,14 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } -static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int pidns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -455,7 +450,7 @@ const struct proc_ns_operations pidns_for_children_operations = { static __init int pid_namespaces_init(void) { - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); |