aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/pid_namespace.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/pid_namespace.c')
-rw-r--r--kernel/pid_namespace.c65
1 files changed, 30 insertions, 35 deletions
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d40017e79ebe..f4f8cb0435b4 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -51,18 +51,13 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
mutex_lock(&pid_caches_mutex);
/* Name collision forces to do allocation under mutex. */
if (!*pkc)
- *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0);
+ *pkc = kmem_cache_create(name, len, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
mutex_unlock(&pid_caches_mutex);
/* current can fail, but someone else can succeed. */
return READ_ONCE(*pkc);
}
-static void proc_cleanup_work(struct work_struct *work)
-{
- struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
- pid_ns_release_proc(ns);
-}
-
static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
{
return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
@@ -108,13 +103,12 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
goto out_free_idr;
ns->ns.ops = &pidns_operations;
- kref_init(&ns->kref);
+ refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
- INIT_WORK(&ns->proc_work, proc_cleanup_work);
return ns;
@@ -155,22 +149,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
return create_pid_namespace(user_ns, old_ns);
}
-static void free_pid_ns(struct kref *kref)
-{
- struct pid_namespace *ns;
-
- ns = container_of(kref, struct pid_namespace, kref);
- destroy_pid_namespace(ns);
-}
-
void put_pid_ns(struct pid_namespace *ns)
{
struct pid_namespace *parent;
while (ns != &init_pid_ns) {
parent = ns->parent;
- if (!kref_put(&ns->kref, free_pid_ns))
+ if (!refcount_dec_and_test(&ns->ns.count))
break;
+ destroy_pid_namespace(ns);
ns = parent;
}
}
@@ -231,20 +218,27 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
} while (rc != -ECHILD);
/*
- * kernel_wait4() above can't reap the EXIT_DEAD children but we do not
- * really care, we could reparent them to the global init. We could
- * exit and reap ->child_reaper even if it is not the last thread in
- * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
- * pid_ns can not go away until proc_kill_sb() drops the reference.
+ * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE
+ * process whose parents processes are outside of the pid
+ * namespace. Such processes are created with setns()+fork().
+ *
+ * If those EXIT_ZOMBIE processes are not reaped by their
+ * parents before their parents exit, they will be reparented
+ * to pid_ns->child_reaper. Thus pidns->child_reaper needs to
+ * stay valid until they all go away.
+ *
+ * The code relies on the pid_ns->child_reaper ignoring
+ * SIGCHILD to cause those EXIT_ZOMBIE processes to be
+ * autoreaped if reparented.
*
- * But this ns can also have other tasks injected by setns()+fork().
- * Again, ignoring the user visible semantics we do not really need
- * to wait until they are all reaped, but they can be reparented to
- * us and thus we need to ensure that pid->child_reaper stays valid
- * until they all go away. See free_pid()->wake_up_process().
+ * Semantically it is also desirable to wait for EXIT_ZOMBIE
+ * processes before allowing the child_reaper to be reaped, as
+ * that gives the invariant that when the init process of a
+ * pid namespace is reaped all of the processes in the pid
+ * namespace are gone.
*
- * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
- * if reparented.
+ * Once all of the other tasks are gone from the pid_namespace
+ * free_pid() will awaken this task.
*/
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -263,13 +257,13 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
#ifdef CONFIG_CHECKPOINT_RESTORE
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
int ret, next;
- if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+ if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
return -EPERM;
/*
@@ -378,13 +372,14 @@ static void pidns_put(struct ns_common *ns)
put_pid_ns(to_pid_ns(ns));
}
-static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int pidns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct pid_namespace *active = task_active_pid_ns(current);
struct pid_namespace *ancestor, *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
@@ -455,7 +450,7 @@ const struct proc_ns_operations pidns_for_children_operations = {
static __init int pid_namespaces_init(void)
{
- pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+ pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_paths(kern_path, pid_ns_ctl_table);