aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/fork.c
diff options
context:
space:
mode:
authorChristian Brauner <christian.brauner@ubuntu.com>2020-02-05 14:26:22 +0100
committerTejun Heo <tj@kernel.org>2020-02-12 17:57:51 -0500
commitef2c41cf38a7559bbf91af42d5b6a4429db8fc68 (patch)
treead5801b2782c5268dc0ad1b6f0f591b1da582eee /kernel/fork.c
parentcgroup: add cgroup_may_write() helper (diff)
downloadlinux-dev-ef2c41cf38a7559bbf91af42d5b6a4429db8fc68.tar.xz
linux-dev-ef2c41cf38a7559bbf91af42d5b6a4429db8fc68.zip
clone3: allow spawning processes into cgroups
This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_GROUP does not need to grab the write side of the cgroup cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Li Zefan <lizefan@huawei.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com> Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel/fork.c')
-rw-r--r--kernel/fork.c13
1 files changed, 9 insertions, 4 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 9245b6e53f55..635d6369dfb9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2180,7 +2180,7 @@ static __latent_entropy struct task_struct *copy_process(
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
- retval = cgroup_can_fork(p);
+ retval = cgroup_can_fork(p, args);
if (retval)
goto bad_fork_put_pidfd;
@@ -2287,7 +2287,7 @@ static __latent_entropy struct task_struct *copy_process(
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
+ cgroup_post_fork(p, args);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
@@ -2298,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process(
bad_fork_cancel_cgroup:
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
- cgroup_cancel_fork(p);
+ cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
fput(pidfile);
@@ -2627,6 +2627,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
!valid_signal(args.exit_signal)))
return -EINVAL;
+ if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0)
+ return -EINVAL;
+
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
@@ -2637,6 +2640,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.stack_size = args.stack_size,
.tls = args.tls,
.set_tid_size = args.set_tid_size,
+ .cgroup = args.cgroup,
};
if (args.set_tid &&
@@ -2680,7 +2684,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
- if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
+ if (kargs->flags &
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
return false;
/*