From 1018016c706f7ff9f56fde3a649789c47085a293 Mon Sep 17 00:00:00 2001
From: Jason Low
Date: Tue, 28 Apr 2015 13:00:22 -0700
Subject: sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability

While running a database workload, we found a scalability issue with
itimers. Much of the problem was caused by the thread_group_cputimer
spinlock. Each time we account for group system/user time, we need to
obtain a thread_group_cputimer's spinlock to update the timers. On larger
systems (such as a 16-socket machine), contention on this kernel lock
while updating the group timer stats accounted for more than 30% of
total time.

This patch converts the timers to 64-bit atomic variables and uses
atomic adds to update them without a lock. With this patch, the percent
of total time spent updating thread group cputimer timers was reduced
from 30% down to less than 1%.

Note: On 32-bit systems using the generic 64-bit atomics, this causes
sample_group_cputimer() to take locks 3 times instead of just 1 time.
However, we tested this patch on a 32-bit ARM system using the generic
atomics and did not find the overhead to be much of an issue. An
explanation for why this isn't an issue is that 32-bit systems usually
have small numbers of CPUs, and cacheline contention from extra
spinlocks taken periodically is not really apparent on smaller systems.

Signed-off-by: Jason Low
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Thomas Gleixner
Acked-by: Rik van Riel
Cc: Andrew Morton
Cc: Aswin Chandramouleeswaran
Cc: Borislav Petkov
Cc: Davidlohr Bueso
Cc: Frederic Weisbecker
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Mel Gorman
Cc: Mike Galbraith
Cc: Oleg Nesterov
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Preeti U Murthy
Cc: Scott J Norton
Cc: Steven Rostedt
Cc: Waiman Long
Link: http://lkml.kernel.org/r/1430251224-5764-4-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar
---
 include/linux/init_task.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux/init_task.h')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 696d22312b31..7b9d8b59e7bf 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
         .cpu_timers     = INIT_CPU_TIMERS(sig.cpu_timers),             \
         .rlim           = INIT_RLIMITS,                                 \
         .cputimer       = {                                             \
-                .cputime = INIT_CPUTIME,                                \
-                .running = 0,                                           \
-                .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),    \
+                .utime = ATOMIC64_INIT(0),                              \
+                .stime = ATOMIC64_INIT(0),                              \
+                .sum_exec_runtime = ATOMIC64_INIT(0),                   \
+                .running = 0                                            \
         },                                                              \
         .cred_guard_mutex =                                             \
                  __MUTEX_INITIALIZER(sig.cred_guard_mutex),             \
--
cgit v1.2.3-59-g8ed1b
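As a rough illustration of the accounting pattern this patch switches to, here is a
minimal userspace sketch (not kernel code): group time counters are bumped with
lockless 64-bit atomic adds and sampled with plain atomic reads, standing in for
atomic64_add()/atomic64_read() on thread_group_cputimer. It uses C11 atomics and
pthreads, and every name in it is illustrative rather than a kernel API.

/*
 * Userspace sketch of the lockless group-time accounting pattern:
 * several threads account time into shared counters with atomic adds
 * instead of taking a spinlock, and a sampler reads them atomically.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct group_cputime {                  /* stand-in for thread_group_cputimer */
        atomic_uint_fast64_t utime;
        atomic_uint_fast64_t stime;
        atomic_uint_fast64_t sum_exec_runtime;
};

static struct group_cputime grp;        /* static storage: counters start at 0 */

/* Analogue of account_group_user_time(): no lock, just an atomic add. */
static void account_user_time(uint64_t delta)
{
        atomic_fetch_add_explicit(&grp.utime, delta, memory_order_relaxed);
}

static void *worker(void *arg)
{
        for (int i = 0; i < 1000000; i++)
                account_user_time(1);   /* pretend we accounted one tick */
        return NULL;
}

int main(void)
{
        pthread_t tid[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&tid[i], NULL, worker, NULL);
        for (int i = 0; i < 4; i++)
                pthread_join(tid[i], NULL);

        /* Analogue of sampling the group timer: plain atomic reads. */
        printf("utime = %llu\n", (unsigned long long)
               atomic_load_explicit(&grp.utime, memory_order_relaxed));
        return 0;
}

On targets without native 64-bit atomics the adds fall back to lock-based
implementations, which is the 32-bit caveat the changelog calls out above.
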
From 7110744516276e906f9197e2857d026eb2343393 Mon Sep 17 00:00:00 2001
From: Jason Low
Date: Tue, 28 Apr 2015 13:00:24 -0700
Subject: sched, timer: Use the atomic task_cputime in thread_group_cputimer

Recent optimizations were made to thread_group_cputimer to improve its
scalability by keeping track of cputime stats without a lock. However,
the values were open-coded in the structure, leaving them at a different
abstraction level from the regular task_cputime structure. Furthermore,
any subsequent similar optimizations would not be able to share the new
code, since they are specific to thread_group_cputimer.

This patch adds the new task_cputime_atomic data structure (introduced
in the previous patch in the series) to thread_group_cputimer for
keeping track of the cputime atomically, which also helps generalize
the code.

Suggested-by: Ingo Molnar
Signed-off-by: Jason Low
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Thomas Gleixner
Acked-by: Rik van Riel
Cc: Andrew Morton
Cc: Aswin Chandramouleeswaran
Cc: Borislav Petkov
Cc: Davidlohr Bueso
Cc: Frederic Weisbecker
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Mel Gorman
Cc: Mike Galbraith
Cc: Oleg Nesterov
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Preeti U Murthy
Cc: Scott J Norton
Cc: Steven Rostedt
Cc: Waiman Long
Link: http://lkml.kernel.org/r/1430251224-5764-6-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar
---
 include/linux/init_task.h      |  6 ++----
 include/linux/sched.h          |  4 +---
 kernel/sched/stats.h           |  6 +++---
 kernel/time/posix-cpu-timers.c | 26 +++++++++++++-------------
 4 files changed, 19 insertions(+), 23 deletions(-)

(limited to 'include/linux/init_task.h')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 7b9d8b59e7bf..bb9b075f0eb0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,10 +50,8 @@ extern struct fs_struct init_fs;
         .cpu_timers     = INIT_CPU_TIMERS(sig.cpu_timers),             \
         .rlim           = INIT_RLIMITS,                                 \
         .cputimer       = {                                             \
-                .utime = ATOMIC64_INIT(0),                              \
-                .stime = ATOMIC64_INIT(0),                              \
-                .sum_exec_runtime = ATOMIC64_INIT(0),                   \
-                .running = 0                                            \
+                .cputime_atomic = INIT_CPUTIME_ATOMIC,                  \
+                .running = 0,                                           \
         },                                                              \
         .cred_guard_mutex =                                             \
                  __MUTEX_INITIALIZER(sig.cred_guard_mutex),             \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6eb78cd45da7..4adc536a3b03 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -615,9 +615,7 @@ struct task_cputime_atomic {
  * used for thread group CPU timer calculations.
  */
 struct thread_group_cputimer {
-        atomic64_t utime;
-        atomic64_t stime;
-        atomic64_t sum_exec_runtime;
+        struct task_cputime_atomic cputime_atomic;
         int running;
 };

diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c6d1c7da3ea5..077ebbd5e10f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -216,7 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
         if (!cputimer_running(tsk))
                 return;

-        atomic64_add(cputime, &cputimer->utime);
+        atomic64_add(cputime, &cputimer->cputime_atomic.utime);
 }

 /**
@@ -237,7 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
         if (!cputimer_running(tsk))
                 return;

-        atomic64_add(cputime, &cputimer->stime);
+        atomic64_add(cputime, &cputimer->cputime_atomic.stime);
 }

 /**
@@ -258,5 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
         if (!cputimer_running(tsk))
                 return;

-        atomic64_add(ns, &cputimer->sum_exec_runtime);
+        atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
 }
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index d85730669410..892e3dae0aac 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -211,20 +211,20 @@ retry:
         }
 }

-static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
 {
-        __update_gt_cputime(&cputimer->utime, sum->utime);
-        __update_gt_cputime(&cputimer->stime, sum->stime);
-        __update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
+        __update_gt_cputime(&cputime_atomic->utime, sum->utime);
+        __update_gt_cputime(&cputime_atomic->stime, sum->stime);
+        __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
 }

-/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
-static inline void sample_group_cputimer(struct task_cputime *times,
-                                         struct thread_group_cputimer *cputimer)
+/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+                                         struct task_cputime_atomic *atomic_times)
 {
-        times->utime = atomic64_read(&cputimer->utime);
-        times->stime = atomic64_read(&cputimer->stime);
-        times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
+        times->utime = atomic64_read(&atomic_times->utime);
+        times->stime = atomic64_read(&atomic_times->stime);
+        times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
 }

 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
@@ -240,7 +240,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
                  * to synchronize the timer to the clock every time we start it.
                  */
                 thread_group_cputime(tsk, &sum);
-                update_gt_cputime(cputimer, &sum);
+                update_gt_cputime(&cputimer->cputime_atomic, &sum);

                 /*
                  * We're setting cputimer->running without a lock. Ensure
@@ -251,7 +251,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
                  */
                 WRITE_ONCE(cputimer->running, 1);
         }
-        sample_group_cputimer(times, cputimer);
+        sample_cputime_atomic(times, &cputimer->cputime_atomic);
 }

 /*
@@ -1137,7 +1137,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
         if (READ_ONCE(sig->cputimer.running)) {
                 struct task_cputime group_sample;

-                sample_group_cputimer(&group_sample, &sig->cputimer);
+                sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);

                 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
                         return 1;
--
cgit v1.2.3-59-g8ed1b
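Continuing the userspace sketch from above, the refactoring in this patch amounts
to pulling the three atomic counters into a reusable struct so that sampling and
accounting helpers can operate on that type alone rather than on
thread_group_cputimer directly. The snippet below mirrors that shape with C11
atomics; the struct and function names echo the kernel's but are illustrative only.

/*
 * Sketch of the abstraction this patch introduces: the atomic counters
 * live in their own struct, and helpers take that struct rather than
 * the whole thread_group_cputimer.  Illustrative userspace C11 code.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct task_cputime_snapshot {           /* analogue of struct task_cputime */
        uint64_t utime;
        uint64_t stime;
        uint64_t sum_exec_runtime;
};

struct task_cputime_atomic_sketch {      /* analogue of task_cputime_atomic */
        atomic_uint_fast64_t utime;
        atomic_uint_fast64_t stime;
        atomic_uint_fast64_t sum_exec_runtime;
};

struct thread_group_cputimer_sketch {    /* now embeds the atomic struct */
        struct task_cputime_atomic_sketch cputime_atomic;
        int running;
};

/* Analogue of sample_cputime_atomic(): works for any holder of the struct. */
static void sample_cputime_atomic_sketch(struct task_cputime_snapshot *times,
                                         struct task_cputime_atomic_sketch *at)
{
        times->utime = atomic_load(&at->utime);
        times->stime = atomic_load(&at->stime);
        times->sum_exec_runtime = atomic_load(&at->sum_exec_runtime);
}

int main(void)
{
        static struct thread_group_cputimer_sketch cputimer;
        struct task_cputime_snapshot snap;

        atomic_fetch_add(&cputimer.cputime_atomic.utime, 42);
        sample_cputime_atomic_sketch(&snap, &cputimer.cputime_atomic);
        printf("utime = %llu\n", (unsigned long long)snap.utime);
        return 0;
}

Because the helpers now take only the atomic struct, the same sampling code can
serve both thread_group_cputimer() and the fast-path timer check, which is the
code sharing the changelog is after.
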
From d59cfc09c32a2ae31f1c3bc2983a0cd79afb3f14 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 13 May 2015 16:35:17 -0400
Subject: sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem

The cgroup side of threadgroup locking uses signal_struct->group_rwsem
to synchronize against threadgroup changes. This per-process rwsem adds
small overhead to thread creation, exit and exec paths, forces cgroup
code paths to do a lock-verify-unlock-retry dance in a couple of places,
and makes it impossible to atomically perform operations across multiple
processes.

This patch replaces signal_struct->group_rwsem with a global
percpu_rwsem, cgroup_threadgroup_rwsem, which is cheaper on the reader
side and contained in cgroups proper. The conversion is one-to-one.

This does make the writer side heavier and lowers the granularity;
however, cgroup process migration is a fairly cold path, we do want to
optimize thread operations over it, and cgroup migration operations
don't take enough time for the lower granularity to matter.
Signed-off-by: Tejun Heo
Cc: Ingo Molnar
Cc: Peter Zijlstra
---
 include/linux/cgroup-defs.h | 27 ++++++++++++++--
 include/linux/init_task.h   |  8 -----
 include/linux/sched.h       | 12 -------
 init/Kconfig                |  1 +
 kernel/cgroup.c             | 77 ++++++++++++---------------------------------
 kernel/fork.c               |  4 ---
 6 files changed, 46 insertions(+), 83 deletions(-)

(limited to 'include/linux/init_task.h')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b8c93806dbd..7d83d7f73420 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -461,8 +461,31 @@ struct cgroup_subsys {
         unsigned int depends_on;
 };

-void cgroup_threadgroup_change_begin(struct task_struct *tsk);
-void cgroup_threadgroup_change_end(struct task_struct *tsk);
+extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
+/**
+ * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_begin() and allows cgroup operations to
+ * synchronize against threadgroup changes using a percpu_rw_semaphore.
+ */
+static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+        percpu_down_read(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_end(). Counterpart of
+ * cgroup_threadgroup_change_begin().
+ */
+static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
+{
+        percpu_up_read(&cgroup_threadgroup_rwsem);
+}

 #else /* CONFIG_CGROUPS */

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 696d22312b31..0cc0bbf20022 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,13 +25,6 @@ extern struct files_struct init_files;
 extern struct fs_struct init_fs;

-#ifdef CONFIG_CGROUPS
-#define INIT_GROUP_RWSEM(sig)                                           \
-        .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
-#else
-#define INIT_GROUP_RWSEM(sig)
-#endif
-
 #ifdef CONFIG_CPUSETS
 #define INIT_CPUSET_SEQ(tsk)                                            \
         .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
@@ -56,7 +49,6 @@ extern struct fs_struct init_fs;
         },                                                              \
         .cred_guard_mutex =                                             \
                  __MUTEX_INITIALIZER(sig.cred_guard_mutex),             \
-        INIT_GROUP_RWSEM(sig)                                           \
 }

 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5ee290003470..add524a910bd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -743,18 +743,6 @@ struct signal_struct {
         unsigned audit_tty_log_passwd;
         struct tty_audit_buf *tty_audit_buf;
 #endif
-#ifdef CONFIG_CGROUPS
-        /*
-         * group_rwsem prevents new tasks from entering the threadgroup and
-         * member tasks from exiting, more specifically, setting of
-         * PF_EXITING. fork and exit paths are protected with this rwsem
-         * using threadgroup_change_begin/end(). Users which require
-         * threadgroup to remain stable should use threadgroup_[un]lock()
-         * which also takes care of exec path. Currently, cgroup is the
-         * only user.
-         */
-        struct rw_semaphore group_rwsem;
-#endif

         oom_flags_t oom_flags;
         short oom_score_adj;            /* OOM kill score adjustment */
diff --git a/init/Kconfig b/init/Kconfig
index dc24dec60232..b9b824bf8f6b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -938,6 +938,7 @@ config NUMA_BALANCING_DEFAULT_ENABLED
 menuconfig CGROUPS
         bool "Control Group support"
         select KERNFS
+        select PERCPU_RWSEM
         help
           This option adds support for grouping sets of processes together, for
           use with process control subsystems such as Cpusets, CFS, memory
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 980b1f52f39f..77578a169b8c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
 #include
 #include
 #include
+#include <linux/percpu-rwsem.h>
 #include
 #include
 #include
@@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);

+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
 #define cgroup_assert_mutex_or_rcu_locked()                             \
         rcu_lockdep_assert(rcu_read_lock_held() ||                      \
                            lockdep_is_held(&cgroup_mutex),              \
@@ -848,48 +851,6 @@ static struct css_set *find_css_set(struct css_set *old_cset,
         return cset;
 }

-void cgroup_threadgroup_change_begin(struct task_struct *tsk)
-{
-        down_read(&tsk->signal->group_rwsem);
-}
-
-void cgroup_threadgroup_change_end(struct task_struct *tsk)
-{
-        up_read(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_lock - lock threadgroup
- * @tsk: member task of the threadgroup to lock
- *
- * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
- * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
- * change ->group_leader/pid. This is useful for cases where the threadgroup
- * needs to stay stable across blockable operations.
- *
- * fork and exit explicitly call threadgroup_change_{begin|end}() for
- * synchronization. While held, no new task will be added to threadgroup
- * and no existing live task will have its PF_EXITING set.
- *
- * de_thread() does threadgroup_change_{begin|end}() when a non-leader
- * sub-thread becomes a new leader.
- */
-static void threadgroup_lock(struct task_struct *tsk)
-{
-        down_write(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_unlock - unlock threadgroup
- * @tsk: member task of the threadgroup to unlock
- *
- * Reverse threadgroup_lock().
- */
-static inline void threadgroup_unlock(struct task_struct *tsk)
-{
-        up_write(&tsk->signal->group_rwsem);
-}
-
 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
         struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -2095,9 +2056,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
         lockdep_assert_held(&css_set_rwsem);

         /*
-         * We are synchronized through threadgroup_lock() against PF_EXITING
-         * setting such that we can't race against cgroup_exit() changing the
-         * css_set to init_css_set and dropping the old one.
+         * We are synchronized through cgroup_threadgroup_rwsem against
+         * PF_EXITING setting such that we can't race against cgroup_exit()
+         * changing the css_set to init_css_set and dropping the old one.
          */
         WARN_ON_ONCE(tsk->flags & PF_EXITING);
         old_cset = task_css_set(tsk);
@@ -2154,10 +2115,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
- * This function may be called without holding threadgroup_lock even if the
- * target is a process. Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process. Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
                                    struct cgroup *dst_cgrp,
@@ -2260,7 +2222,7 @@ err:
  * @threadgroup: whether @leader points to the whole process or a single task
  *
  * Migrate a process or task denoted by @leader to @cgrp. If migrating a
- * process, the caller must be holding threadgroup_lock of @leader. The
+ * process, the caller must be holding cgroup_threadgroup_rwsem. The
  * caller is also responsible for invoking cgroup_migrate_add_src() and
  * cgroup_migrate_prepare_dst() on the targets before invoking this
  * function and following up with cgroup_migrate_finish().
@@ -2388,7 +2350,7 @@ out_release_tset:
  * @leader: the task or the leader of the threadgroup to be attached
  * @threadgroup: attach the whole threadgroup?
  *
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
  */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
                               struct task_struct *leader, bool threadgroup)
@@ -2481,7 +2443,7 @@ retry_find_task:
         get_task_struct(tsk);
         rcu_read_unlock();

-        threadgroup_lock(tsk);
+        percpu_down_write(&cgroup_threadgroup_rwsem);
         if (threadgroup) {
                 if (!thread_group_leader(tsk)) {
                         /*
@@ -2491,7 +2453,7 @@ retry_find_task:
                          * try again; this is
                          * "double-double-toil-and-trouble-check locking".
                          */
-                        threadgroup_unlock(tsk);
+                        percpu_up_write(&cgroup_threadgroup_rwsem);
                         put_task_struct(tsk);
                         goto retry_find_task;
                 }
@@ -2499,7 +2461,7 @@ retry_find_task:

         ret = cgroup_attach_task(cgrp, tsk, threadgroup);

-        threadgroup_unlock(tsk);
+        percpu_up_write(&cgroup_threadgroup_rwsem);

         put_task_struct(tsk);
 out_unlock_cgroup:
@@ -2704,17 +2666,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
                         goto out_finish;
                 last_task = task;

-                threadgroup_lock(task);
+                percpu_down_write(&cgroup_threadgroup_rwsem);
                 /* raced against de_thread() from another thread? */
                 if (!thread_group_leader(task)) {
-                        threadgroup_unlock(task);
+                        percpu_up_write(&cgroup_threadgroup_rwsem);
                         put_task_struct(task);
                         continue;
                 }

                 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);

-                threadgroup_unlock(task);
+                percpu_up_write(&cgroup_threadgroup_rwsem);
                 put_task_struct(task);

                 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -5032,6 +4994,7 @@ int __init cgroup_init(void)
         unsigned long key;
         int ssid, err;

+        BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));

diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..9531275e12a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1144,10 +1144,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
         tty_audit_fork(sig);
         sched_autogroup_fork(sig);

-#ifdef CONFIG_CGROUPS
-        init_rwsem(&sig->group_rwsem);
-#endif
-
         sig->oom_score_adj = current->signal->oom_score_adj;
         sig->oom_score_adj_min = current->signal->oom_score_adj_min;
--
cgit v1.2.3-59-g8ed1b
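To make the locking model of this last patch concrete, here is a small userspace
sketch of the same reader/writer split: the hot per-thread paths (fork/exit in the
kernel) take the lock shared, while the rare migration path takes it exclusive and
therefore observes a stable threadgroup. A pthread rwlock stands in for the
kernel's percpu_rw_semaphore (it is not per-CPU and has different fairness and
overhead characteristics), and every name below is illustrative, not a kernel API.

/*
 * Userspace sketch of the threadgroup locking pattern: frequent,
 * short reader sections for thread create/exit bookkeeping, and an
 * exclusive writer section for process migration.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t threadgroup_rwsem = PTHREAD_RWLOCK_INITIALIZER;

static void threadgroup_change_begin(void)      /* reader: thread create/exit */
{
        pthread_rwlock_rdlock(&threadgroup_rwsem);
}

static void threadgroup_change_end(void)
{
        pthread_rwlock_unlock(&threadgroup_rwsem);
}

static void migrate_process(void)               /* writer: cgroup migration */
{
        pthread_rwlock_wrlock(&threadgroup_rwsem);
        /* threadgroup membership cannot change while we hold this */
        printf("migrating with a stable threadgroup\n");
        pthread_rwlock_unlock(&threadgroup_rwsem);
}

int main(void)
{
        threadgroup_change_begin();
        /* ... thread creation / exit bookkeeping would run here ... */
        threadgroup_change_end();

        migrate_process();
        return 0;
}

As in the kernel change, the point of the design is to keep the per-thread
(reader) side cheap and push the cost onto the rare migration (writer) side.
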