From 459b09b5a3254008b63382bf41a9b36d0b590f57 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 18 May 2021 14:07:25 +0100 Subject: sched/debug: Don't update sched_domain debug directories before sched_debug_init() Since CPU capacity asymmetry can stem purely from maximum frequency differences (e.g. Pixel 1), a rebuild of the scheduler topology can be issued upon loading cpufreq, see: arch_topology.c::init_cpu_capacity_callback() Turns out that if this rebuild happens *before* sched_debug_init() is run (which is a late initcall), we end up messing up the sched_domain debug directory: passing a NULL parent to debugfs_create_dir() ends up creating the directory at the debugfs root, which in this case creates /sys/kernel/debug/domains (instead of /sys/kernel/debug/sched/domains). This currently doesn't happen on asymmetric systems which use cpufreq-scpi or cpufreq-dt drivers, as those are loaded via deferred_probe_initcall() (it is also a late initcall, but appears to be ordered *after* sched_debug_init()). Ionela has been working on detecting maximum frequency asymmetry via ACPI, and that actually happens via a *device* initcall, thus before sched_debug_init(), and causes the aforementionned debugfs mayhem. One option would be to punt sched_debug_init() down to fs_initcall_sync(). Preventing update_sched_domain_debugfs() from running before sched_debug_init() appears to be the safer option. Fixes: 3b87f136f8fc ("sched,debug: Convert sysctl sched_domains to debugfs") Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: http://lore.kernel.org/r/20210514095339.12979-1-ionela.voinescu@arm.com --- kernel/sched/debug.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0c5ec2776ddf..7e08e3d947c2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -388,6 +388,13 @@ void update_sched_domain_debugfs(void) { int cpu, i; + /* + * This can unfortunately be invoked before sched_debug_init() creates + * the debug directory. Don't touch sd_sysctl_cpus until then. + */ + if (!debugfs_sched) + return; + if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; -- cgit v1.2.3-59-g8ed1b From 77eccd0dfae353a64a2088d308bed3b373a4220f Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 1 Jun 2021 17:11:20 +0200 Subject: wait: use LIST_HEAD_INIT() to initialize wait_queue_head Replace the open-coded initialization with the right macro. Signed-off-by: Julian Wiedmann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210601151120.329223-1-jwi@linux.ibm.com --- include/linux/wait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index fe10e8570a52..99c5f05718cd 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -56,7 +56,7 @@ struct task_struct; #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ - .head = { &(name).head, &(name).head } } + .head = LIST_HEAD_INIT(name.head) } #define DECLARE_WAIT_QUEUE_HEAD(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) -- cgit v1.2.3-59-g8ed1b From 18765447c3b7867b3f8cccde52dc9d822852e71b Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Sun, 6 Jun 2021 19:54:51 +0800 Subject: sched/sysctl: Move extern sysctl declarations to sched.h Since commit '8a99b6833c88(sched: Move SCHED_DEBUG sysctl to debugfs)', SCHED_DEBUG sysctls are moved to debugfs, so these extern sysctls in include/linux/sched/sysctl.h are no longer needed for sysctl.c, even some are no longer needed. So move those extern sysctls that needed by kernel/sched/debug.c to kernel/sched/sched.h, and remove others that are no longer needed. Signed-off-by: Hailong Liu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210606115451.26745-1-liuhailongg6@163.com --- include/linux/sched/sysctl.h | 18 ------------------ kernel/sched/sched.h | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index db2c0f34aaaf..304f431178fd 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -28,30 +28,12 @@ enum { sysctl_hung_task_timeout_secs = 0 }; extern unsigned int sysctl_sched_child_runs_first; -extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; - enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, SCHED_TUNABLESCALING_LINEAR, SCHED_TUNABLESCALING_END, }; -extern unsigned int sysctl_sched_tunable_scaling; - -extern unsigned int sysctl_numa_balancing_scan_delay; -extern unsigned int sysctl_numa_balancing_scan_period_min; -extern unsigned int sysctl_numa_balancing_scan_period_max; -extern unsigned int sysctl_numa_balancing_scan_size; - -#ifdef CONFIG_SCHED_DEBUG -extern __read_mostly unsigned int sysctl_sched_migration_cost; -extern __read_mostly unsigned int sysctl_sched_nr_migrate; - -extern int sysctl_resched_latency_warn_ms; -extern int sysctl_resched_latency_warn_once; -#endif /* * control realtime throttling: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c80d42e9589b..9a1c6aeb9165 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2385,6 +2385,21 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_DEBUG +extern unsigned int sysctl_sched_latency; +extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_wakeup_granularity; +extern int sysctl_resched_latency_warn_ms; +extern int sysctl_resched_latency_warn_once; + +extern unsigned int sysctl_sched_tunable_scaling; + +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_size; +#endif + #ifdef CONFIG_SCHED_HRTICK /* -- cgit v1.2.3-59-g8ed1b From 031e3bd8986fffe31e1ddbf5264cccfe30c9abd7 Mon Sep 17 00:00:00 2001 From: Yuan ZhaoXiong Date: Sun, 6 Jun 2021 21:11:55 +0800 Subject: sched: Optimize housekeeping_cpumask() in for_each_cpu_and() On a 128 cores AMD machine, there are 8 cores in nohz_full mode, and the others are used for housekeeping. When many housekeeping cpus are in idle state, we can observe huge time burn in the loop for searching nearest busy housekeeper cpu by ftrace. 9) | get_nohz_timer_target() { 9) | housekeeping_test_cpu() { 9) 0.390 us | housekeeping_get_mask.part.1(); 9) 0.561 us | } 9) 0.090 us | __rcu_read_lock(); 9) 0.090 us | housekeeping_cpumask(); 9) 0.521 us | housekeeping_cpumask(); 9) 0.140 us | housekeeping_cpumask(); ... 9) 0.500 us | housekeeping_cpumask(); 9) | housekeeping_any_cpu() { 9) 0.090 us | housekeeping_get_mask.part.1(); 9) 0.100 us | sched_numa_find_closest(); 9) 0.491 us | } 9) 0.100 us | __rcu_read_unlock(); 9) + 76.163 us | } for_each_cpu_and() is a micro function, so in get_nohz_timer_target() function the for_each_cpu_and(i, sched_domain_span(sd), housekeeping_cpumask(HK_FLAG_TIMER)) equals to below: for (i = -1; i = cpumask_next_and(i, sched_domain_span(sd), housekeeping_cpumask(HK_FLAG_TIMER)), i < nr_cpu_ids;) That will cause that housekeeping_cpumask() will be invoked many times. The housekeeping_cpumask() function returns a const value, so it is unnecessary to invoke it every time. This patch can minimize the worst searching time from ~76us to ~16us in my testing. Similarly, the find_new_ilb() function has the same problem. Co-developed-by: Li RongQing Signed-off-by: Li RongQing Signed-off-by: Yuan ZhaoXiong Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1622985115-51007-1-git-send-email-yuanzhaoxiong@baidu.com --- kernel/sched/core.c | 6 ++++-- kernel/sched/fair.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2883c22eef10..0c22cd026440 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -993,6 +993,7 @@ int get_nohz_timer_target(void) { int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; + const struct cpumask *hk_mask; if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { if (!idle_cpu(cpu)) @@ -1000,10 +1001,11 @@ int get_nohz_timer_target(void) default_cpu = cpu; } + hk_mask = housekeeping_cpumask(HK_FLAG_TIMER); + rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu_and(i, sched_domain_span(sd), - housekeeping_cpumask(HK_FLAG_TIMER)) { + for_each_cpu_and(i, sched_domain_span(sd), hk_mask) { if (cpu == i) continue; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 45edf61eed73..11d22943753f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10188,9 +10188,11 @@ static inline int on_null_domain(struct rq *rq) static inline int find_new_ilb(void) { int ilb; + const struct cpumask *hk_mask; - for_each_cpu_and(ilb, nohz.idle_cpus_mask, - housekeeping_cpumask(HK_FLAG_MISC)) { + hk_mask = housekeeping_cpumask(HK_FLAG_MISC); + + for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { if (ilb == smp_processor_id()) continue; -- cgit v1.2.3-59-g8ed1b From 1c6829cfd3d5124b125e6df41158665aea413b35 Mon Sep 17 00:00:00 2001 From: Mika Penttilä Date: Thu, 22 Jul 2021 09:39:46 +0300 Subject: sched/numa: Fix is_core_idle() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the loop variable instead of the function argument to test the other SMT siblings for idle. Fixes: ff7db0bf24db ("sched/numa: Prefer using an idle CPU as a migration target instead of comparing tasks") Signed-off-by: Mika Penttilä Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Acked-by: Pankaj Gupta Link: https://lkml.kernel.org/r/20210722063946.28951-1-mika.penttila@gmail.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11d22943753f..13c3fd40bd34 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1486,7 +1486,7 @@ static inline bool is_core_idle(int cpu) if (cpu == sibling) continue; - if (!idle_cpu(cpu)) + if (!idle_cpu(sibling)) return false; } #endif -- cgit v1.2.3-59-g8ed1b From f912d051619d11411867f642d2004928eb0b41b1 Mon Sep 17 00:00:00 2001 From: Wang Hui Date: Wed, 21 Jul 2021 17:11:09 +0800 Subject: sched: remove redundant on_rq status change activate_task/deactivate_task will change on_rq status, no need to do it again. Signed-off-by: Wang Hui Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210721091109.1406043-1-john.wanghui@huawei.com --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0c22cd026440..6c562ad1ad5d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5659,11 +5659,9 @@ static bool try_steal_cookie(int this, int that) if (p->core_occupation > dst->idle->core_occupation) goto next; - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src, p, 0); set_task_cpu(p, this); activate_task(dst, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; resched_curr(dst); -- cgit v1.2.3-59-g8ed1b From f95091536f78971b269ec321b057b8d630b0ad8a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 27 Jul 2021 11:11:01 +0100 Subject: sched/deadline: Fix reset_on_fork reporting of DL tasks It is possible for sched_getattr() to incorrectly report the state of the reset_on_fork flag when called on a deadline task. Indeed, if the flag was set on a deadline task using sched_setattr() with flags (SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_KEEP_PARAMS), then p->sched_reset_on_fork will be set, but __setscheduler() will bail out early, which means that the dl_se->flags will not get updated by __setscheduler_params()->__setparam_dl(). Consequently, if sched_getattr() is then called on the task, __getparam_dl() will override kattr.sched_flags with the now out-of-date copy in dl_se->flags and report the stale value to userspace. To fix this, make sure to only copy the flags that are relevant to sched_deadline to and from the dl_se->flags field. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210727101103.2729607-2-qperret@google.com --- kernel/sched/deadline.c | 7 ++++--- kernel/sched/sched.h | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aaacd6cfd42f..5cafc642e647 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2741,7 +2741,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; + dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } @@ -2754,7 +2754,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_runtime = dl_se->dl_runtime; attr->sched_deadline = dl_se->dl_deadline; attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; + attr->sched_flags &= ~SCHED_DL_FLAGS; + attr->sched_flags |= dl_se->flags; } /* @@ -2851,7 +2852,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) if (dl_se->dl_runtime != attr->sched_runtime || dl_se->dl_deadline != attr->sched_deadline || dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) + dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS)) return true; return false; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a1c6aeb9165..75a5c120e7d9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -227,6 +227,8 @@ static inline void update_avg(u64 *avg, u64 sample) */ #define SCHED_FLAG_SUGOV 0x10000000 +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) + static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) { #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -- cgit v1.2.3-59-g8ed1b From 7ad721bf10718a4e480a27ded8bb16b8f6feb2d1 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 27 Jul 2021 11:11:02 +0100 Subject: sched: Don't report SCHED_FLAG_SUGOV in sched_getattr() SCHED_FLAG_SUGOV is supposed to be a kernel-only flag that userspace cannot interact with. However, sched_getattr() currently reports it in sched_flags if called on a sugov worker even though it is not actually defined in a UAPI header. To avoid this, make sure to clean-up the sched_flags field in sched_getattr() before returning to userspace. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210727101103.2729607-3-qperret@google.com --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6c562ad1ad5d..314f70db3e5c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7530,6 +7530,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_priority = p->rt_priority; else kattr.sched_nice = task_nice(p); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK /* -- cgit v1.2.3-59-g8ed1b From 89aafd67f28c9e3b725aa30b44b7f61ad3e348ce Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Aug 2021 12:58:56 +0100 Subject: sched/fair: Use prev instead of new target as recent_used_cpu After select_idle_sibling, p->recent_used_cpu is set to the new target. However on the next wakeup, prev will be the same as recent_used_cpu unless the load balancer has moved the task since the last wakeup. It still works, but is less efficient than it could be. This patch preserves recent_used_cpu for longer. The impact on SIS efficiency is tiny so the SIS statistic patches were used to track the hit rate for using recent_used_cpu. With perf bench pipe on a 2-socket Cascadelake machine, the hit rate went from 57.14% to 85.32%. For more intensive wakeup loads like hackbench, the hit rate is almost negligible but rose from 0.21% to 6.64%. For scaling loads like tbench, the hit rate goes from almost 0% to 25.42% overall. Broadly speaking, on tbench, the success rate is much higher for lower thread counts and drops to almost 0 as the workload scales to towards saturation. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210804115857.6253-2-mgorman@techsingularity.net --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 13c3fd40bd34..7ee394b4b4cd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6347,6 +6347,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && @@ -6873,9 +6874,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - - if (want_affine) - current->recent_used_cpu = cpu; } rcu_read_unlock(); -- cgit v1.2.3-59-g8ed1b From 56498cfb045d7147cdcba33795d19429afcd1d00 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Aug 2021 12:58:57 +0100 Subject: sched/fair: Avoid a second scan of target in select_idle_cpu When select_idle_cpu starts scanning for an idle CPU, it starts with a target CPU that has already been checked by select_idle_sibling. This patch starts with the next CPU instead. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210804115857.6253-3-mgorman@techsingularity.net --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ee394b4b4cd..47a0fbf224b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6220,7 +6220,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); } - for_each_cpu_wrap(cpu, cpus, target) { + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits) -- cgit v1.2.3-59-g8ed1b From b4da13aa28d4fd0071247b7b41c579ee8a86c81a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 4 Aug 2021 15:59:25 +0200 Subject: sched/deadline: Fix missing clock update in migrate_task_rq_dl() A missing clock update is causing the following warning: rq->clock_update_flags < RQCF_ACT_SKIP WARNING: CPU: 112 PID: 2041 at kernel/sched/sched.h:1453 sub_running_bw.isra.0+0x190/0x1a0 ... CPU: 112 PID: 2041 Comm: sugov:112 Tainted: G W 5.14.0-rc1 #1 Hardware name: WIWYNN Mt.Jade Server System B81.030Z1.0007/Mt.Jade Motherboard, BIOS 1.6.20210526 (SCP: 1.06.20210526) 2021/05/26 ... Call trace: sub_running_bw.isra.0+0x190/0x1a0 migrate_task_rq_dl+0xf8/0x1e0 set_task_cpu+0xa8/0x1f0 try_to_wake_up+0x150/0x3d4 wake_up_q+0x64/0xc0 __up_write+0xd0/0x1c0 up_write+0x4c/0x2b0 cppc_set_perf+0x120/0x2d0 cppc_cpufreq_set_target+0xe0/0x1a4 [cppc_cpufreq] __cpufreq_driver_target+0x74/0x140 sugov_work+0x64/0x80 kthread_worker_fn+0xe0/0x230 kthread+0x138/0x140 ret_from_fork+0x10/0x18 The task causing this is the `cppc_fie` DL task introduced by commit 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance"). With CONFIG_ACPI_CPPC_CPUFREQ_FIE=y and schedutil cpufreq governor on slow-switching system (like on this Ampere Altra WIWYNN Mt. Jade Arm Server): DL task `curr=sugov:112` lets `p=cppc_fie` migrate and since the latter is in `non_contending` state, migrate_task_rq_dl() calls sub_running_bw()->__sub_running_bw()->cpufreq_update_util()-> rq_clock()->assert_clock_updated() on p. Fix this by updating the clock for a non_contending task in migrate_task_rq_dl() before calling sub_running_bw(). Reported-by: Bruno Goncalves Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Link: https://lore.kernel.org/r/20210804135925.3734605-1-dietmar.eggemann@arm.com --- kernel/sched/deadline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5cafc642e647..e94314633b39 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1733,6 +1733,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused */ raw_spin_rq_lock(rq); if (p->dl.dl_non_contending) { + update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* -- cgit v1.2.3-59-g8ed1b From ca4984a7dd863f3e1c0df775ae3e744bff24c303 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 5 Aug 2021 11:21:53 +0100 Subject: sched: Fix UCLAMP_FLAG_IDLE setting The UCLAMP_FLAG_IDLE flag is set on a runqueue when dequeueing the last uclamp active task (that is, when buckets.tasks reaches 0 for all buckets) to maintain the last uclamp.max and prevent blocked util from suddenly becoming visible. However, there is an asymmetry in how the flag is set and cleared which can lead to having the flag set whilst there are active tasks on the rq. Specifically, the flag is cleared in the uclamp_rq_inc() path, which is called at enqueue time, but set in uclamp_rq_dec_id() which is called both when dequeueing a task _and_ in the update_uclamp_active() path. As a result, when both uclamp_rq_{dec,ind}_id() are called from update_uclamp_active(), the flag ends up being set but not cleared, hence leaving the runqueue in a broken state. Fix this by clearing the flag in update_uclamp_active() as well. Fixes: e496187da710 ("sched/uclamp: Enforce last task's UCLAMP_MAX") Reported-by: Rick Yiu Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Qais Yousef Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20210805102154.590709-2-qperret@google.com --- kernel/sched/core.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 314f70db3e5c..df0480ad59b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1621,6 +1621,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (!p->uclamp[clamp_id].active) + return; + + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + + /* + * Make sure to clear the idle flag if we've transiently reached 0 + * active tasks on rq. + */ + if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} + static inline void uclamp_update_active(struct task_struct *p) { @@ -1644,12 +1661,8 @@ uclamp_update_active(struct task_struct *p) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - for_each_clamp_id(clamp_id) { - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - } - } + for_each_clamp_id(clamp_id) + uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); } -- cgit v1.2.3-59-g8ed1b From f4dddf90d58d77b48492b775868af4041a217f4c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 5 Aug 2021 11:21:54 +0100 Subject: sched: Skip priority checks with SCHED_FLAG_KEEP_PARAMS SCHED_FLAG_KEEP_PARAMS can be passed to sched_setattr to specify that the call must not touch scheduling parameters (nice or priority). This is particularly handy for uclamp when used in conjunction with SCHED_FLAG_KEEP_POLICY as that allows to issue a syscall that only impacts uclamp values. However, sched_setattr always checks whether the priorities and nice values passed in sched_attr are valid first, even if those never get used down the line. This is useless at best since userspace can trivially bypass this check to set the uclamp values by specifying low priorities. However, it is cumbersome to do so as there is no single expression of this that skips both RT and CFS checks at once. As such, userspace needs to query the task policy first with e.g. sched_getattr and then set sched_attr.sched_priority accordingly. This is racy and slower than a single call. As the priority and nice checks are useless when SCHED_FLAG_KEEP_PARAMS is specified, simply inherit them in this case to match the policy inheritance of SCHED_FLAG_KEEP_POLICY. Reported-by: Wei Wang Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Qais Yousef Link: https://lore.kernel.org/r/20210805102154.590709-3-qperret@google.com --- kernel/sched/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df0480ad59b0..433b4001e71e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7328,6 +7328,16 @@ err_size: return -E2BIG; } +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ + if (task_has_dl_policy(p)) + __getparam_dl(p, attr); + else if (task_has_rt_policy(p)) + attr->sched_priority = p->rt_priority; + else + attr->sched_nice = task_nice(p); +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. @@ -7389,6 +7399,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, rcu_read_unlock(); if (likely(p)) { + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); retval = sched_setattr(p, &attr); put_task_struct(p); } @@ -7537,12 +7549,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - if (task_has_dl_policy(p)) - __getparam_dl(p, &kattr); - else if (task_has_rt_policy(p)) - kattr.sched_priority = p->rt_priority; - else - kattr.sched_nice = task_nice(p); + get_params(p, &kattr); kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK -- cgit v1.2.3-59-g8ed1b From 746f5ea9c4283d98353c1cd41864aec475e0edbd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:15 +0200 Subject: sched: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210803141621.780504-33-bigeasy@linutronix.de --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 433b4001e71e..d67c5dd51c16 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9840,7 +9840,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cfs_constraints_mutex); ret = __cfs_schedulable(tg, period, quota); if (ret) @@ -9884,7 +9884,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); - put_online_cpus(); + cpus_read_unlock(); return ret; } -- cgit v1.2.3-59-g8ed1b From 0083242c93759dde353a963a90cb351c5c283379 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 18 Aug 2021 13:13:33 +0530 Subject: sched/topology: Skip updating masks for non-online nodes The scheduler currently expects NUMA node distances to be stable from init onwards, and as a consequence builds the related data structures once-and-for-all at init (see sched_init_numa()). Unfortunately, on some architectures node distance is unreliable for offline nodes and may very well change upon onlining. Skip over offline nodes during sched_init_numa(). Track nodes that have been onlined at least once, and trigger a build of a node's NUMA masks when it is first onlined post-init. Reported-by: Geetika Moolchandani Signed-off-by: Srikar Dronamraju Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210818074333.48645-1-srikar@linux.vnet.ibm.com --- kernel/sched/topology.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b77ad49dc14f..4e8698e62f07 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1482,6 +1482,8 @@ int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; + +static unsigned long __read_mostly *sched_numa_onlined_nodes; #endif /* @@ -1833,6 +1835,16 @@ void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + /* + * Distance information can be unreliable for + * offline nodes, defer building the node + * masks to its bringup. + * This relies on all unique distance values + * still being visible at init time. + */ + if (!node_online(j)) + continue; + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); @@ -1886,6 +1898,53 @@ void sched_init_numa(void) sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); + + sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL); + if (!sched_numa_onlined_nodes) + return; + + bitmap_zero(sched_numa_onlined_nodes, nr_node_ids); + for_each_online_node(i) + bitmap_set(sched_numa_onlined_nodes, i, 1); +} + +static void __sched_domains_numa_masks_set(unsigned int node) +{ + int i, j; + + /* + * NUMA masks are not built for offline nodes in sched_init_numa(). + * Thus, when a CPU of a never-onlined-before node gets plugged in, + * adding that new CPU to the right NUMA masks is not sufficient: the + * masks of that CPU's node must also be updated. + */ + if (test_bit(node, sched_numa_onlined_nodes)) + return; + + bitmap_set(sched_numa_onlined_nodes, node, 1); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (!node_online(j) || node == j) + continue; + + if (node_distance(j, node) > sched_domains_numa_distance[i]) + continue; + + /* Add remote nodes in our masks */ + cpumask_or(sched_domains_numa_masks[i][node], + sched_domains_numa_masks[i][node], + sched_domains_numa_masks[0][j]); + } + } + + /* + * A new node has been brought up, potentially changing the topology + * classification. + * + * Note that this is racy vs any use of sched_numa_topology_type :/ + */ + init_numa_topology_type(); } void sched_domains_numa_masks_set(unsigned int cpu) @@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu) int node = cpu_to_node(cpu); int i, j; + __sched_domains_numa_masks_set(node); + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { + if (!node_online(j)) + continue; + + /* Set ourselves in the remote node's masks */ if (node_distance(j, node) <= sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } -- cgit v1.2.3-59-g8ed1b From 304000390f88d049c85e9a0958ac5567f38816ee Mon Sep 17 00:00:00 2001 From: Josh Don Date: Thu, 29 Jul 2021 19:00:18 -0700 Subject: sched: Cgroup SCHED_IDLE support This extends SCHED_IDLE to cgroups. Interface: cgroup/cpu.idle. 0: default behavior 1: SCHED_IDLE Extending SCHED_IDLE to cgroups means that we incorporate the existing aspects of SCHED_IDLE; a SCHED_IDLE cgroup will count all of its descendant threads towards the idle_h_nr_running count of all of its ancestor cgroups. Thus, sched_idle_rq() will work properly. Additionally, SCHED_IDLE cgroups are configured with minimum weight. There are two key differences between the per-task and per-cgroup SCHED_IDLE interface: - The cgroup interface allows tasks within a SCHED_IDLE hierarchy to maintain their relative weights. The entity that is "idle" is the cgroup, not the tasks themselves. - Since the idle entity is the cgroup, our SCHED_IDLE wakeup preemption decision is not made by comparing the current task with the woken task, but rather by comparing their matching sched_entity. A typical use-case for this is a user that creates an idle and a non-idle subtree. The non-idle subtree will dominate competition vs the idle subtree, but the idle subtree will still be high priority vs other users on the system. The latter is accomplished via comparing matching sched_entity in the waken preemption path (this could also be improved by making the sched_idle_rq() decision dependent on the perspective of a specific task). For now, we maintain the existing SCHED_IDLE semantics. Future patches may make improvements that extend how we treat SCHED_IDLE entities. The per-task_group idle field is an integer that currently only holds either a 0 or a 1. This is explicitly typed as an integer to allow for further extensions to this API. For example, a negative value may indicate a highly latency-sensitive cgroup that should be preferred for preemption/placement/etc. Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20210730020019.1487127-2-joshdon@google.com --- kernel/sched/core.c | 25 +++++++ kernel/sched/debug.c | 3 + kernel/sched/fair.c | 197 ++++++++++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 8 +++ 4 files changed, 208 insertions(+), 25 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d67c5dd51c16..7fa6ce74f40e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10135,6 +10135,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->idle; +} + +static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 idle) +{ + return sched_group_set_idle(css_tg(css), idle); +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -10142,6 +10156,11 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, + { + .name = "idle", + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -10349,6 +10368,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, + { + .name = "idle", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7e08e3d947c2..49716228efb4 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -607,6 +607,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", + cfs_rq->idle_h_nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 47a0fbf224b2..6cd05f1d77ef 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -431,6 +431,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } +static int tg_is_idle(struct task_group *tg) +{ + return tg->idle > 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return cfs_rq->idle > 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + if (entity_is_task(se)) + return task_has_idle_policy(task_of(se)); + return cfs_rq_is_idle(group_cfs_rq(se)); +} + #else /* !CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -468,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } +static int tg_is_idle(struct task_group *tg) +{ + return 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + return 0; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline @@ -4812,6 +4844,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; @@ -4831,6 +4866,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) update_load_avg(qcfs_rq, se, 0); se_update_runnable(se); + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; } @@ -4875,39 +4913,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (se->on_rq) break; - cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += idle_task_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; } for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(qcfs_rq, se, UPDATE_TG); se_update_runnable(se); - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += idle_task_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; /* * One parent has been throttled and cfs_rq removed from the * list. Add it back to not break the leaf list. */ - if (throttled_hierarchy(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (throttled_hierarchy(qcfs_rq)) + list_add_leaf_cfs_rq(qcfs_rq); } /* At this point se is NULL and we are at root level*/ @@ -4920,9 +4964,9 @@ unthrottle_throttle: * assertion below. */ for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (list_add_leaf_cfs_rq(cfs_rq)) + if (list_add_leaf_cfs_rq(qcfs_rq)) break; } @@ -5545,6 +5589,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; @@ -5562,6 +5609,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; @@ -5639,6 +5689,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; @@ -5668,6 +5721,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; @@ -7010,24 +7066,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->last = se; } } static void set_next_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->next = se; } } @@ -7048,6 +7102,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; + int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) return; @@ -7092,8 +7147,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; find_matching_se(&se, &pse); - update_curr(cfs_rq_of(se)); BUG_ON(!pse); + + cse_is_idle = se_is_idle(se); + pse_is_idle = se_is_idle(pse); + + /* + * Preempt an idle group in favor of a non-idle group (and don't preempt + * in the inverse case). + */ + if (cse_is_idle && !pse_is_idle) + goto preempt; + if (cse_is_idle != pse_is_idle) + return; + + update_curr(cfs_rq_of(se)); if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is @@ -11387,10 +11455,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, static DEFINE_MUTEX(shares_mutex); -int sched_group_set_shares(struct task_group *tg, unsigned long shares) +static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + lockdep_assert_held(&shares_mutex); + /* * We can't change the weight of the root cgroup. */ @@ -11399,9 +11469,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - mutex_lock(&shares_mutex); if (tg->shares == shares) - goto done; + return 0; tg->shares = shares; for_each_possible_cpu(i) { @@ -11419,10 +11488,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) rq_unlock_irqrestore(rq, &rf); } -done: + return 0; +} + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int ret; + + mutex_lock(&shares_mutex); + if (tg_is_idle(tg)) + ret = -EINVAL; + else + ret = __sched_group_set_shares(tg, shares); + mutex_unlock(&shares_mutex); + + return ret; +} + +int sched_group_set_idle(struct task_group *tg, long idle) +{ + int i; + + if (tg == &root_task_group) + return -EINVAL; + + if (idle < 0 || idle > 1) + return -EINVAL; + + mutex_lock(&shares_mutex); + + if (tg->idle == idle) { + mutex_unlock(&shares_mutex); + return 0; + } + + tg->idle = idle; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se = tg->se[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + bool was_idle = cfs_rq_is_idle(grp_cfs_rq); + long idle_task_delta; + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + + grp_cfs_rq->idle = idle; + if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) + goto next_cpu; + + idle_task_delta = grp_cfs_rq->h_nr_running - + grp_cfs_rq->idle_h_nr_running; + if (!cfs_rq_is_idle(grp_cfs_rq)) + idle_task_delta *= -1; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!se->on_rq) + break; + + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* Already accounted at parent level and above. */ + if (cfs_rq_is_idle(cfs_rq)) + break; + } + +next_cpu: + rq_unlock_irqrestore(rq, &rf); + } + + /* Idle groups have minimum weight. */ + if (tg_is_idle(tg)) + __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO)); + else + __sched_group_set_shares(tg, NICE_0_LOAD); + mutex_unlock(&shares_mutex); return 0; } + #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 75a5c120e7d9..5fa02902c143 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -396,6 +396,9 @@ struct task_group { struct cfs_rq **cfs_rq; unsigned long shares; + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; + #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put @@ -505,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern int sched_group_set_idle(struct task_group *tg, long idle); + #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); @@ -601,6 +606,9 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + /* Locally cached copy of our task_group's idle value */ + int idle; + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; -- cgit v1.2.3-59-g8ed1b From 9ae606bc74dd0e58d4de894e3c5cbb9d45599267 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:28 +0100 Subject: sched: Introduce task_cpu_possible_mask() to limit fallback rq selection Asymmetric systems may not offer the same level of userspace ISA support across all CPUs, meaning that some applications cannot be executed by some CPUs. As a concrete example, upcoming arm64 big.LITTLE designs do not feature support for 32-bit applications on both clusters. On such a system, we must take care not to migrate a task to an unsupported CPU when forcefully moving tasks in select_fallback_rq() in response to a CPU hot-unplug operation. Introduce a task_cpu_possible_mask() hook which, given a task argument, allows an architecture to return a cpumask of CPUs that are capable of executing that task. The default implementation returns the cpu_possible_mask, since sane machines do not suffer from per-cpu ISA limitations that affect scheduling. The new mask is used when selecting the fallback runqueue as a last resort before forcing a migration to the first active CPU. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Quentin Perret Link: https://lore.kernel.org/r/20210730112443.23245-2-will@kernel.org --- include/linux/mmu_context.h | 14 ++++++++++++++ kernel/sched/core.c | 9 +++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index 03dee12d2b61..b9b970f7ab45 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -14,4 +14,18 @@ static inline void leave_mm(int cpu) { } #endif +/* + * CPUs that are capable of running user task @p. Must contain at least one + * active CPU. It is assumed that the kernel can run on all CPUs, so calling + * this for a kernel thread is pointless. + * + * By default, we assume a sane, homogeneous system. + */ +#ifndef task_cpu_possible_mask +# define task_cpu_possible_mask(p) cpu_possible_mask +# define task_cpu_possible(cpu, p) true +#else +# define task_cpu_possible(cpu, p) cpumask_test_cpu((cpu), task_cpu_possible_mask(p)) +#endif + #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7fa6ce74f40e..6f31267c4beb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2173,7 +2173,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) /* Non kernel threads are not allowed during either online or offline. */ if (!(p->flags & PF_KTHREAD)) - return cpu_active(cpu); + return cpu_active(cpu) && task_cpu_possible(cpu, p); /* KTHREAD_IS_PER_CPU is always allowed. */ if (kthread_is_per_cpu(p)) @@ -3124,9 +3124,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) + if (is_cpu_allowed(p, dest_cpu)) return dest_cpu; } } @@ -3156,10 +3154,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * * More yuck to audit. */ - do_set_cpus_allowed(p, cpu_possible_mask); + do_set_cpus_allowed(p, task_cpu_possible_mask(p)); state = fail; break; - case fail: BUG(); break; -- cgit v1.2.3-59-g8ed1b From d4b96fb92ae7fe7533e11e662504d96161928575 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:29 +0100 Subject: cpuset: Don't use the cpu_possible_mask as a last resort for cgroup v1 If the scheduler cannot find an allowed CPU for a task, cpuset_cpus_allowed_fallback() will widen the affinity to cpu_possible_mask if cgroup v1 is in use. In preparation for allowing architectures to provide their own fallback mask, just return early if we're either using cgroup v1 or we're using cgroup v2 with a mask that contains invalid CPUs. This will allow select_fallback_rq() to figure out the mask by itself. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Quentin Perret Link: https://lkml.kernel.org/r/20210730112443.23245-3-will@kernel.org --- include/linux/cpuset.h | 1 + kernel/cgroup/cpuset.c | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 04c20de66afc..ed6ec677dd6b 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #ifdef CONFIG_CPUSETS diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index adb5190c4429..a8693783f385 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3322,9 +3322,13 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + const struct cpumask *cs_mask; + rcu_read_lock(); - do_set_cpus_allowed(tsk, is_in_v2_mode() ? - task_cs(tsk)->cpus_allowed : cpu_possible_mask); + cs_mask = task_cs(tsk)->cpus_allowed; + if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) + do_set_cpus_allowed(tsk, cs_mask); rcu_read_unlock(); /* -- cgit v1.2.3-59-g8ed1b From 431c69fac05baa7477d61a44f2708e069f2bed6c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:30 +0100 Subject: cpuset: Honour task_cpu_possible_mask() in guarantee_online_cpus() Asymmetric systems may not offer the same level of userspace ISA support across all CPUs, meaning that some applications cannot be executed by some CPUs. As a concrete example, upcoming arm64 big.LITTLE designs do not feature support for 32-bit applications on both clusters. Modify guarantee_online_cpus() to take task_cpu_possible_mask() into account when trying to find a suitable set of online CPUs for a given task. This will avoid passing an invalid mask to set_cpus_allowed_ptr() during ->attach() and will subsequently allow the cpuset hierarchy to be taken into account when forcefully overriding the affinity mask for a task which requires migration to a compatible CPU. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210730112443.23245-4-will@kernel.org --- include/linux/cpuset.h | 2 +- kernel/cgroup/cpuset.c | 43 ++++++++++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index ed6ec677dd6b..414a8e694413 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -185,7 +185,7 @@ static inline void cpuset_read_unlock(void) { } static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { - cpumask_copy(mask, cpu_possible_mask); + cpumask_copy(mask, task_cpu_possible_mask(p)); } static inline void cpuset_cpus_allowed_fallback(struct task_struct *p) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a8693783f385..391813245cb2 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -372,18 +372,29 @@ static inline bool is_in_v2_mode(void) } /* - * Return in pmask the portion of a cpusets's cpus_allowed that - * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. + * Return in pmask the portion of a task's cpusets's cpus_allowed that + * are online and are capable of running the task. If none are found, + * walk up the cpuset hierarchy until we find one that does have some + * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * * Call with callback_lock or cpuset_mutex held. */ -static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) +static void guarantee_online_cpus(struct task_struct *tsk, + struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + struct cpuset *cs; + + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) + cpumask_copy(pmask, cpu_online_mask); + + rcu_read_lock(); + cs = task_cs(tsk); + + while (!cpumask_intersects(cs->effective_cpus, pmask)) { cs = parent_cs(cs); if (unlikely(!cs)) { /* @@ -393,11 +404,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ - cpumask_copy(pmask, cpu_online_mask); - return; + goto out_unlock; } } - cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); + cpumask_and(pmask, pmask, cs->effective_cpus); + +out_unlock: + rcu_read_unlock(); } /* @@ -2199,15 +2212,13 @@ static void cpuset_attach(struct cgroup_taskset *tset) percpu_down_write(&cpuset_rwsem); - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, css, tset) { + if (cs != &top_cpuset) + guarantee_online_cpus(task, cpus_attach); + else + cpumask_copy(cpus_attach, task_cpu_possible_mask(task)); /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -3302,9 +3313,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) unsigned long flags; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); - guarantee_online_cpus(task_cs(tsk), pmask); - rcu_read_unlock(); + guarantee_online_cpus(tsk, pmask); spin_unlock_irqrestore(&callback_lock, flags); } -- cgit v1.2.3-59-g8ed1b From 97c0054dbe2c3c59d1156fd233f2d44e91981c8e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:31 +0100 Subject: cpuset: Cleanup cpuset_cpus_allowed_fallback() use in select_fallback_rq() select_fallback_rq() only needs to recheck for an allowed CPU if the affinity mask of the task has changed since the last check. Return a 'bool' from cpuset_cpus_allowed_fallback() to indicate whether the affinity mask was updated, and use this to elide the allowed check when the mask has been left alone. No functional change. Suggested-by: Valentin Schneider Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20210730112443.23245-5-will@kernel.org --- include/linux/cpuset.h | 5 +++-- kernel/cgroup/cpuset.c | 10 ++++++++-- kernel/sched/core.c | 3 +-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 414a8e694413..d2b9c41c8edf 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -59,7 +59,7 @@ extern void cpuset_wait_for_hotplug(void); extern void cpuset_read_lock(void); extern void cpuset_read_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); -extern void cpuset_cpus_allowed_fallback(struct task_struct *p); +extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -188,8 +188,9 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_copy(mask, task_cpu_possible_mask(p)); } -static inline void cpuset_cpus_allowed_fallback(struct task_struct *p) +static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) { + return false; } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 391813245cb2..6500cbe0ce16 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3327,17 +3327,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) * which will not contain a sane cpumask during cases such as cpu hotplugging. * This is the absolute last resort for the scheduler and it is only used if * _every_ other avenue has been traveled. + * + * Returns true if the affinity of @tsk was changed, false otherwise. **/ -void cpuset_cpus_allowed_fallback(struct task_struct *tsk) +bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); const struct cpumask *cs_mask; + bool changed = false; rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; - if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) + if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { do_set_cpus_allowed(tsk, cs_mask); + changed = true; + } rcu_read_unlock(); /* @@ -3357,6 +3362,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) * select_fallback_rq() will fix things ups and set cpu_possible_mask * if required. */ + return changed; } void __init cpuset_init_current_mems_allowed(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6f31267c4beb..b9d4bae922a8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3141,8 +3141,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - if (IS_ENABLED(CONFIG_CPUSETS)) { - cpuset_cpus_allowed_fallback(p); + if (cpuset_cpus_allowed_fallback(p)) { state = possible; break; } -- cgit v1.2.3-59-g8ed1b From 234a503e670be01f72841be9fcf68dfb89a1fa8b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:32 +0100 Subject: sched: Reject CPU affinity changes based on task_cpu_possible_mask() Reject explicit requests to change the affinity mask of a task via set_cpus_allowed_ptr() if the requested mask is not a subset of the mask returned by task_cpu_possible_mask(). This ensures that the 'cpus_mask' for a given task cannot contain CPUs which are incapable of executing it, except in cases where the affinity is forced. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Quentin Perret Link: https://lore.kernel.org/r/20210730112443.23245-6-will@kernel.org --- kernel/sched/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b9d4bae922a8..8cec0d24c88c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2709,7 +2709,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { + const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; + bool kthread = p->flags & PF_KTHREAD; unsigned int dest_cpu; struct rq_flags rf; struct rq *rq; @@ -2718,7 +2720,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { + if (kthread || is_migration_disabled(p)) { /* * Kernel threads are allowed on online && !active CPUs, * however, during cpu-hot-unplug, even these might get pushed @@ -2732,6 +2734,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, cpu_valid_mask = cpu_online_mask; } + if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { + ret = -EINVAL; + goto out; + } + /* * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. -- cgit v1.2.3-59-g8ed1b From b90ca8badbd11488e5f762346b028666808164e7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:33 +0100 Subject: sched: Introduce task_struct::user_cpus_ptr to track requested affinity In preparation for saving and restoring the user-requested CPU affinity mask of a task, add a new cpumask_t pointer to 'struct task_struct'. If the pointer is non-NULL, then the mask is copied across fork() and freed on task exit. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20210730112443.23245-7-will@kernel.org --- include/linux/sched.h | 13 +++++++++++++ init/init_task.c | 1 + kernel/fork.c | 2 ++ kernel/sched/core.c | 20 ++++++++++++++++++++ 4 files changed, 36 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 50db9496c99d..2c5d638daaad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -748,6 +748,7 @@ struct task_struct { unsigned int policy; int nr_cpus_allowed; const cpumask_t *cpus_ptr; + cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; #ifdef CONFIG_SMP @@ -1706,6 +1707,8 @@ extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_ #ifdef CONFIG_SMP extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); +extern void release_user_cpus_ptr(struct task_struct *p); #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { @@ -1716,6 +1719,16 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma return -EINVAL; return 0; } +static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) +{ + if (src->user_cpus_ptr) + return -EINVAL; + return 0; +} +static inline void release_user_cpus_ptr(struct task_struct *p) +{ + WARN_ON(p->user_cpus_ptr); +} #endif extern int yield_to(struct task_struct *p, bool preempt); diff --git a/init/init_task.c b/init/init_task.c index 562f2ef8d157..2d024066e27b 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -80,6 +80,7 @@ struct task_struct init_task .normal_prio = MAX_PRIO - 20, .policy = SCHED_NORMAL, .cpus_ptr = &init_task.cpus_mask, + .user_cpus_ptr = NULL, .cpus_mask = CPU_MASK_ALL, .nr_cpus_allowed= NR_CPUS, .mm = NULL, diff --git a/kernel/fork.c b/kernel/fork.c index 1a9af73b47c1..5d7addf0c41a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -446,6 +446,7 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { + release_user_cpus_ptr(tsk); scs_release(tsk); #ifndef CONFIG_THREAD_INFO_IN_TASK @@ -919,6 +920,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #endif if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; + dup_user_cpus_ptr(tsk, orig, node); /* * One for the user space visible state that goes away when reaped. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8cec0d24c88c..360a3ec6d03b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2480,6 +2480,26 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) __do_set_cpus_allowed(p, new_mask, 0); } +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, + int node) +{ + if (!src->user_cpus_ptr) + return 0; + + dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); + if (!dst->user_cpus_ptr) + return -ENOMEM; + + cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + return 0; +} + +void release_user_cpus_ptr(struct task_struct *p) +{ + kfree(p->user_cpus_ptr); + p->user_cpus_ptr = NULL; +} + /* * This function is wildly self concurrent; here be dragons. * -- cgit v1.2.3-59-g8ed1b From db3b02ae896e88b6bb7a95c1373602e87e0de84c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:34 +0100 Subject: sched: Split the guts of sched_setaffinity() into a helper function In preparation for replaying user affinity requests using a saved mask, split sched_setaffinity() up so that the initial task lookup and security checks are only performed when the request is coming directly from userspace. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20210730112443.23245-8-will@kernel.org --- kernel/sched/core.c | 105 ++++++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 48 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 360a3ec6d03b..672d0fcbf2ef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7594,53 +7594,22 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) { - cpumask_var_t cpus_allowed, new_mask; - struct task_struct *p; int retval; + cpumask_var_t cpus_allowed, new_mask; - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); - return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) + return -ENOMEM; - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; - } - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; - } if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { retval = -ENOMEM; goto out_free_cpus_allowed; } - retval = -EPERM; - if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - goto out_free_new_mask; - } - rcu_read_unlock(); - } - - retval = security_task_setscheduler(p); - if (retval) - goto out_free_new_mask; - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); + cpumask_and(new_mask, mask, cpus_allowed); /* * Since bandwidth control happens on root_domain basis, @@ -7661,23 +7630,63 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) #endif again: retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); + if (retval) + goto out_free_new_mask; - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset update. + * Just reset the cpumask to the cpuset's cpus_allowed. + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; } + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: free_cpumask_var(cpus_allowed); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + struct task_struct *p; + int retval; + + rcu_read_lock(); + + p = find_process_by_pid(pid); + if (!p) { + rcu_read_unlock(); + return -ESRCH; + } + + /* Prevent p going away */ + get_task_struct(p); + rcu_read_unlock(); + + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } + + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + retval = -EPERM; + goto out_put_task; + } + rcu_read_unlock(); + } + + retval = security_task_setscheduler(p); + if (retval) + goto out_put_task; + + retval = __sched_setaffinity(p, in_mask); out_put_task: put_task_struct(p); return retval; -- cgit v1.2.3-59-g8ed1b From 07ec77a1d4e82526e1588979fff2f024f8e96df2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:35 +0100 Subject: sched: Allow task CPU affinity to be restricted on asymmetric systems Asymmetric systems may not offer the same level of userspace ISA support across all CPUs, meaning that some applications cannot be executed by some CPUs. As a concrete example, upcoming arm64 big.LITTLE designs do not feature support for 32-bit applications on both clusters. Although userspace can carefully manage the affinity masks for such tasks, one place where it is particularly problematic is execve() because the CPU on which the execve() is occurring may be incompatible with the new application image. In such a situation, it is desirable to restrict the affinity mask of the task and ensure that the new image is entered on a compatible CPU. From userspace's point of view, this looks the same as if the incompatible CPUs have been hotplugged off in the task's affinity mask. Similarly, if a subsequent execve() reverts to a compatible image, then the old affinity is restored if it is still valid. In preparation for restricting the affinity mask for compat tasks on arm64 systems without uniform support for 32-bit applications, introduce {force,relax}_compatible_cpus_allowed_ptr(), which respectively restrict and restore the affinity mask for a task based on the compatible CPUs. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Quentin Perret Link: https://lore.kernel.org/r/20210730112443.23245-9-will@kernel.org --- include/linux/sched.h | 2 + kernel/sched/core.c | 198 +++++++++++++++++++++++++++++++++++++++++++++----- kernel/sched/sched.h | 1 + 3 files changed, 183 insertions(+), 18 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c5d638daaad..ce2d5cfc331e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1709,6 +1709,8 @@ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); +extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); +extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 672d0fcbf2ef..6ee197049c9c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2494,10 +2494,18 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, return 0; } +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = NULL; + + swap(p->user_cpus_ptr, user_mask); + + return user_mask; +} + void release_user_cpus_ptr(struct task_struct *p) { - kfree(p->user_cpus_ptr); - p->user_cpus_ptr = NULL; + kfree(clear_user_cpus_ptr(p)); } /* @@ -2717,27 +2725,23 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } /* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. + * Called with both p->pi_lock and rq->lock held; drops both before returning. */ -static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags) +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags, + struct rq *rq, + struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) { const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; bool kthread = p->flags & PF_KTHREAD; + struct cpumask *user_mask = NULL; unsigned int dest_cpu; - struct rq_flags rf; - struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &rf); update_rq_clock(rq); if (kthread || is_migration_disabled(p)) { @@ -2793,20 +2797,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, __do_set_cpus_allowed(p, new_mask, flags); - return affine_move_task(rq, p, &rf, dest_cpu, flags); + if (flags & SCA_USER) + user_mask = clear_user_cpus_ptr(p); + + ret = affine_move_task(rq, p, rf, dest_cpu, flags); + + kfree(user_mask); + + return ret; out: - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, rf); return ret; } +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, u32 flags) +{ + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf); +} + int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { return __set_cpus_allowed_ptr(p, new_mask, 0); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +/* + * Change a given task's CPU affinity to the intersection of its current + * affinity mask and @subset_mask, writing the resulting mask to @new_mask + * and pointing @p->user_cpus_ptr to a copy of the old mask. + * If the resulting mask is empty, leave the affinity unchanged and return + * -EINVAL. + */ +static int restrict_cpus_allowed_ptr(struct task_struct *p, + struct cpumask *new_mask, + const struct cpumask *subset_mask) +{ + struct cpumask *user_mask = NULL; + struct rq_flags rf; + struct rq *rq; + int err; + + if (!p->user_cpus_ptr) { + user_mask = kmalloc(cpumask_size(), GFP_KERNEL); + if (!user_mask) + return -ENOMEM; + } + + rq = task_rq_lock(p, &rf); + + /* + * Forcefully restricting the affinity of a deadline task is + * likely to cause problems, so fail and noisily override the + * mask entirely. + */ + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + err = -EPERM; + goto err_unlock; + } + + if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { + err = -EINVAL; + goto err_unlock; + } + + /* + * We're about to butcher the task affinity, so keep track of what + * the user asked for in case we're able to restore it later on. + */ + if (user_mask) { + cpumask_copy(user_mask, p->cpus_ptr); + p->user_cpus_ptr = user_mask; + } + + return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf); + +err_unlock: + task_rq_unlock(rq, p, &rf); + kfree(user_mask); + return err; +} + +/* + * Restrict the CPU affinity of task @p so that it is a subset of + * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the + * old affinity mask. If the resulting mask is empty, we warn and walk + * up the cpuset hierarchy until we find a suitable mask. + */ +void force_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + cpumask_var_t new_mask; + const struct cpumask *override_mask = task_cpu_possible_mask(p); + + alloc_cpumask_var(&new_mask, GFP_KERNEL); + + /* + * __migrate_task() can fail silently in the face of concurrent + * offlining of the chosen destination CPU, so take the hotplug + * lock to ensure that the migration succeeds. + */ + cpus_read_lock(); + if (!cpumask_available(new_mask)) + goto out_set_mask; + + if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) + goto out_free_mask; + + /* + * We failed to find a valid subset of the affinity mask for the + * task, so override it based on its cpuset hierarchy. + */ + cpuset_cpus_allowed(p, new_mask); + override_mask = new_mask; + +out_set_mask: + if (printk_ratelimit()) { + printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", + task_pid_nr(p), p->comm, + cpumask_pr_args(override_mask)); + } + + WARN_ON(set_cpus_allowed_ptr(p, override_mask)); +out_free_mask: + cpus_read_unlock(); + free_cpumask_var(new_mask); +} + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); + +/* + * Restore the affinity of a task @p which was previously restricted by a + * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) + * @p->user_cpus_ptr. + * + * It is the caller's responsibility to serialise this with any calls to + * force_compatible_cpus_allowed_ptr(@p). + */ +void relax_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = p->user_cpus_ptr; + unsigned long flags; + + /* + * Try to restore the old affinity mask. If this fails, then + * we free the mask explicitly to avoid it being inherited across + * a subsequent fork(). + */ + if (!user_mask || !__sched_setaffinity(p, user_mask)) + return; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + user_mask = clear_user_cpus_ptr(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + kfree(user_mask); +} + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -7629,7 +7791,7 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask) } #endif again: - retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); + retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); if (retval) goto out_free_new_mask; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5fa02902c143..e7e2bba5b520 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2244,6 +2244,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 #define SCA_MIGRATE_ENABLE 0x04 +#define SCA_USER 0x08 #ifdef CONFIG_SMP -- cgit v1.2.3-59-g8ed1b From 234b8ab6476c5edd5262e2ff563de9498d60044a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:36 +0100 Subject: sched: Introduce dl_task_check_affinity() to check proposed affinity In preparation for restricting the affinity of a task during execve() on arm64, introduce a new dl_task_check_affinity() helper function to give an indication as to whether the restricted mask is admissible for a deadline task. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Link: https://lore.kernel.org/r/20210730112443.23245-10-will@kernel.org --- include/linux/sched.h | 6 ++++++ kernel/sched/core.c | 46 +++++++++++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ce2d5cfc331e..3bb9fecfdaa1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1709,6 +1709,7 @@ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); +extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); #else @@ -1731,6 +1732,11 @@ static inline void release_user_cpus_ptr(struct task_struct *p) { WARN_ON(p->user_cpus_ptr); } + +static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) +{ + return 0; +} #endif extern int yield_to(struct task_struct *p, bool preempt); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6ee197049c9c..a22cc3c156ce 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7756,6 +7756,32 @@ out_unlock: return retval; } +#ifdef CONFIG_SMP +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) +{ + int ret = 0; + + /* + * If the task isn't a deadline task or admission control is + * disabled then we don't care about affinity changes. + */ + if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) + return 0; + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, mask)) + ret = -EBUSY; + rcu_read_unlock(); + return ret; +} +#endif + static int __sched_setaffinity(struct task_struct *p, const struct cpumask *mask) { @@ -7773,23 +7799,9 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask) cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, mask, cpus_allowed); - /* - * Since bandwidth control happens on root_domain basis, - * if admission test is enabled, we only admit -deadline - * tasks allowed to run on all the CPUs in the task's - * root_domain. - */ -#ifdef CONFIG_SMP - if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { - rcu_read_lock(); - if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { - retval = -EBUSY; - rcu_read_unlock(); - goto out_free_new_mask; - } - rcu_read_unlock(); - } -#endif + retval = dl_task_check_affinity(p, new_mask); + if (retval) + goto out_free_new_mask; again: retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); if (retval) -- cgit v1.2.3-59-g8ed1b From 366e7ad6ba5f4cb2ffd0b7316e404d6ee9c0f401 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Aug 2021 10:47:09 +0200 Subject: sched/fair: Mark tg_is_idle() an inline in the !CONFIG_FAIR_GROUP_SCHED case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's not actually used in the !CONFIG_FAIR_GROUP_SCHED case: kernel/sched/fair.c:488:12: warning: ‘tg_is_idle’ defined but not used [-Wunused-function] Keep around a placeholder nevertheless, for API completeness. Mark it inline, so the compiler doesn't think it must be used. Fixes: 304000390f88: ("sched: Cgroup SCHED_IDLE support") Signed-off-by: Ingo Molnar Cc: Josh Don Cc: Peter Zijlstra (Intel) Cc: Vincent Guittot --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6cd05f1d77ef..7b3e85912a1f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -485,7 +485,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } -static int tg_is_idle(struct task_group *tg) +static inline int tg_is_idle(struct task_group *tg) { return 0; } -- cgit v1.2.3-59-g8ed1b From b542e383d8c005f06a131e2b40d5889b812f19c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 13:01:59 +0200 Subject: eventfd: Make signal recursion protection a task bit The recursion protection for eventfd_signal() is based on a per CPU variable and relies on the !RT semantics of spin_lock_irqsave() for protecting this per CPU variable. On RT kernels spin_lock_irqsave() neither disables preemption nor interrupts which allows the spin lock held section to be preempted. If the preempting task invokes eventfd_signal() as well, then the recursion warning triggers. Paolo suggested to protect the per CPU variable with a local lock, but that's heavyweight and actually not necessary. The goal of this protection is to prevent the task stack from overflowing, which can be achieved with a per task recursion protection as well. Replace the per CPU variable with a per task bit similar to other recursion protection bits like task_struct::in_page_owner. This works on both !RT and RT kernels and removes as a side effect the extra per CPU storage. No functional change for !RT kernels. Reported-by: Daniel Bristot de Oliveira Signed-off-by: Thomas Gleixner Tested-by: Daniel Bristot de Oliveira Acked-by: Jason Wang Cc: Al Viro Link: https://lore.kernel.org/r/87wnp9idso.ffs@tglx --- fs/aio.c | 2 +- fs/eventfd.c | 12 +++++------- include/linux/eventfd.h | 11 +++++------ include/linux/sched.h | 4 ++++ 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 76ce0cc3ee4e..51b08ab01dff 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); req->done = true; - if (iocb->ki_eventfd && eventfd_signal_count()) { + if (iocb->ki_eventfd && eventfd_signal_allowed()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); schedule_work(&req->work); diff --git a/fs/eventfd.c b/fs/eventfd.c index e265b6dd4f34..3627dd7d25db 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -25,8 +25,6 @@ #include #include -DEFINE_PER_CPU(int, eventfd_wake_count); - static DEFINE_IDA(eventfd_ida); struct eventfd_ctx { @@ -67,21 +65,21 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) * Deadlock or stack overflow issues can happen if we recurse here * through waitqueue wakeup handlers. If the caller users potentially * nested waitqueues with custom wakeup handlers, then it should - * check eventfd_signal_count() before calling this function. If - * it returns true, the eventfd_signal() call should be deferred to a + * check eventfd_signal_allowed() before calling this function. If + * it returns false, the eventfd_signal() call should be deferred to a * safe context. */ - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) + if (WARN_ON_ONCE(current->in_eventfd_signal)) return 0; spin_lock_irqsave(&ctx->wqh.lock, flags); - this_cpu_inc(eventfd_wake_count); + current->in_eventfd_signal = 1; if (ULLONG_MAX - ctx->count < n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); - this_cpu_dec(eventfd_wake_count); + current->in_eventfd_signal = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index fa0a524baed0..305d5f19093b 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -14,6 +14,7 @@ #include #include #include +#include /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining @@ -43,11 +44,9 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *w __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); -DECLARE_PER_CPU(int, eventfd_wake_count); - -static inline bool eventfd_signal_count(void) +static inline bool eventfd_signal_allowed(void) { - return this_cpu_read(eventfd_wake_count); + return !current->in_eventfd_signal; } #else /* CONFIG_EVENTFD */ @@ -78,9 +77,9 @@ static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, return -ENOSYS; } -static inline bool eventfd_signal_count(void) +static inline bool eventfd_signal_allowed(void) { - return false; + return true; } static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) diff --git a/include/linux/sched.h b/include/linux/sched.h index 3bb9fecfdaa1..6421a9a8b4ca 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -864,6 +864,10 @@ struct task_struct { /* Used by page_owner=on to detect recursion in page tracking. */ unsigned in_page_owner:1; #endif +#ifdef CONFIG_EVENTFD + /* Recursion prevention for eventfd_signal() */ + unsigned in_eventfd_signal:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ -- cgit v1.2.3-59-g8ed1b