diff options
Diffstat (limited to 'kernel/sched')
37 files changed, 12662 insertions, 5116 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 21fb5a5662b5..976092b7bd45 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,12 +1,18 @@ # SPDX-License-Identifier: GPL-2.0 -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) -endif + +# The compilers are complaining about unused variables inside an if(0) scope +# block. This is daft, shut them up. +ccflags-y += $(call cc-disable-warning, unused-but-set-variable) # These files are disabled because they produce non-interesting flaky coverage # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n +# Disable KCSAN to avoid excessive noise and performance degradation. To avoid +# false positives ensure barriers implied by sched functions are instrumented. +KCSAN_SANITIZE := n +KCSAN_INSTRUMENT_BARRIERS := y + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond @@ -16,17 +22,13 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle.o fair.o rt.o deadline.o -obj-y += wait.o wait_bit.o swait.o completion.o - -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o -obj-$(CONFIG_SCHEDSTATS) += stats.o -obj-$(CONFIG_SCHED_DEBUG) += debug.o -obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -obj-$(CONFIG_CPU_FREQ) += cpufreq.o -obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o -obj-$(CONFIG_CPU_ISOLATION) += isolation.o -obj-$(CONFIG_PSI) += psi.o +# +# Build efficiency: +# +# These compilation units have roughly the same size and complexity - so their +# build parallelizes well and finishes roughly at once: +# +obj-y += core.o +obj-y += fair.o +obj-y += build_policy.o +obj-y += build_utility.o diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2067080bb235..991fc9002535 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,20 +1,42 @@ // SPDX-License-Identifier: GPL-2.0 + /* * Auto-group scheduling implementation: */ -#include <linux/nospec.h> -#include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_autogroup_sysctls[] = { + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static void __init sched_autogroup_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_autogroup_sysctls); +} +#else +#define sched_autogroup_sysctl_init() do { } while (0) +#endif + void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; + sched_autogroup_sysctl_init(); } void autogroup_free(struct task_group *tg) @@ -31,7 +53,7 @@ static inline void autogroup_destroy(struct kref *kref) ag->tg->rt_se = NULL; ag->tg->rt_rq = NULL; #endif - sched_offline_group(ag->tg); + sched_release_group(ag->tg); sched_destroy_group(ag->tg); } @@ -139,7 +161,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) struct task_struct *t; unsigned long flags; - BUG_ON(!lock_task_sighand(p, &flags)); + if (WARN_ON_ONCE(!lock_task_sighand(p, &flags))) + return; prev = p->signal->autogroup; if (prev == ag) { diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index b96419974a1f..90d69f2c5eaf 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_SCHED_AUTOGROUP_H +#define _KERNEL_SCHED_AUTOGROUP_H + #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { @@ -27,6 +30,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { + extern unsigned int sysctl_sched_autogroup_enabled; int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); if (enabled && task_wants_autogroup(p, tg)) @@ -58,3 +62,5 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) } #endif /* CONFIG_SCHED_AUTOGROUP */ + +#endif /* _KERNEL_SCHED_AUTOGROUP_H */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c new file mode 100644 index 000000000000..d9dc9ab3773f --- /dev/null +++ b/kernel/sched/build_policy.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * These are the scheduling policy related scheduler files, built + * in a single compilation unit for build efficiency reasons. + * + * ( Incidentally, the size of the compilation unit is roughly + * comparable to core.c and fair.c, the other two big + * compilation units. This helps balance build time, while + * coalescing source files to amortize header inclusion + * cost. ) + * + * core.c and fair.c are built separately. + */ + +/* Headers: */ +#include <linux/sched/clock.h> +#include <linux/sched/cputime.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/posix-timers.h> +#include <linux/sched/rt.h> + +#include <linux/cpuidle.h> +#include <linux/jiffies.h> +#include <linux/livepatch.h> +#include <linux/psi.h> +#include <linux/seqlock_api.h> +#include <linux/slab.h> +#include <linux/suspend.h> +#include <linux/tsacct_kern.h> +#include <linux/vtime.h> + +#include <uapi/linux/sched/types.h> + +#include "sched.h" +#include "smp.h" + +#include "autogroup.h" +#include "stats.h" +#include "pelt.h" + +/* Source code modules: */ + +#include "idle.c" + +#include "rt.c" + +#ifdef CONFIG_SMP +# include "cpudeadline.c" +# include "pelt.c" +#endif + +#include "cputime.c" +#include "deadline.c" + diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c new file mode 100644 index 000000000000..99bdd96f454f --- /dev/null +++ b/kernel/sched/build_utility.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * These are various utility functions of the scheduler, + * built in a single compilation unit for build efficiency reasons. + * + * ( Incidentally, the size of the compilation unit is roughly + * comparable to core.c, fair.c, smp.c and policy.c, the other + * big compilation units. This helps balance build time, while + * coalescing source files to amortize header inclusion + * cost. ) + */ +#include <linux/sched/clock.h> +#include <linux/sched/cputime.h> +#include <linux/sched/debug.h> +#include <linux/sched/isolation.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/nohz.h> +#include <linux/sched/mm.h> +#include <linux/sched/rseq_api.h> +#include <linux/sched/task_stack.h> + +#include <linux/cpufreq.h> +#include <linux/cpumask_api.h> +#include <linux/cpuset.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/energy_model.h> +#include <linux/hashtable_api.h> +#include <linux/irq.h> +#include <linux/kobject_api.h> +#include <linux/membarrier.h> +#include <linux/mempolicy.h> +#include <linux/nmi.h> +#include <linux/nospec.h> +#include <linux/proc_fs.h> +#include <linux/psi.h> +#include <linux/psi.h> +#include <linux/ptrace_api.h> +#include <linux/sched_clock.h> +#include <linux/security.h> +#include <linux/spinlock_api.h> +#include <linux/swait_api.h> +#include <linux/timex.h> +#include <linux/utsname.h> +#include <linux/wait_api.h> +#include <linux/workqueue_api.h> + +#include <uapi/linux/prctl.h> +#include <uapi/linux/sched/types.h> + +#include <asm/switch_to.h> + +#include "sched.h" +#include "sched-pelt.h" +#include "stats.h" +#include "autogroup.h" + +#include "clock.c" + +#ifdef CONFIG_CGROUP_CPUACCT +# include "cpuacct.c" +#endif + +#ifdef CONFIG_CPU_FREQ +# include "cpufreq.c" +#endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +# include "cpufreq_schedutil.c" +#endif + +#ifdef CONFIG_SCHED_DEBUG +# include "debug.c" +#endif + +#ifdef CONFIG_SCHEDSTATS +# include "stats.c" +#endif + +#include "loadavg.c" +#include "completion.c" +#include "swait.c" +#include "wait_bit.c" +#include "wait.c" + +#ifdef CONFIG_SMP +# include "cpupri.c" +# include "stop_task.c" +# include "topology.c" +#endif + +#ifdef CONFIG_SCHED_CORE +# include "core_sched.c" +#endif + +#ifdef CONFIG_PSI +# include "psi.c" +#endif + +#ifdef CONFIG_MEMBARRIER +# include "membarrier.c" +#endif + +#ifdef CONFIG_CPU_ISOLATION +# include "isolation.c" +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +# include "autogroup.c" +#endif diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 12bca64dff73..e374c0c923da 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -41,7 +41,7 @@ * Otherwise it tries to create a semi stable clock from a mixture of other * clocks, including: * - * - GTOD (clock monotomic) + * - GTOD (clock monotonic) * - sched_clock() * - explicit idle events * @@ -53,15 +53,13 @@ * that is otherwise invisible (TSC gets stopped). * */ -#include "sched.h" -#include <linux/sched_clock.h> /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __weak sched_clock(void) +notrace unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); @@ -95,28 +93,28 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -static inline struct sched_clock_data *this_scd(void) +notrace static inline struct sched_clock_data *this_scd(void) { return this_cpu_ptr(&sched_clock_data); } -static inline struct sched_clock_data *cpu_sdc(int cpu) +notrace static inline struct sched_clock_data *cpu_sdc(int cpu) { return &per_cpu(sched_clock_data, cpu); } -int sched_clock_stable(void) +notrace int sched_clock_stable(void) { return static_branch_likely(&__sched_clock_stable); } -static void __scd_stamp(struct sched_clock_data *scd) +notrace static void __scd_stamp(struct sched_clock_data *scd) { scd->tick_gtod = ktime_get_ns(); scd->tick_raw = sched_clock(); } -static void __set_sched_clock_stable(void) +notrace static void __set_sched_clock_stable(void) { struct sched_clock_data *scd; @@ -151,7 +149,7 @@ static void __set_sched_clock_stable(void) * The only way to fully avoid random clock jumps is to boot with: * "tsc=unstable". */ -static void __sched_clock_work(struct work_struct *work) +notrace static void __sched_clock_work(struct work_struct *work) { struct sched_clock_data *scd; int cpu; @@ -177,7 +175,7 @@ static void __sched_clock_work(struct work_struct *work) static DECLARE_WORK(sched_clock_work, __sched_clock_work); -static void __clear_sched_clock_stable(void) +notrace static void __clear_sched_clock_stable(void) { if (!sched_clock_stable()) return; @@ -186,7 +184,7 @@ static void __clear_sched_clock_stable(void) schedule_work(&sched_clock_work); } -void clear_sched_clock_stable(void) +notrace void clear_sched_clock_stable(void) { __sched_clock_stable_early = 0; @@ -196,7 +194,7 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } -static void __sched_clock_gtod_offset(void) +notrace static void __sched_clock_gtod_offset(void) { struct sched_clock_data *scd = this_scd(); @@ -246,12 +244,12 @@ late_initcall(sched_clock_init_late); * min, max except they take wrapping into account */ -static inline u64 wrap_min(u64 x, u64 y) +notrace static inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -static inline u64 wrap_max(u64 x, u64 y) +notrace static inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -262,7 +260,7 @@ static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -static u64 sched_clock_local(struct sched_clock_data *scd) +notrace static u64 sched_clock_local(struct sched_clock_data *scd) { u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; @@ -289,13 +287,13 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) + if (!try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; } -static u64 sched_clock_remote(struct sched_clock_data *scd) +notrace static u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock; @@ -351,7 +349,7 @@ again: val = remote_clock; } - if (cmpxchg64(ptr, old_val, val) != old_val) + if (!try_cmpxchg64(ptr, &old_val, val)) goto again; return val; @@ -362,7 +360,7 @@ again: * * See cpu_clock(). */ -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; u64 clock; @@ -386,7 +384,7 @@ u64 sched_clock_cpu(int cpu) } EXPORT_SYMBOL_GPL(sched_clock_cpu); -void sched_clock_tick(void) +notrace void sched_clock_tick(void) { struct sched_clock_data *scd; @@ -403,7 +401,7 @@ void sched_clock_tick(void) sched_clock_local(scd); } -void sched_clock_tick_stable(void) +notrace void sched_clock_tick_stable(void) { if (!sched_clock_stable()) return; @@ -423,7 +421,7 @@ void sched_clock_tick_stable(void) /* * We are going deep-idle (irqs are disabled): */ -void sched_clock_idle_sleep_event(void) +notrace void sched_clock_idle_sleep_event(void) { sched_clock_cpu(smp_processor_id()); } @@ -432,7 +430,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* * We just idled; resync with ktime. */ -void sched_clock_idle_wakeup_event(void) +notrace void sched_clock_idle_wakeup_event(void) { unsigned long flags; @@ -458,7 +456,7 @@ void __init sched_clock_init(void) local_irq_enable(); } -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { if (!static_branch_likely(&sched_clock_running)) return 0; @@ -476,7 +474,7 @@ u64 sched_clock_cpu(int cpu) * On bare metal this function should return the same as local_clock. * Architectures and sub-architectures can override this. */ -u64 __weak running_clock(void) +notrace u64 __weak running_clock(void) { return local_clock(); } diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a1ad5b7d5521..d57a5c1c1cd9 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 + /* * Generic wait-for-completion handler; * @@ -11,7 +12,6 @@ * typically be used for exclusion which gives rise to priority inversion. * Waiting for completion is a typically sync point, but not an exclusion point. */ -#include "sched.h" /** * complete: - signals a single thread waiting on this completion @@ -29,12 +29,12 @@ void complete(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (x->done != UINT_MAX) x->done++; - __wake_up_locked(&x->wait, TASK_NORMAL, 1); - spin_unlock_irqrestore(&x->wait.lock, flags); + swake_up_locked(&x->wait); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -58,10 +58,12 @@ void complete_all(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + lockdep_assert_RT_in_threaded_ctx(); + + raw_spin_lock_irqsave(&x->wait.lock, flags); x->done = UINT_MAX; - __wake_up_locked(&x->wait, TASK_NORMAL, 0); - spin_unlock_irqrestore(&x->wait.lock, flags); + swake_up_all_locked(&x->wait); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -70,20 +72,20 @@ do_wait_for_common(struct completion *x, long (*action)(long), long timeout, int state) { if (!x->done) { - DECLARE_WAITQUEUE(wait, current); + DECLARE_SWAITQUEUE(wait); - __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } + __prepare_to_swait(&x->wait, &wait); __set_current_state(state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); timeout = action(timeout); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); + __finish_swait(&x->wait, &wait); if (!x->done) return timeout; } @@ -100,9 +102,9 @@ __wait_for_common(struct completion *x, complete_acquire(x); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); complete_release(x); @@ -202,6 +204,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout); int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) return t; return 0; @@ -239,12 +242,23 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) return t; return 0; } EXPORT_SYMBOL(wait_for_completion_killable); +int __sched wait_for_completion_state(struct completion *x, unsigned int state) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state); + + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_state); + /** * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) * @x: holds the state of this particular completion @@ -291,12 +305,12 @@ bool try_wait_for_completion(struct completion *x) if (!READ_ONCE(x->done)) return false; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = false; else if (x->done != UINT_MAX) x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -322,8 +336,8 @@ bool completion_done(struct completion *x) * otherwise we can end up freeing the completion before complete() * is done referencing it. */ - spin_lock_irqsave(&x->wait.lock, flags); - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a9983da4408..cb2aa2b54c7a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6,23 +6,94 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ -#include "sched.h" - +#include <linux/highmem.h> +#include <linux/hrtimer_api.h> +#include <linux/ktime_api.h> +#include <linux/sched/signal.h> +#include <linux/syscalls_api.h> +#include <linux/debug_locks.h> +#include <linux/prefetch.h> +#include <linux/capability.h> +#include <linux/pgtable_api.h> +#include <linux/wait_bit.h> +#include <linux/jiffies.h> +#include <linux/spinlock_api.h> +#include <linux/cpumask_api.h> +#include <linux/lockdep_api.h> +#include <linux/hardirq.h> +#include <linux/softirq.h> +#include <linux/refcount_api.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <linux/sched/cond_resched.h> +#include <linux/sched/cputime.h> +#include <linux/sched/debug.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/init.h> +#include <linux/sched/isolation.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/mm.h> +#include <linux/sched/nohz.h> +#include <linux/sched/rseq_api.h> +#include <linux/sched/rt.h> + +#include <linux/blkdev.h> +#include <linux/context_tracking.h> +#include <linux/cpuset.h> +#include <linux/delayacct.h> +#include <linux/init_task.h> +#include <linux/interrupt.h> +#include <linux/ioprio.h> +#include <linux/kallsyms.h> +#include <linux/kcov.h> +#include <linux/kprobes.h> +#include <linux/llist_api.h> +#include <linux/mmu_context.h> +#include <linux/mmzone.h> +#include <linux/mutex_api.h> +#include <linux/nmi.h> #include <linux/nospec.h> +#include <linux/perf_event_api.h> +#include <linux/profile.h> +#include <linux/psi.h> +#include <linux/rcuwait_api.h> +#include <linux/sched/wake_q.h> +#include <linux/scs.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/vtime.h> +#include <linux/wait_api.h> +#include <linux/workqueue_api.h> + +#ifdef CONFIG_PREEMPT_DYNAMIC +# ifdef CONFIG_GENERIC_ENTRY +# include <linux/entry-common.h> +# endif +#endif -#include <linux/kcov.h> +#include <uapi/linux/sched/types.h> +#include <asm/irq_regs.h> #include <asm/switch_to.h> #include <asm/tlb.h> -#include "../workqueue_internal.h" -#include "../../fs/io-wq.h" -#include "../smpboot.h" +#define CREATE_TRACE_POINTS +#include <linux/sched/rseq_api.h> +#include <trace/events/sched.h> +#undef CREATE_TRACE_POINTS + +#include "sched.h" +#include "stats.h" +#include "autogroup.h" +#include "autogroup.h" #include "pelt.h" +#include "smp.h" +#include "stats.h" -#define CREATE_TRACE_POINTS -#include <trace/events/sched.h> +#include "../workqueue_internal.h" +#include "../../io_uring/io-wq.h" +#include "../smpboot.h" /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -33,11 +104,16 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * @@ -51,27 +127,478 @@ const_debug unsigned int sysctl_sched_features = #include "features.h" 0; #undef SCHED_FEAT -#endif + +/* + * Print a warning if need_resched is set for the given duration (if + * LATENCY_WARN is enabled). + * + * If sysctl_resched_latency_warn_once is set, only one warning will be shown + * per boot. + */ +__read_mostly int sysctl_resched_latency_warn_ms = 100; +__read_mostly int sysctl_resched_latency_warn_once = 1; +#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ -const_debug unsigned int sysctl_sched_nr_migrate = 32; +const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; + +__read_mostly int scheduler_running; + +#ifdef CONFIG_SCHED_CORE + +DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); + +/* kernel prio, less is more */ +static inline int __task_prio(struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) /* trumps deadline */ + return -2; + + if (rt_prio(p->prio)) /* includes deadline */ + return p->prio; /* [-1, 99] */ + + if (p->sched_class == &idle_sched_class) + return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ + + return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ +} /* - * period over which we measure -rt task CPU usage in us. - * default: 1s + * l(a,b) + * le(a,b) := !l(b,a) + * g(a,b) := l(b,a) + * ge(a,b) := !l(a,b) */ -unsigned int sysctl_sched_rt_period = 1000000; -__read_mostly int scheduler_running; +/* real prio, less is less */ +static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +{ + + int pa = __task_prio(a), pb = __task_prio(b); + + if (-pa < -pb) + return true; + + if (-pb < -pa) + return false; + + if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ + return !dl_time_before(a->dl.deadline, b->dl.deadline); + + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ + return cfs_prio_less(a, b, in_fi); + + return false; +} + +static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b) +{ + if (a->core_cookie < b->core_cookie) + return true; + + if (a->core_cookie > b->core_cookie) + return false; + + /* flip prio, so high prio is leftmost */ + if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count)) + return true; + + return false; +} + +#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node) + +static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b) +{ + return __sched_core_less(__node_2_sc(a), __node_2_sc(b)); +} + +static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) +{ + const struct task_struct *p = __node_2_sc(node); + unsigned long cookie = (unsigned long)key; + + if (cookie < p->core_cookie) + return -1; + + if (cookie > p->core_cookie) + return 1; + + return 0; +} + +void sched_core_enqueue(struct rq *rq, struct task_struct *p) +{ + rq->core->core_task_seq++; + + if (!p->core_cookie) + return; + + rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); +} + +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) +{ + rq->core->core_task_seq++; + + if (sched_core_enqueued(p)) { + rb_erase(&p->core_node, &rq->core_tree); + RB_CLEAR_NODE(&p->core_node); + } + + /* + * Migrating the last task off the cpu, with the cpu in forced idle + * state. Reschedule to create an accounting edge for forced idle, + * and re-examine whether the core is still in forced idle state. + */ + if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 && + rq->core->core_forceidle_count && rq->curr == rq->idle) + resched_curr(rq); +} + +/* + * Find left-most (aka, highest priority) task matching @cookie. + */ +static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) +{ + struct rb_node *node; + + node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); + /* + * The idle task always matches any cookie! + */ + if (!node) + return idle_sched_class.pick_task(rq); + + return __node_2_sc(node); +} + +static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie) +{ + struct rb_node *node = &p->core_node; + + node = rb_next(node); + if (!node) + return NULL; + + p = container_of(node, struct task_struct, core_node); + if (p->core_cookie != cookie) + return NULL; + + return p; +} + +/* + * Magic required such that: + * + * raw_spin_rq_lock(rq); + * ... + * raw_spin_rq_unlock(rq); + * + * ends up locking and unlocking the _same_ lock, and all CPUs + * always agree on what rq has what lock. + * + * XXX entirely possible to selectively enable cores, don't bother for now. + */ + +static DEFINE_MUTEX(sched_core_mutex); +static atomic_t sched_core_count; +static struct cpumask sched_core_mask; + +static void sched_core_lock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t, i = 0; + + local_irq_save(*flags); + for_each_cpu(t, smt_mask) + raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); +} + +static void sched_core_unlock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t; + + for_each_cpu(t, smt_mask) + raw_spin_unlock(&cpu_rq(t)->__lock); + local_irq_restore(*flags); +} + +static void __sched_core_flip(bool enabled) +{ + unsigned long flags; + int cpu, t; + + cpus_read_lock(); + + /* + * Toggle the online cores, one by one. + */ + cpumask_copy(&sched_core_mask, cpu_online_mask); + for_each_cpu(cpu, &sched_core_mask) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + + sched_core_lock(cpu, &flags); + + for_each_cpu(t, smt_mask) + cpu_rq(t)->core_enabled = enabled; + + cpu_rq(cpu)->core->core_forceidle_start = 0; + + sched_core_unlock(cpu, &flags); + + cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); + } + + /* + * Toggle the offline CPUs. + */ + for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask) + cpu_rq(cpu)->core_enabled = enabled; + + cpus_read_unlock(); +} + +static void sched_core_assert_empty(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree)); +} + +static void __sched_core_enable(void) +{ + static_branch_enable(&__sched_core_enabled); + /* + * Ensure all previous instances of raw_spin_rq_*lock() have finished + * and future ones will observe !sched_core_disabled(). + */ + synchronize_rcu(); + __sched_core_flip(true); + sched_core_assert_empty(); +} + +static void __sched_core_disable(void) +{ + sched_core_assert_empty(); + __sched_core_flip(false); + static_branch_disable(&__sched_core_enabled); +} + +void sched_core_get(void) +{ + if (atomic_inc_not_zero(&sched_core_count)) + return; + + mutex_lock(&sched_core_mutex); + if (!atomic_read(&sched_core_count)) + __sched_core_enable(); + + smp_mb__before_atomic(); + atomic_inc(&sched_core_count); + mutex_unlock(&sched_core_mutex); +} + +static void __sched_core_put(struct work_struct *work) +{ + if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) { + __sched_core_disable(); + mutex_unlock(&sched_core_mutex); + } +} + +void sched_core_put(void) +{ + static DECLARE_WORK(_work, __sched_core_put); + + /* + * "There can be only one" + * + * Either this is the last one, or we don't actually need to do any + * 'work'. If it is the last *again*, we rely on + * WORK_STRUCT_PENDING_BIT. + */ + if (!atomic_add_unless(&sched_core_count, -1, 1)) + schedule_work(&_work); +} + +#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } +static inline void +sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } + +#endif /* CONFIG_SCHED_CORE */ + +/* + * Serialization rules: + * + * Lock order: + * + * p->pi_lock + * rq->lock + * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) + * + * rq1->lock + * rq2->lock where: rq1 < rq2 + * + * Regular state: + * + * Normal scheduling state is serialized by rq->lock. __schedule() takes the + * local CPU's rq->lock, it optionally removes the task from the runqueue and + * always looks at the local rq data structures to find the most eligible task + * to run next. + * + * Task enqueue is also under rq->lock, possibly taken from another CPU. + * Wakeups from another LLC domain might use an IPI to transfer the enqueue to + * the local CPU to avoid bouncing the runqueue state around [ see + * ttwu_queue_wakelist() ] + * + * Task wakeup, specifically wakeups that involve migration, are horribly + * complicated to avoid having to take two rq->locks. + * + * Special state: + * + * System-calls and anything external will use task_rq_lock() which acquires + * both p->pi_lock and rq->lock. As a consequence the state they change is + * stable while holding either lock: + * + * - sched_setaffinity()/ + * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed + * - set_user_nice(): p->se.load, p->*prio + * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, + * p->se.load, p->rt_priority, + * p->dl.dl_{runtime, deadline, period, flags, bw, density} + * - sched_setnuma(): p->numa_preferred_nid + * - sched_move_task(): p->sched_task_group + * - uclamp_update_active() p->uclamp* + * + * p->state <- TASK_*: + * + * is changed locklessly using set_current_state(), __set_current_state() or + * set_special_state(), see their respective comments, or by + * try_to_wake_up(). This latter uses p->pi_lock to serialize against + * concurrent self. + * + * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: + * + * is set by activate_task() and cleared by deactivate_task(), under + * rq->lock. Non-zero indicates the task is runnable, the special + * ON_RQ_MIGRATING state is used for migration without holding both + * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). + * + * p->on_cpu <- { 0, 1 }: + * + * is set by prepare_task() and cleared by finish_task() such that it will be + * set before p is scheduled-in and cleared after p is scheduled-out, both + * under rq->lock. Non-zero indicates the task is running on its CPU. + * + * [ The astute reader will observe that it is possible for two tasks on one + * CPU to have ->on_cpu = 1 at the same time. ] + * + * task_cpu(p): is changed by set_task_cpu(), the rules are: + * + * - Don't call set_task_cpu() on a blocked task: + * + * We don't care what CPU we're not running on, this simplifies hotplug, + * the CPU assignment of blocked tasks isn't required to be valid. + * + * - for try_to_wake_up(), called under p->pi_lock: + * + * This allows try_to_wake_up() to only take one rq->lock, see its comment. + * + * - for migration called under rq->lock: + * [ see task_on_rq_migrating() in task_rq_lock() ] + * + * o move_queued_task() + * o detach_task() + * + * - for migration called under double_rq_lock(): + * + * o __migrate_swap_task() + * o push_rt_task() / pull_rt_task() + * o push_dl_task() / pull_dl_task() + * o dl_task_offline_migration() + * + */ + +void raw_spin_rq_lock_nested(struct rq *rq, int subclass) +{ + raw_spinlock_t *lock; + + /* Matches synchronize_rcu() in __sched_core_enable() */ + preempt_disable(); + if (sched_core_disabled()) { + raw_spin_lock_nested(&rq->__lock, subclass); + /* preempt_count *MUST* be > 1 */ + preempt_enable_no_resched(); + return; + } + + for (;;) { + lock = __rq_lockp(rq); + raw_spin_lock_nested(lock, subclass); + if (likely(lock == __rq_lockp(rq))) { + /* preempt_count *MUST* be > 1 */ + preempt_enable_no_resched(); + return; + } + raw_spin_unlock(lock); + } +} + +bool raw_spin_rq_trylock(struct rq *rq) +{ + raw_spinlock_t *lock; + bool ret; + + /* Matches synchronize_rcu() in __sched_core_enable() */ + preempt_disable(); + if (sched_core_disabled()) { + ret = raw_spin_trylock(&rq->__lock); + preempt_enable(); + return ret; + } + + for (;;) { + lock = __rq_lockp(rq); + ret = raw_spin_trylock(lock); + if (!ret || (likely(lock == __rq_lockp(rq)))) { + preempt_enable(); + return ret; + } + raw_spin_unlock(lock); + } +} + +void raw_spin_rq_unlock(struct rq *rq) +{ + raw_spin_unlock(rq_lockp(rq)); +} +#ifdef CONFIG_SMP /* - * part of the period that we allow rt tasks to run in us. - * default: 0.95s + * double_rq_lock - safely lock two runqueues */ -int sysctl_sched_rt_runtime = 950000; +void double_rq_lock(struct rq *rq1, struct rq *rq2) +{ + lockdep_assert_irqs_disabled(); + + if (rq_order_less(rq2, rq1)) + swap(rq1, rq2); + + raw_spin_rq_lock(rq1); + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); + + double_rq_clock_clear_update(rq1, rq2); +} +#endif /* * __task_rq_lock - lock the rq @p resides on. @@ -85,12 +612,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) for (;;) { rq = task_rq(p); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { rq_pin_lock(rq, rf); return rq; } - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); while (unlikely(task_on_rq_migrating(p))) cpu_relax(); @@ -109,7 +636,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) for (;;) { raw_spin_lock_irqsave(&p->pi_lock, rf->flags); rq = task_rq(p); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); /* * move_queued_task() task_rq_lock() * @@ -131,7 +658,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) rq_pin_lock(rq, rf); return rq; } - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); while (unlikely(task_on_rq_migrating(p))) @@ -174,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; + psi_account_irqtime(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { @@ -201,7 +729,7 @@ void update_rq_clock(struct rq *rq) { s64 delta; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (rq->clock_update_flags & RQCF_ACT_SKIP) return; @@ -219,7 +747,6 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } - #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -255,8 +782,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = rq->hrtick_time; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -269,7 +797,6 @@ static void __hrtick_start(void *arg) rq_lock(rq, &rf); __hrtick_restart(rq); - rq->hrtick_csd_pending = 0; rq_unlock(rq, &rf); } @@ -281,7 +808,6 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time; s64 delta; /* @@ -289,16 +815,12 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - time = ktime_add_ns(timer->base->get_time(), delta); - - hrtimer_set_expires(timer, time); + rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); - if (rq == this_rq()) { + if (rq == this_rq()) __hrtick_restart(rq); - } else if (!rq->hrtick_csd_pending) { + else smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); - rq->hrtick_csd_pending = 1; - } } #else @@ -317,18 +839,14 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED_HARD); } + #endif /* CONFIG_SMP */ static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq->hrtick_csd_pending = 0; - - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; + INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; } @@ -349,15 +867,11 @@ static inline void hrtick_rq_init(struct rq *rq) ({ \ typeof(ptr) _ptr = (ptr); \ typeof(mask) _mask = (mask); \ - typeof(*_ptr) _old, _val = *_ptr; \ + typeof(*_ptr) _val = *_ptr; \ \ - for (;;) { \ - _old = cmpxchg(_ptr, _val, _val | _mask); \ - if (_old == _val) \ - break; \ - _val = _old; \ - } \ - _old; \ + do { \ + } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \ + _val; \ }) #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) @@ -366,7 +880,7 @@ static inline void hrtick_rq_init(struct rq *rq) * this avoids any races wrt polling state changes and thereby avoids * spurious IPIs. */ -static bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct task_struct *p) { struct thread_info *ti = task_thread_info(p); return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); @@ -381,30 +895,28 @@ static bool set_nr_and_not_polling(struct task_struct *p) static bool set_nr_if_polling(struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - typeof(ti->flags) old, val = READ_ONCE(ti->flags); + typeof(ti->flags) val = READ_ONCE(ti->flags); for (;;) { if (!(val & _TIF_POLLING_NRFLAG)) return false; if (val & _TIF_NEED_RESCHED) return true; - old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); - if (old == val) + if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)) break; - val = old; } return true; } #else -static bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct task_struct *p) { set_tsk_need_resched(p); return true; } #ifdef CONFIG_SMP -static bool set_nr_if_polling(struct task_struct *p) +static inline bool set_nr_if_polling(struct task_struct *p) { return false; } @@ -417,7 +929,7 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) /* * Atomically grab the task, if ->wake_q is !nil already it means - * its already queued (either by us or someone else) and will get the + * it's already queued (either by us or someone else) and will get the * wakeup due to that. * * In order to ensure that a pending wakeup will observe our pending @@ -484,7 +996,6 @@ void wake_up_q(struct wake_q_head *head) struct task_struct *task; task = container_of(node, struct task_struct, wake_q); - BUG_ON(!task); /* Task can safely be re-inserted now: */ node = node->next; task->wake_q.next = NULL; @@ -510,7 +1021,7 @@ void resched_curr(struct rq *rq) struct task_struct *curr = rq->curr; int cpu; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (test_tsk_need_resched(curr)) return; @@ -534,10 +1045,10 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); if (cpu_online(cpu) || cpu == smp_processor_id()) resched_curr(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); } #ifdef CONFIG_SMP @@ -554,17 +1065,19 @@ int get_nohz_timer_target(void) { int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; + const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } + hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); + rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu_and(i, sched_domain_span(sd), - housekeeping_cpumask(HK_FLAG_TIMER)) { + for_each_cpu_and(i, sched_domain_span(sd), hk_mask) { if (cpu == i) continue; @@ -576,7 +1089,7 @@ int get_nohz_timer_target(void) } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); cpu = default_cpu; unlock: rcu_read_unlock(); @@ -637,29 +1150,23 @@ void wake_up_nohz_cpu(int cpu) wake_up_idle_cpu(cpu); } -static inline bool got_nohz_idle_kick(void) +static void nohz_csd_func(void *info) { - int cpu = smp_processor_id(); - - if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) - return false; - - if (idle_cpu(cpu) && !need_resched()) - return true; + struct rq *rq = info; + int cpu = cpu_of(rq); + unsigned int flags; /* - * We can't run Idle Load Balance on this CPU for this time so we - * cancel it and clear NOHZ_BALANCE_KICK + * Release the rq::nohz_csd. */ - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); - return false; -} - -#else /* CONFIG_NO_HZ_COMMON */ + flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu)); + WARN_ON(!(flags & NOHZ_KICK_MASK)); -static inline bool got_nohz_idle_kick(void) -{ - return false; + rq->idle_balance = idle_cpu(cpu); + if (rq->idle_balance && !need_resched()) { + rq->nohz_idle_balance = flags; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } } #endif /* CONFIG_NO_HZ_COMMON */ @@ -674,7 +1181,7 @@ bool sched_can_stop_tick(struct rq *rq) return false; /* - * If there are more than one RR tasks, we need the tick to effect the + * If there are more than one RR tasks, we need the tick to affect the * actual RR behaviour. */ if (rq->rt.rr_nr_running) { @@ -761,7 +1268,6 @@ static void set_load_weight(struct task_struct *p, bool update_load) if (task_has_idle_policy(p)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; - p->se.runnable_weight = load->weight; return; } @@ -774,7 +1280,6 @@ static void set_load_weight(struct task_struct *p, bool update_load) } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; - p->se.runnable_weight = load->weight; } } @@ -792,14 +1297,51 @@ static void set_load_weight(struct task_struct *p, bool update_load) static DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ -unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; +static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ -unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; + +/* + * By default RT tasks run at the maximum performance point/capacity of the + * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to + * SCHED_CAPACITY_SCALE. + * + * This knob allows admins to change the default behavior when uclamp is being + * used. In battery powered devices, particularly, running at the maximum + * capacity and frequency will increase energy consumption and shorten the + * battery life. + * + * This knob only affects RT tasks that their uclamp_se->user_defined == false. + * + * This knob will not override the system default sched_util_clamp_min defined + * above. + */ +static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; +/* + * This static key is used to reduce the uclamp overhead in the fast path. It + * primarily disables the call to uclamp_rq_{inc, dec}() in + * enqueue/dequeue_task(). + * + * This allows users to continue to enable uclamp in their kernel config with + * minimum uclamp overhead in the fast path. + * + * As soon as userspace modifies any of the uclamp knobs, the static key is + * enabled, since we have an actual users that make use of uclamp + * functionality. + * + * The knobs that would enable this static key are: + * + * * A task modifying its uclamp value with sched_setattr(). + * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs. + * * An admin modifying the cgroup cpu.uclamp.{min, max} + */ +DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); + /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -808,12 +1350,7 @@ static struct uclamp_se uclamp_default[UCLAMP_CNT]; static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) { - return clamp_value / UCLAMP_BUCKET_DELTA; -} - -static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) -{ - return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); + return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); } static inline unsigned int uclamp_none(enum uclamp_id clamp_id) @@ -879,12 +1416,44 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static void __uclamp_update_util_min_rt_default(struct task_struct *p) +{ + unsigned int default_util_min; + struct uclamp_se *uc_se; + + lockdep_assert_held(&p->pi_lock); + + uc_se = &p->uclamp_req[UCLAMP_MIN]; + + /* Only sync if user didn't override the default */ + if (uc_se->user_defined) + return; + + default_util_min = sysctl_sched_uclamp_util_min_rt_default; + uclamp_se_set(uc_se, default_util_min, false); +} + +static void uclamp_update_util_min_rt_default(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + + if (!rt_task(p)) + return; + + /* Protect updates to p->uclamp_* */ + rq = task_rq_lock(p, &rf); + __uclamp_update_util_min_rt_default(p); + task_rq_unlock(rq, p, &rf); +} + static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { + /* Copy by value as we could modify it */ struct uclamp_se uc_req = p->uclamp_req[clamp_id]; #ifdef CONFIG_UCLAMP_TASK_GROUP - struct uclamp_se uc_max; + unsigned int tg_min, tg_max, value; /* * Tasks in autogroups or root task group will be @@ -895,9 +1464,11 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) if (task_group(p) == &root_task_group) return uc_req; - uc_max = task_group(p)->uclamp[clamp_id]; - if (uc_req.value > uc_max.value || !uc_req.user_defined) - return uc_max; + tg_min = task_group(p)->uclamp[UCLAMP_MIN].value; + tg_max = task_group(p)->uclamp[UCLAMP_MAX].value; + value = uc_req.value; + value = clamp(value, tg_min, tg_max); + uclamp_se_set(&uc_req, value, false); #endif return uc_req; @@ -954,7 +1525,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, struct uclamp_se *uc_se = &p->uclamp[clamp_id]; struct uclamp_bucket *bucket; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); /* Update task effective clamp */ p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); @@ -994,12 +1565,40 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, unsigned int bkt_clamp; unsigned int rq_clamp; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); + + /* + * If sched_uclamp_used was enabled after task @p was enqueued, + * we could end up with unbalanced call to uclamp_rq_dec_id(). + * + * In this case the uc_se->active flag should be false since no uclamp + * accounting was performed at enqueue time and we can just return + * here. + * + * Need to be careful of the following enqueue/dequeue ordering + * problem too + * + * enqueue(taskA) + * // sched_uclamp_used gets enabled + * enqueue(taskB) + * dequeue(taskA) + * // Must not decrement bucket->tasks here + * dequeue(taskB) + * + * where we could end up with stale data in uc_se and + * bucket[uc_se->bucket_id]. + * + * The following check here eliminates the possibility of such race. + */ + if (unlikely(!uc_se->active)) + return; bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* @@ -1027,6 +1626,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1042,6 +1650,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1049,9 +1666,27 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (!p->uclamp[clamp_id].active) + return; + + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + + /* + * Make sure to clear the idle flag if we've transiently reached 0 + * active tasks on rq. + */ + if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} + static inline void -uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) +uclamp_update_active(struct task_struct *p) { + enum uclamp_id clamp_id; struct rq_flags rf; struct rq *rq; @@ -1071,34 +1706,31 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - } + for_each_clamp_id(clamp_id) + uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); } #ifdef CONFIG_UCLAMP_TASK_GROUP static inline void -uclamp_update_active_tasks(struct cgroup_subsys_state *css, - unsigned int clamps) +uclamp_update_active_tasks(struct cgroup_subsys_state *css) { - enum uclamp_id clamp_id; struct css_task_iter it; struct task_struct *p; css_task_iter_start(css, 0, &it); - while ((p = css_task_iter_next(&it))) { - for_each_clamp_id(clamp_id) { - if ((0x1 << clamp_id) & clamps) - uclamp_update_active(p, clamp_id); - } - } + while ((p = css_task_iter_next(&it))) + uclamp_update_active(p); css_task_iter_end(&it); } static void cpu_util_update_eff(struct cgroup_subsys_state *css); +#endif + +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_UCLAMP_TASK +#ifdef CONFIG_UCLAMP_TASK_GROUP static void uclamp_update_root_tg(void) { struct task_group *tg = &root_task_group; @@ -1116,17 +1748,44 @@ static void uclamp_update_root_tg(void) static void uclamp_update_root_tg(void) { } #endif -int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + rcu_read_lock(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); + rcu_read_unlock(); +} + +static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; - int old_min, old_max; + int old_min, old_max, old_min_rt; int result; mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; + old_min_rt = sysctl_sched_uclamp_util_min_rt_default; result = proc_dointvec(table, write, buffer, lenp, ppos); if (result) @@ -1135,7 +1794,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, goto done; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || - sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || + sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) { + result = -EINVAL; goto undo; } @@ -1151,8 +1812,15 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, update_root_tg = true; } - if (update_root_tg) + if (update_root_tg) { + static_branch_enable(&sched_uclamp_used); uclamp_update_root_tg(); + } + + if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { + static_branch_enable(&sched_uclamp_used); + uclamp_sync_util_min_rt_default(); + } /* * We update all RUNNABLE tasks only when task groups are in use. @@ -1165,64 +1833,111 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; + sysctl_sched_uclamp_util_min_rt_default = old_min_rt; done: mutex_unlock(&uclamp_mutex); return result; } +#endif +#endif static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) { - unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; - unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; + int util_min = p->uclamp_req[UCLAMP_MIN].value; + int util_max = p->uclamp_req[UCLAMP_MAX].value; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) - lower_bound = attr->sched_util_min; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) - upper_bound = attr->sched_util_max; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + util_min = attr->sched_util_min; - if (lower_bound > upper_bound) - return -EINVAL; - if (upper_bound > SCHED_CAPACITY_SCALE) + if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + util_max = attr->sched_util_max; + + if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (util_min != -1 && util_max != -1 && util_min > util_max) return -EINVAL; + /* + * We have valid uclamp attributes; make sure uclamp is enabled. + * + * We need to do that here, because enabling static branches is a + * blocking operation which obviously cannot be done while holding + * scheduler locks. + */ + static_branch_enable(&sched_uclamp_used); + return 0; } +static bool uclamp_reset(const struct sched_attr *attr, + enum uclamp_id clamp_id, + struct uclamp_se *uc_se) +{ + /* Reset on sched class change for a non user-defined clamp value. */ + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && + !uc_se->user_defined) + return true; + + /* Reset on sched_util_{min,max} == -1. */ + if (clamp_id == UCLAMP_MIN && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min == -1) { + return true; + } + + if (clamp_id == UCLAMP_MAX && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max == -1) { + return true; + } + + return false; +} + static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { enum uclamp_id clamp_id; - /* - * On scheduling class change, reset to default clamps for tasks - * without a task-specific value. - */ for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; - unsigned int clamp_value = uclamp_none(clamp_id); + unsigned int value; - /* Keep using defined clamps across class changes */ - if (uc_se->user_defined) + if (!uclamp_reset(attr, clamp_id, uc_se)) continue; - /* By default, RT tasks always get 100% boost */ + /* + * RT by default have a 100% boost value that could be modified + * at runtime. + */ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - clamp_value = uclamp_none(UCLAMP_MAX); + value = sysctl_sched_uclamp_util_min_rt_default; + else + value = uclamp_none(clamp_id); + + uclamp_se_set(uc_se, value, false); - uclamp_se_set(uc_se, clamp_value, false); } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) return; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); } - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true); } @@ -1232,6 +1947,10 @@ static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; + /* + * We don't need to hold task_rq_lock() when updating p->uclamp_* here + * as the task is still at its early fork stages. + */ for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1239,14 +1958,28 @@ static void uclamp_fork(struct task_struct *p) return; for_each_clamp_id(clamp_id) { - unsigned int clamp_value = uclamp_none(clamp_id); + uclamp_se_set(&p->uclamp_req[clamp_id], + uclamp_none(clamp_id), false); + } +} - /* By default, RT tasks always get 100% boost */ - if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - clamp_value = uclamp_none(UCLAMP_MAX); +static void uclamp_post_fork(struct task_struct *p) +{ + uclamp_update_util_min_rt_default(p); +} + +static void __init init_uclamp_rq(struct rq *rq) +{ + enum uclamp_id clamp_id; + struct uclamp_rq *uc_rq = rq->uclamp; - uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false); + for_each_clamp_id(clamp_id) { + uc_rq[clamp_id] = (struct uclamp_rq) { + .value = uclamp_none(clamp_id) + }; } + + rq->uclamp_flags = UCLAMP_FLAG_IDLE; } static void __init init_uclamp(void) @@ -1255,13 +1988,8 @@ static void __init init_uclamp(void) enum uclamp_id clamp_id; int cpu; - mutex_init(&uclamp_mutex); - - for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, - sizeof(struct uclamp_rq)*UCLAMP_CNT); - cpu_rq(cpu)->uclamp_flags = 0; - } + for_each_possible_cpu(cpu) + init_uclamp_rq(cpu_rq(cpu)); for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], @@ -1290,30 +2018,61 @@ static inline int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } +static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ +bool sched_task_on_rq(struct task_struct *p) +{ + return task_on_rq_queued(p); +} + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long ip = 0; + unsigned int state; + + if (!p || p == current) + return 0; + + /* Only get wchan if task is blocked and we can keep it that way. */ + raw_spin_lock_irq(&p->pi_lock); + state = READ_ONCE(p->__state); + smp_rmb(); /* see try_to_wake_up() */ + if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq) + ip = __get_wchan(p); + raw_spin_unlock_irq(&p->pi_lock); + + return ip; +} + static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); if (!(flags & ENQUEUE_RESTORE)) { - sched_info_queued(rq, p); + sched_info_enqueue(rq, p); psi_enqueue(p, flags & ENQUEUE_WAKEUP); } uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + + if (sched_core_enabled(rq)) + sched_core_enqueue(rq, p); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { + if (sched_core_enabled(rq)) + sched_core_dequeue(rq, p, flags); + if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); if (!(flags & DEQUEUE_SAVE)) { - sched_info_dequeued(rq, p); + sched_info_dequeue(rq, p); psi_dequeue(p, flags & DEQUEUE_SLEEP); } @@ -1323,9 +2082,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags) { - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - enqueue_task(rq, p, flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1335,18 +2091,21 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - dequeue_task(rq, p, flags); } -/* - * __normal_prio - return the priority that is based on the static prio - */ -static inline int __normal_prio(struct task_struct *p) +static inline int __normal_prio(int policy, int rt_prio, int nice) { - return p->static_prio; + int prio; + + if (dl_policy(policy)) + prio = MAX_DL_PRIO - 1; + else if (rt_policy(policy)) + prio = MAX_RT_PRIO - 1 - rt_prio; + else + prio = NICE_TO_PRIO(nice); + + return prio; } /* @@ -1358,15 +2117,7 @@ static inline int __normal_prio(struct task_struct *p) */ static inline int normal_prio(struct task_struct *p) { - int prio; - - if (task_has_dl_policy(p)) - prio = MAX_DL_PRIO-1; - else if (task_has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; - else - prio = __normal_prio(p); - return prio; + return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio)); } /* @@ -1422,20 +2173,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - const struct sched_class *class; - - if (p->sched_class == rq->curr->sched_class) { + if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, p, flags); - } else { - for_each_class(class) { - if (class == rq->curr->sched_class) - break; - if (class == p->sched_class) { - resched_curr(rq); - break; - } - } - } + else if (sched_class_above(p->sched_class, rq->curr->sched_class)) + resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In @@ -1447,19 +2188,107 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags); + +static void migrate_disable_switch(struct rq *rq, struct task_struct *p) +{ + if (likely(!p->migration_disabled)) + return; + + if (p->cpus_ptr != &p->cpus_mask) + return; + + /* + * Violates locking rules! see comment in __do_set_cpus_allowed(). + */ + __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); +} + +void migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { + p->migration_disabled++; + return; + } + + preempt_disable(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + if (WARN_ON_ONCE(!p->migration_disabled)) + return; + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + preempt_disable(); + if (p->cpus_ptr != &p->cpus_mask) + __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq()->nr_pinned--; + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_enable); + +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return rq->nr_pinned; +} + /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). */ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { + /* When not in the task's cpumask, no point in looking further. */ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; - if (is_per_cpu_kthread(p)) + /* migrate_disabled() must be allowed to finish. */ + if (is_migration_disabled(p)) + return cpu_online(cpu); + + /* Non kernel threads are not allowed during either online or offline. */ + if (!(p->flags & PF_KTHREAD)) + return cpu_active(cpu) && task_cpu_possible(cpu, p); + + /* KTHREAD_IS_PER_CPU is always allowed. */ + if (kthread_is_per_cpu(p)) return cpu_online(cpu); - return cpu_active(cpu); + /* Regular kernel threads don't get to stay during offline. */ + if (cpu_dying(cpu)) + return false; + + /* But are allowed during online. */ + return cpu_online(cpu); } /* @@ -1484,27 +2313,38 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int new_cpu) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); - WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); - dequeue_task(rq, p, DEQUEUE_NOCLOCK); + deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); rq = cpu_rq(new_cpu); rq_lock(rq, rf); - BUG_ON(task_cpu(p) != new_cpu); - enqueue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; + WARN_ON_ONCE(task_cpu(p) != new_cpu); + activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); return rq; } struct migration_arg { - struct task_struct *task; - int dest_cpu; + struct task_struct *task; + int dest_cpu; + struct set_affinity_pending *pending; +}; + +/* + * @refs: number of wait_for_completion() + * @stop_pending: is @stop_work in use + */ +struct set_affinity_pending { + refcount_t refs; + unsigned int stop_pending; + struct completion done; + struct cpu_stop_work stop_work; + struct migration_arg arg; }; /* @@ -1537,39 +2377,145 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, static int migration_cpu_stop(void *data) { struct migration_arg *arg = data; + struct set_affinity_pending *pending = arg->pending; struct task_struct *p = arg->task; struct rq *rq = this_rq(); + bool complete = false; struct rq_flags rf; /* * The original target CPU might have gone down and we might * be on another CPU but it doesn't matter. */ - local_irq_disable(); + local_irq_save(rf.flags); /* * We need to explicitly wake pending tasks before running * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - sched_ttwu_pending(); + flush_smp_call_function_queue(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); + + /* + * If we were passed a pending, then ->stop_pending was set, thus + * p->migration_pending must have remained stable. + */ + WARN_ON_ONCE(pending && pending != p->migration_pending); + /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ if (task_rq(p) == rq) { + if (is_migration_disabled(p)) + goto out; + + if (pending) { + p->migration_pending = NULL; + complete = true; + + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) + goto out; + } + if (task_on_rq_queued(p)) rq = __migrate_task(rq, &rf, p, arg->dest_cpu); else p->wake_cpu = arg->dest_cpu; + + /* + * XXX __migrate_task() can fail, at which point we might end + * up running on a dodgy CPU, AFAICT this can only happen + * during CPU hotplug, at which point we'll get pushed out + * anyway, so it's probably not a big deal. + */ + + } else if (pending) { + /* + * This happens when we get migrated between migrate_enable()'s + * preempt_enable() and scheduling the stopper task. At that + * point we're a regular task again and not current anymore. + * + * A !PREEMPT kernel has a giant hole here, which makes it far + * more likely. + */ + + /* + * The task moved before the stopper got to run. We're holding + * ->pi_lock, so the allowed mask is stable - if it got + * somewhere allowed, we're done. + */ + if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { + p->migration_pending = NULL; + complete = true; + goto out; + } + + /* + * When migrate_enable() hits a rq mis-match we can't reliably + * determine is_migration_disabled() and so have to chase after + * it. + */ + WARN_ON_ONCE(!pending->stop_pending); + task_rq_unlock(rq, p, &rf); + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, + &pending->arg, &pending->stop_work); + return 0; } - rq_unlock(rq, &rf); - raw_spin_unlock(&p->pi_lock); +out: + if (pending) + pending->stop_pending = false; + task_rq_unlock(rq, p, &rf); + + if (complete) + complete_all(&pending->done); - local_irq_enable(); + return 0; +} + +int push_cpu_stop(void *arg) +{ + struct rq *lowest_rq = NULL, *rq = this_rq(); + struct task_struct *p = arg; + + raw_spin_lock_irq(&p->pi_lock); + raw_spin_rq_lock(rq); + + if (task_rq(p) != rq) + goto out_unlock; + + if (is_migration_disabled(p)) { + p->migration_flags |= MDF_PUSH; + goto out_unlock; + } + + p->migration_flags &= ~MDF_PUSH; + + if (p->sched_class->find_lock_rq) + lowest_rq = p->sched_class->find_lock_rq(p, rq); + + if (!lowest_rq) + goto out_unlock; + + // XXX validate p is still the highest prio task + if (task_rq(p) == rq) { + deactivate_task(rq, p, 0); + set_task_cpu(p, lowest_rq->cpu); + activate_task(lowest_rq, p, 0); + resched_curr(lowest_rq); + } + + double_unlock_balance(rq, lowest_rq); + +out_unlock: + rq->push_busy = false; + raw_spin_rq_unlock(rq); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); return 0; } @@ -1577,18 +2523,39 @@ static int migration_cpu_stop(void *data) * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. */ -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { + if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { + p->cpus_ptr = new_mask; + return; + } + cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { struct rq *rq = task_rq(p); bool queued, running; - lockdep_assert_held(&p->pi_lock); + /* + * This here violates the locking rules for affinity, since we're only + * supposed to change these variables while holding both rq->lock and + * p->pi_lock. + * + * HOWEVER, it magically works, because ttwu() is the only code that + * accesses these variables under p->pi_lock and only does so after + * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() + * before finish_task(). + * + * XXX do further audits, this smells like something putrid. + */ + if (flags & SCA_MIGRATE_DISABLE) + SCHED_WARN_ON(!p->on_cpu); + else + lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -1598,13 +2565,13 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * Because __kthread_bind() calls this on blocked tasks without * holding rq->lock. */ - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); } if (running) put_prev_task(rq, p); - p->sched_class->set_cpus_allowed(p, new_mask); + p->sched_class->set_cpus_allowed(p, new_mask, flags); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -1612,109 +2579,517 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) set_next_task(rq, p); } +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + __do_set_cpus_allowed(p, new_mask, 0); +} + +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, + int node) +{ + if (!src->user_cpus_ptr) + return 0; + + dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); + if (!dst->user_cpus_ptr) + return -ENOMEM; + + cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + return 0; +} + +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = NULL; + + swap(p->user_cpus_ptr, user_mask); + + return user_mask; +} + +void release_user_cpus_ptr(struct task_struct *p) +{ + kfree(clear_user_cpus_ptr(p)); +} + /* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. + * This function is wildly self concurrent; here be dragons. + * + * + * When given a valid mask, __set_cpus_allowed_ptr() must block until the + * designated task is enqueued on an allowed CPU. If that task is currently + * running, we have to kick it out using the CPU stopper. + * + * Migrate-Disable comes along and tramples all over our nice sandcastle. + * Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * + * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes + * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). + * This means we need the following scheme: + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * <resumes> + * migrate_enable(); + * __set_cpus_allowed_ptr(); + * <wakes local stopper> + * `--> <woken on migration completion> + * + * Now the fun stuff: there may be several P1-like tasks, i.e. multiple + * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any + * task p are serialized by p->pi_lock, which we can leverage: the one that + * should come into effect at the end of the Migrate-Disable region is the last + * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), + * but we still need to properly signal those waiting tasks at the appropriate + * moment. + * + * This is implemented using struct set_affinity_pending. The first + * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will + * setup an instance of that struct and install it on the targeted task_struct. + * Any and all further callers will reuse that instance. Those then wait for + * a completion signaled at the tail of the CPU stopper callback (1), triggered + * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). + * + * + * (1) In the cases covered above. There is one more where the completion is + * signaled within affine_move_task() itself: when a subsequent affinity request + * occurs after the stopper bailed out due to the targeted task still being + * Migrate-Disable. Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * CPU0 P1 P2 + * <P0> + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * <migration/0> + * migration_cpu_stop() + * is_migration_disabled() + * <bails> + * set_cpus_allowed_ptr(P0, [0, 1]); + * <signal completion> + * <awakes> + * + * Note that the above is safe vs a concurrent migrate_enable(), as any + * pending affinity completion is preceded by an uninstallation of + * p->migration_pending done with p->pi_lock held. + */ +static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, + int dest_cpu, unsigned int flags) +{ + struct set_affinity_pending my_pending = { }, *pending = NULL; + bool stop_pending, complete = false; + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + struct task_struct *push_task = NULL; + + if ((flags & SCA_MIGRATE_ENABLE) && + (p->migration_flags & MDF_PUSH) && !rq->push_busy) { + rq->push_busy = true; + push_task = get_task_struct(p); + } + + /* + * If there are pending waiters, but no pending stop_work, + * then complete now. + */ + pending = p->migration_pending; + if (pending && !pending->stop_pending) { + p->migration_pending = NULL; + complete = true; + } + + task_rq_unlock(rq, p, rf); + + if (push_task) { + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + p, &rq->push_work); + } + + if (complete) + complete_all(&pending->done); + + return 0; + } + + if (!(flags & SCA_MIGRATE_ENABLE)) { + /* serialized by p->pi_lock */ + if (!p->migration_pending) { + /* Install the request */ + refcount_set(&my_pending.refs, 1); + init_completion(&my_pending.done); + my_pending.arg = (struct migration_arg) { + .task = p, + .dest_cpu = dest_cpu, + .pending = &my_pending, + }; + + p->migration_pending = &my_pending; + } else { + pending = p->migration_pending; + refcount_inc(&pending->refs); + /* + * Affinity has changed, but we've already installed a + * pending. migration_cpu_stop() *must* see this, else + * we risk a completion of the pending despite having a + * task on a disallowed CPU. + * + * Serialized by p->pi_lock, so this is safe. + */ + pending->arg.dest_cpu = dest_cpu; + } + } + pending = p->migration_pending; + /* + * - !MIGRATE_ENABLE: + * we'll have installed a pending if there wasn't one already. + * + * - MIGRATE_ENABLE: + * we're here because the current CPU isn't matching anymore, + * the only way that can happen is because of a concurrent + * set_cpus_allowed_ptr() call, which should then still be + * pending completion. + * + * Either way, we really should have a @pending here. + */ + if (WARN_ON_ONCE(!pending)) { + task_rq_unlock(rq, p, rf); + return -EINVAL; + } + + if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { + /* + * MIGRATE_ENABLE gets here because 'p == current', but for + * anything else we cannot do is_migration_disabled(), punt + * and have the stopper function handle it all race-free. + */ + stop_pending = pending->stop_pending; + if (!stop_pending) + pending->stop_pending = true; + + if (flags & SCA_MIGRATE_ENABLE) + p->migration_flags &= ~MDF_PUSH; + + task_rq_unlock(rq, p, rf); + + if (!stop_pending) { + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + } + + if (flags & SCA_MIGRATE_ENABLE) + return 0; + } else { + + if (!is_migration_disabled(p)) { + if (task_on_rq_queued(p)) + rq = move_queued_task(rq, rf, p, dest_cpu); + + if (!pending->stop_pending) { + p->migration_pending = NULL; + complete = true; + } + } + task_rq_unlock(rq, p, rf); + + if (complete) + complete_all(&pending->done); + } + + wait_for_completion(&pending->done); + + if (refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); /* No UaF, just an address */ + + /* + * Block the original owner of &pending until all subsequent callers + * have seen the completion and decremented the refcount + */ + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + + /* ARGH */ + WARN_ON_ONCE(my_pending.stop_pending); + + return 0; +} + +/* + * Called with both p->pi_lock and rq->lock held; drops both before returning. */ -static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags, + struct rq *rq, + struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) { + const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; + bool kthread = p->flags & PF_KTHREAD; + struct cpumask *user_mask = NULL; unsigned int dest_cpu; - struct rq_flags rf; - struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD) { + if (kthread || is_migration_disabled(p)) { /* - * Kernel threads are allowed on online && !active CPUs + * Kernel threads are allowed on online && !active CPUs, + * however, during cpu-hot-unplug, even these might get pushed + * away if not KTHREAD_IS_PER_CPU. + * + * Specifically, migration_disabled() tasks must not fail the + * cpumask_any_and_distribute() pick below, esp. so on + * SCA_MIGRATE_ENABLE, otherwise we'll not call + * set_cpus_allowed_common() and actually reset p->cpus_ptr. */ cpu_valid_mask = cpu_online_mask; } + if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { + ret = -EINVAL; + goto out; + } + /* * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. */ - if (check && (p->flags & PF_NO_SETAFFINITY)) { + if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out; } - if (cpumask_equal(p->cpus_ptr, new_mask)) - goto out; + if (!(flags & SCA_MIGRATE_ENABLE)) { + if (cpumask_equal(&p->cpus_mask, new_mask)) + goto out; + + if (WARN_ON_ONCE(p == current && + is_migration_disabled(p) && + !cpumask_test_cpu(task_cpu(p), new_mask))) { + ret = -EBUSY; + goto out; + } + } - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + /* + * Picking a ~random cpu helps in cases where we are changing affinity + * for groups of tasks (ie. cpuset), so that load balancing is not + * immediately required to distribute the tasks within their new mask. + */ + dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } - do_set_cpus_allowed(p, new_mask); + __do_set_cpus_allowed(p, new_mask, flags); - if (p->flags & PF_KTHREAD) { - /* - * For kernel threads that do indeed end up on online && - * !active we want to ensure they are strict per-CPU threads. - */ - WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && - !cpumask_intersects(new_mask, cpu_active_mask) && - p->nr_cpus_allowed != 1); - } + if (flags & SCA_USER) + user_mask = clear_user_cpus_ptr(p); - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; + ret = affine_move_task(rq, p, rf, dest_cpu, flags); + + kfree(user_mask); + + return ret; - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &rf); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - return 0; - } else if (task_on_rq_queued(p)) { - /* - * OK, since we're going to drop the lock immediately - * afterwards anyway. - */ - rq = move_queued_task(rq, &rf, p, dest_cpu); - } out: - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, rf); return ret; } +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, u32 flags) +{ + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf); +} + int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - return __set_cpus_allowed_ptr(p, new_mask, false); + return __set_cpus_allowed_ptr(p, new_mask, 0); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +/* + * Change a given task's CPU affinity to the intersection of its current + * affinity mask and @subset_mask, writing the resulting mask to @new_mask + * and pointing @p->user_cpus_ptr to a copy of the old mask. + * If the resulting mask is empty, leave the affinity unchanged and return + * -EINVAL. + */ +static int restrict_cpus_allowed_ptr(struct task_struct *p, + struct cpumask *new_mask, + const struct cpumask *subset_mask) +{ + struct cpumask *user_mask = NULL; + struct rq_flags rf; + struct rq *rq; + int err; + + if (!p->user_cpus_ptr) { + user_mask = kmalloc(cpumask_size(), GFP_KERNEL); + if (!user_mask) + return -ENOMEM; + } + + rq = task_rq_lock(p, &rf); + + /* + * Forcefully restricting the affinity of a deadline task is + * likely to cause problems, so fail and noisily override the + * mask entirely. + */ + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + err = -EPERM; + goto err_unlock; + } + + if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { + err = -EINVAL; + goto err_unlock; + } + + /* + * We're about to butcher the task affinity, so keep track of what + * the user asked for in case we're able to restore it later on. + */ + if (user_mask) { + cpumask_copy(user_mask, p->cpus_ptr); + p->user_cpus_ptr = user_mask; + } + + return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf); + +err_unlock: + task_rq_unlock(rq, p, &rf); + kfree(user_mask); + return err; +} + +/* + * Restrict the CPU affinity of task @p so that it is a subset of + * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the + * old affinity mask. If the resulting mask is empty, we warn and walk + * up the cpuset hierarchy until we find a suitable mask. + */ +void force_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + cpumask_var_t new_mask; + const struct cpumask *override_mask = task_cpu_possible_mask(p); + + alloc_cpumask_var(&new_mask, GFP_KERNEL); + + /* + * __migrate_task() can fail silently in the face of concurrent + * offlining of the chosen destination CPU, so take the hotplug + * lock to ensure that the migration succeeds. + */ + cpus_read_lock(); + if (!cpumask_available(new_mask)) + goto out_set_mask; + + if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) + goto out_free_mask; + + /* + * We failed to find a valid subset of the affinity mask for the + * task, so override it based on its cpuset hierarchy. + */ + cpuset_cpus_allowed(p, new_mask); + override_mask = new_mask; + +out_set_mask: + if (printk_ratelimit()) { + printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", + task_pid_nr(p), p->comm, + cpumask_pr_args(override_mask)); + } + + WARN_ON(set_cpus_allowed_ptr(p, override_mask)); +out_free_mask: + cpus_read_unlock(); + free_cpumask_var(new_mask); +} + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); + +/* + * Restore the affinity of a task @p which was previously restricted by a + * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) + * @p->user_cpus_ptr. + * + * It is the caller's responsibility to serialise this with any calls to + * force_compatible_cpus_allowed_ptr(@p). + */ +void relax_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = p->user_cpus_ptr; + unsigned long flags; + + /* + * Try to restore the old affinity mask. If this fails, then + * we free the mask explicitly to avoid it being inherited across + * a subsequent fork(). + */ + if (!user_mask || !__sched_setaffinity(p, user_mask)) + return; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + user_mask = clear_user_cpus_ptr(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + kfree(user_mask); +} + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG + unsigned int state = READ_ONCE(p->__state); + /* * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. */ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !p->on_rq); + WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq); /* * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, * because schedstat_wait_{start,end} rebase migrating task's wait_start * time relying on p->on_rq. */ - WARN_ON_ONCE(p->state == TASK_RUNNING && + WARN_ON_ONCE(state == TASK_RUNNING && p->sched_class == &fair_sched_class && (p->on_rq && !task_on_rq_migrating(p))); @@ -1730,12 +3105,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * task_rq_lock(). */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock))); + lockdep_is_held(__rq_lockp(task_rq(p))))); #endif /* * Clearly, migrating tasks to offline CPUs is a fairly daft thing. */ WARN_ON_ONCE(!cpu_online(new_cpu)); + + WARN_ON_ONCE(is_migration_disabled(p)); #endif trace_sched_migrate_task(p, new_cpu); @@ -1871,12 +3248,12 @@ out: /* * wait_task_inactive - wait for a thread to unschedule. * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. + * Wait for the thread to block in any of the states set in @match_state. + * If it changes, i.e. @p might have woken up, then return zero. When we + * succeed in waiting for @p to be off its CPU, we return a positive number + * (its total switch count). If a second call a short while later returns the + * same number, the caller can be sure that @p has remained unscheduled the + * whole time. * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't @@ -1884,7 +3261,7 @@ out: * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) +unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) { int running, queued; struct rq_flags rf; @@ -1907,12 +3284,12 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * * NOTE! Since we don't hold any locks, it's not * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will + * But we don't care, since "task_on_cpu()" will * return false if the runqueue has changed and p * is actually now running somewhere else! */ - while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) + while (task_on_cpu(rq, p)) { + if (!(READ_ONCE(p->__state) & match_state)) return 0; cpu_relax(); } @@ -1924,10 +3301,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) */ rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); - running = task_running(rq, p); + running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); ncsw = 0; - if (!match_state || p->state == match_state) + if (READ_ONCE(p->__state) & match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); @@ -1961,7 +3338,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL); + schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); continue; } @@ -2040,9 +3417,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) + if (is_cpu_allowed(p, dest_cpu)) return dest_cpu; } } @@ -2059,17 +3434,21 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - if (IS_ENABLED(CONFIG_CPUSETS)) { - cpuset_cpus_allowed_fallback(p); + if (cpuset_cpus_allowed_fallback(p)) { state = possible; break; } - /* Fall-through */ + fallthrough; case possible: - do_set_cpus_allowed(p, cpu_possible_mask); + /* + * XXX When called from select_task_rq() we only + * hold p->pi_lock and again violate locking order. + * + * More yuck to audit. + */ + do_set_cpus_allowed(p, task_cpu_possible_mask(p)); state = fail; break; - case fail: BUG(); break; @@ -2096,12 +3475,12 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) + cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); else cpu = cpumask_any(p->cpus_ptr); @@ -2121,14 +3500,9 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) return cpu; } -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - void sched_set_stop_task(int cpu, struct task_struct *stop) { + static struct lock_class_key stop_pi_lock; struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; struct task_struct *old_stop = cpu_rq(cpu)->stop; @@ -2144,6 +3518,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); stop->sched_class = &stop_sched_class; + + /* + * The PI code calls rt_mutex_setprio() with ->pi_lock held to + * adjust the effective priority of a task. As a result, + * rt_mutex_setprio() can trigger (RT) balancing operations, + * which can then trigger wakeups of the stop thread to push + * around the current task. + * + * The stop task itself will never be part of the PI-chain, it + * never blocks, therefore that ->pi_lock recursion is safe. + * Tell lockdep about this by placing the stop->pi_lock in its + * own class. + */ + lockdep_set_class(&stop->pi_lock, &stop_pi_lock); } cpu_rq(cpu)->stop = stop; @@ -2157,15 +3545,23 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else +#else /* CONFIG_SMP */ static inline int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) + const struct cpumask *new_mask, + u32 flags) { return set_cpus_allowed_ptr(p, new_mask); } -#endif /* CONFIG_SMP */ +static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } + +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return false; +} + +#endif /* !CONFIG_SMP */ static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) @@ -2180,11 +3576,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); - __schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(p->stats.nr_wakeups_local); } else { struct sched_domain *sd; - __schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->stats.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { @@ -2196,14 +3592,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - __schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->stats.nr_wakeups_migrate); #endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); - __schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(p->stats.nr_wakeups); if (wake_flags & WF_SYNC) - __schedstat_inc(p->se.statistics.nr_wakeups_sync); + __schedstat_inc(p->stats.nr_wakeups_sync); } /* @@ -2213,13 +3609,13 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) { check_preempt_curr(rq, p, wake_flags); - p->state = TASK_RUNNING; + WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* - * Our task @p is fully woken up and running; so its safe to + * Our task @p is fully woken up and running; so it's safe to * drop the rq->lock, hereafter rq is only used for statistics. */ rq_unpin_lock(rq, rf); @@ -2236,6 +3632,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, if (rq->avg_idle > max) rq->avg_idle = max; + rq->wake_stamp = jiffies; + rq->wake_avg_idle = rq->avg_idle / 2; + rq->idle_stamp = 0; } #endif @@ -2247,27 +3646,51 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, { int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); -#ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; + else #endif + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } activate_task(rq, p, en_flags); ttwu_do_wakeup(rq, p, wake_flags, rf); } /* - * Called in case the task @p isn't fully descheduled from its runqueue, - * in this case we must do a remote wakeup. Its a 'light' wakeup though, - * since all we need to do is flip p->state to TASK_RUNNING, since - * the task is still ->on_rq. + * Consider @p being inside a wait loop: + * + * for (;;) { + * set_current_state(TASK_UNINTERRUPTIBLE); + * + * if (CONDITION) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * between set_current_state() and schedule(). In this case @p is still + * runnable, so all that needs doing is change p->state back to TASK_RUNNING in + * an atomic manner. + * + * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq + * then schedule() must still happen and p->state can be changed to + * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we + * need to do a full wakeup with enqueue. + * + * Returns: %true when the wakeup is done, + * %false otherwise. */ -static int ttwu_remote(struct task_struct *p, int wake_flags) +static int ttwu_runnable(struct task_struct *p, int wake_flags) { struct rq_flags rf; struct rq *rq; @@ -2286,75 +3709,63 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -void sched_ttwu_pending(void) +void sched_ttwu_pending(void *arg) { + struct llist_node *llist = arg; struct rq *rq = this_rq(); - struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p, *t; struct rq_flags rf; if (!llist) return; + /* + * rq::ttwu_pending racy indication of out-standing wakeups. + * Races such that false-negatives are possible, since they + * are shorter lived that false-positives would be. + */ + WRITE_ONCE(rq->ttwu_pending, 0); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - llist_for_each_entry_safe(p, t, llist, wake_entry) + llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { + if (WARN_ON_ONCE(p->on_cpu)) + smp_cond_load_acquire(&p->on_cpu, !VAL); + + if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) + set_task_cpu(p, cpu_of(rq)); + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); + } rq_unlock_irqrestore(rq, &rf); } -void scheduler_ipi(void) +void send_call_function_single_ipi(int cpu) { - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); - - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) - return; - - /* - * Not all reschedule IPI handlers call irq_enter/irq_exit, since - * traditionally all their work was done from the interrupt return - * path. Now that we actually do some work, we need to make sure - * we do call them. - * - * Some archs already do call them, luckily irq_enter/exit nest - * properly. - * - * Arguably we should visit all archs and update all handlers, - * however a fair share of IPIs are still resched only so this would - * somewhat pessimize the simple resched case. - */ - irq_enter(); - sched_ttwu_pending(); + struct rq *rq = cpu_rq(cpu); - /* - * Check if someone kicked us for doing the nohz idle load balance. - */ - if (unlikely(got_nohz_idle_kick())) { - this_rq()->idle_balance = 1; - raise_softirq_irqoff(SCHED_SOFTIRQ); - } - irq_exit(); + if (!set_nr_if_polling(rq->idle)) + arch_send_call_function_single_ipi(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } -static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +/* + * Queue a task on the target CPUs wake_list and wake the CPU via IPI if + * necessary. The wakee CPU on receipt of the IPI will queue the task + * via sched_ttwu_wakeup() for activation so the wakee incurs the cost + * of the wakeup instead of the waker. + */ +static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { - if (!set_nr_if_polling(rq->idle)) - smp_send_reschedule(cpu); - else - trace_sched_wake_idle_without_ipi(cpu); - } + WRITE_ONCE(rq->ttwu_pending, 1); + __smp_call_single_queue(cpu, &p->wake_entry.llist); } void wake_up_if_idle(int cpu) @@ -2367,15 +3778,11 @@ void wake_up_if_idle(int cpu) if (!is_idle_task(rcu_dereference(rq->curr))) goto out; - if (set_nr_if_polling(rq->idle)) { - trace_sched_wake_idle_without_ipi(cpu); - } else { - rq_lock_irqsave(rq, &rf); - if (is_idle_task(rq->curr)) - smp_send_reschedule(cpu); - /* Else CPU is not idle, do nothing here: */ - rq_unlock_irqrestore(rq, &rf); - } + rq_lock_irqsave(rq, &rf); + if (is_idle_task(rq->curr)) + resched_curr(rq); + /* Else CPU is not idle, do nothing here: */ + rq_unlock_irqrestore(rq, &rf); out: rcu_read_unlock(); @@ -2383,8 +3790,70 @@ out: bool cpus_share_cache(int this_cpu, int that_cpu) { + if (this_cpu == that_cpu) + return true; + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } + +static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) +{ + /* + * Do not complicate things with the async wake_list while the CPU is + * in hotplug state. + */ + if (!cpu_active(cpu)) + return false; + + /* Ensure the task will still be allowed to run on the CPU. */ + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + + /* + * If the CPU does not share cache, then queue the task on the + * remote rqs wakelist to avoid accessing remote data. + */ + if (!cpus_share_cache(smp_processor_id(), cpu)) + return true; + + if (cpu == smp_processor_id()) + return false; + + /* + * If the wakee cpu is idle, or the task is descheduling and the + * only running task on the CPU, then use the wakelist to offload + * the task activation to the idle (or soon-to-be-idle) CPU as + * the current CPU is likely busy. nr_running is checked to + * avoid unnecessary task stacking. + * + * Note that we can only get here with (wakee) p->on_rq=0, + * p->on_cpu can be whatever, we've done the dequeue, so + * the wakee has been accounted out of ->nr_running. + */ + if (!cpu_rq(cpu)->nr_running) + return true; + + return false; +} + +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) { + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ + __ttwu_queue_wakelist(p, cpu, wake_flags); + return true; + } + + return false; +} + +#else /* !CONFIG_SMP */ + +static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + return false; +} + #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2392,13 +3861,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; -#if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* Sync clocks across CPUs */ - ttwu_queue_remote(p, cpu, wake_flags); + if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; - } -#endif rq_lock(rq, &rf); update_rq_clock(rq); @@ -2407,6 +3871,55 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) } /* + * Invoked from try_to_wake_up() to check whether the task can be woken up. + * + * The caller holds p::pi_lock if p != current or has preemption + * disabled when p == current. + * + * The rules of PREEMPT_RT saved_state: + * + * The related locking code always holds p::pi_lock when updating + * p::saved_state, which means the code is fully serialized in both cases. + * + * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other + * bits set. This allows to distinguish all wakeup scenarios. + */ +static __always_inline +bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) +{ + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { + WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && + state != TASK_RTLOCK_WAIT); + } + + if (READ_ONCE(p->__state) & state) { + *success = 1; + return true; + } + +#ifdef CONFIG_PREEMPT_RT + /* + * Saved state preserves the task state across blocking on + * an RT lock. If the state matches, set p::saved_state to + * TASK_RUNNING, but do not wake the task because it waits + * for a lock wakeup. Also indicate success because from + * the regular waker's point of view this has succeeded. + * + * After acquiring the lock the task will restore p::__state + * from p::saved_state which ensures that the regular + * wakeup is not lost. The restore will also set + * p::saved_state to TASK_RUNNING so any further tests will + * not result in false positives vs. @success + */ + if (p->saved_state & state) { + p->saved_state = TASK_RUNNING; + *success = 1; + } +#endif + return false; +} + +/* * Notes on Program-Order guarantees on SMP systems. * * MIGRATION @@ -2454,8 +3967,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * migration. However the means are completely different as there is no lock * chain to provide order. Instead we do: * - * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_load_acquire(!X->on_cpu) + * 1) smp_store_release(X->on_cpu, 0) -- finish_task() + * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * * Example: * @@ -2495,15 +4008,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * If (@state & @p->state) @p->state = TASK_RUNNING. + * Conceptually does: + * + * If (@state & @p->state) @p->state = TASK_RUNNING. * * If the task was not queued/runnable, also place it back on a runqueue. * - * Atomic against schedule() which would dequeue a task, also see - * set_current_state(). + * This function is atomic against schedule() which would dequeue the task. + * + * It issues a full memory barrier before accessing @p->state, see the comment + * with set_current_state(). + * + * Uses p->pi_lock to serialize against concurrent wake-ups. * - * This function executes a full memory barrier before accessing the task - * state; see set_current_state(). + * Relies on p->pi_lock stabilizing: + * - p->sched_class + * - p->cpus_ptr + * - p->sched_task_group + * in order to do migration, see its use of select_task_rq()/set_task_cpu(). + * + * Tries really hard to only take one task_rq(p)->lock for performance. + * Takes rq->lock in: + * - ttwu_runnable() -- old rq, unavoidable, see comment there; + * - ttwu_queue() -- new rq, for enqueue of the task; + * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. + * + * As a consequence we race really badly with just about everything. See the + * many memory barriers and their comments for details. * * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. @@ -2519,7 +4050,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) * == smp_processor_id()'. Together this means we can special - * case the whole 'p->on_rq && ttwu_remote()' case below + * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * * In particular: @@ -2527,13 +4058,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - if (!(p->state & state)) + if (!ttwu_state_match(p, state, &success)) goto out; - success = 1; - cpu = task_cpu(p); trace_sched_waking(p); - p->state = TASK_RUNNING; + WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); goto out; } @@ -2541,20 +4070,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. + * reordered with p->state check below. This pairs with smp_store_mb() + * in set_current_state() that the waiting thread does. */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); - if (!(p->state & state)) + if (!ttwu_state_match(p, state, &success)) goto unlock; trace_sched_waking(p); - /* We're going to change ->state: */ - success = 1; - cpu = task_cpu(p); - /* * Ensure we load p->on_rq _after_ p->state, otherwise it would * be possible to, falsely, observe p->on_rq == 0 and get stuck @@ -2574,9 +4099,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (p->on_rq && ttwu_remote(p, wake_flags)) + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) goto unlock; #ifdef CONFIG_SMP @@ -2598,12 +4125,47 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). */ - smp_rmb(); + smp_acquire__after_ctrl_dep(); + + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + WRITE_ONCE(p->__state, TASK_WAKING); + + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. + */ + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) + goto unlock; /* * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, wait until its done referencing the task. + * this task as prev, wait until it's done referencing the task. * * Pairs with the smp_store_release() in finish_task(). * @@ -2612,28 +4174,19 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); if (task_cpu(p) != cpu) { + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } - -#else /* CONFIG_SMP */ - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - +#else + cpu = task_cpu(p); #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2641,13 +4194,105 @@ unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); out: if (success) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, task_cpu(p), wake_flags); preempt_enable(); return success; } /** + * task_call_func - Invoke a function on task in fixed state + * @p: Process for which the function is to be invoked, can be @current. + * @func: Function to invoke. + * @arg: Argument to function. + * + * Fix the task in it's current state by avoiding wakeups and or rq operations + * and call @func(@arg) on it. This function can use ->on_rq and task_curr() + * to work out what the state is, if required. Given that @func can be invoked + * with a runqueue lock held, it had better be quite lightweight. + * + * Returns: + * Whatever @func returns + */ +int task_call_func(struct task_struct *p, task_call_f func, void *arg) +{ + struct rq *rq = NULL; + unsigned int state; + struct rq_flags rf; + int ret; + + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + + state = READ_ONCE(p->__state); + + /* + * Ensure we load p->on_rq after p->__state, otherwise it would be + * possible to, falsely, observe p->on_rq == 0. + * + * See try_to_wake_up() for a longer comment. + */ + smp_rmb(); + + /* + * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when + * the task is blocked. Make sure to check @state since ttwu() can drop + * locks at the end, see ttwu_queue_wakelist(). + */ + if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq) + rq = __task_rq_lock(p, &rf); + + /* + * At this point the task is pinned; either: + * - blocked and we're holding off wakeups (pi->lock) + * - woken, and we're holding off enqueue (rq->lock) + * - queued, and we're holding off schedule (rq->lock) + * - running, and we're holding off de-schedule (rq->lock) + * + * The called function (@func) can use: task_curr(), p->on_rq and + * p->__state to differentiate between these states. + */ + ret = func(p, arg); + + if (rq) + rq_unlock(rq, &rf); + + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); + return ret; +} + +/** + * cpu_curr_snapshot - Return a snapshot of the currently running task + * @cpu: The CPU on which to snapshot the task. + * + * Returns the task_struct pointer of the task "currently" running on + * the specified CPU. If the same task is running on that CPU throughout, + * the return value will be a pointer to that task's task_struct structure. + * If the CPU did any context switches even vaguely concurrently with the + * execution of this function, the return value will be a pointer to the + * task_struct structure of a randomly chosen task that was running on + * that CPU somewhere around the time that this function was executing. + * + * If the specified CPU was offline, the return value is whatever it + * is, perhaps a pointer to the task_struct structure of that CPU's idle + * task, but there is no guarantee. Callers wishing a useful return + * value must take some action to ensure that the specified CPU remains + * online throughout. + * + * This function executes full memory barriers before and after fetching + * the pointer, which permits the caller to confine this function's fetch + * with respect to the caller's accesses to other shared variables. + */ +struct task_struct *cpu_curr_snapshot(int cpu) +{ + struct task_struct *t; + + smp_mb(); /* Pairing determined by caller's synchronization design. */ + t = rcu_dereference(cpu_curr(cpu)); + smp_mb(); /* Pairing determined by caller's synchronization design. */ + return t; +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -2693,7 +4338,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SCHEDSTATS /* Even if schedstat is disabled, there should not be garbage */ - memset(&p->se.statistics, 0, sizeof(p->se.statistics)); + memset(&p->stats, 0, sizeof(p->stats)); #endif RB_CLEAR_NODE(&p->dl.rb_node); @@ -2715,13 +4360,19 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); +#ifdef CONFIG_SMP + p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; +#endif } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); #ifdef CONFIG_NUMA_BALANCING -void set_numabalancing_state(bool enabled) +int sysctl_numa_balancing_mode; + +static void __set_numabalancing_state(bool enabled) { if (enabled) static_branch_enable(&sched_numa_balancing); @@ -2729,13 +4380,33 @@ void set_numabalancing_state(bool enabled) static_branch_disable(&sched_numa_balancing); } +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL; + else + sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED; + __set_numabalancing_state(enabled); +} + #ifdef CONFIG_PROC_SYSCTL +static void reset_memory_tiering(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) { + pgdat->nbp_threshold = 0; + pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + pgdat->nbp_th_start = jiffies_to_msecs(jiffies); + } +} + int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; - int state = static_branch_likely(&sched_numa_balancing); + int state = sysctl_numa_balancing_mode; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2745,8 +4416,13 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; - if (write) - set_numabalancing_state(state); + if (write) { + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + (state & NUMA_BALANCING_MEMORY_TIERING)) + reset_memory_tiering(); + sysctl_numa_balancing_mode = state; + __set_numabalancing_state(state); + } return err; } #endif @@ -2755,7 +4431,6 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #ifdef CONFIG_SCHEDSTATS DEFINE_STATIC_KEY_FALSE(sched_schedstats); -static bool __initdata __sched_schedstats = false; static void set_schedstats(bool enabled) { @@ -2779,16 +4454,11 @@ static int __init setup_schedstats(char *str) if (!str) goto out; - /* - * This code is called before jump labels have been set up, so we can't - * change the static branch directly just yet. Instead set a temporary - * variable so init_schedstats() can do it later. - */ if (!strcmp(str, "enable")) { - __sched_schedstats = true; + set_schedstats(true); ret = 1; } else if (!strcmp(str, "disable")) { - __sched_schedstats = false; + set_schedstats(false); ret = 1; } out: @@ -2799,14 +4469,9 @@ out: } __setup("schedstats=", setup_schedstats); -static void __init init_schedstats(void) -{ - set_schedstats(__sched_schedstats); -} - #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2825,24 +4490,66 @@ int sysctl_schedstats(struct ctl_table *table, int write, return err; } #endif /* CONFIG_PROC_SYSCTL */ -#else /* !CONFIG_SCHEDSTATS */ -static inline void init_schedstats(void) {} #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_core_sysctls[] = { +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_min_rt_default", + .data = &sysctl_sched_uclamp_util_min_rt_default, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif /* CONFIG_UCLAMP_TASK */ + {} +}; +static int __init sched_core_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_core_sysctls); + return 0; +} +late_initcall(sched_core_sysctl_init); +#endif /* CONFIG_SYSCTL */ + /* * fork()/clone()-time setup: */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { - unsigned long flags; - __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_NEW; + p->__state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2862,7 +4569,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); - p->prio = p->normal_prio = __normal_prio(p); + p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); /* @@ -2881,22 +4588,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); - /* - * The child is not yet in the pid-hash so no cgroup attach races, - * and the cgroup is pinned to this child due to cgroup_fork() - * is ran before sched_fork(). - * - * Silence PROVE_RCU. - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* - * We're setting the CPU for the first time, we don't migrate, - * so use __set_task_cpu(). - */ - __set_task_cpu(p, smp_processor_id()); - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) @@ -2913,6 +4604,40 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +{ + unsigned long flags; + + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. + */ + raw_spin_lock_irqsave(&p->pi_lock, flags); +#ifdef CONFIG_CGROUP_SCHED + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } +#endif + rseq_migrate(p); + /* + * We're setting the CPU for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, smp_processor_id()); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +void sched_post_fork(struct task_struct *p) +{ + uclamp_post_fork(p); +} + unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) @@ -2942,7 +4667,7 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - p->state = TASK_RUNNING; + WRITE_ONCE(p->__state, TASK_RUNNING); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -2953,7 +4678,8 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + rseq_migrate(p); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -2965,7 +4691,7 @@ void wake_up_new_task(struct task_struct *p) #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* - * Nothing relies on rq->lock after this, so its fine to + * Nothing relies on rq->lock after this, so it's fine to * drop it. */ rq_unpin_lock(rq, &rf); @@ -3069,8 +4795,11 @@ static inline void prepare_task(struct task_struct *next) /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. + * + * See the smp_load_acquire(&p->on_cpu) case in ttwu() and + * its ordering comment. */ - next->on_cpu = 1; + WRITE_ONCE(next->on_cpu, 1); #endif } @@ -3078,8 +4807,9 @@ static inline void finish_task(struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely + * This must be the very last reference to @prev from this CPU. After + * p->on_cpu is cleared, the task can be moved to a different CPU. We + * must ensure this doesn't happen until the switch is completely * finished. * * In particular, the load of prev->state in finish_task_switch() must @@ -3091,6 +4821,106 @@ static inline void finish_task(struct task_struct *prev) #endif } +#ifdef CONFIG_SMP + +static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) +{ + void (*func)(struct rq *rq); + struct balance_callback *next; + + lockdep_assert_rq_held(rq); + + while (head) { + func = (void (*)(struct rq *))head->func; + next = head->next; + head->next = NULL; + head = next; + + func(rq); + } +} + +static void balance_push(struct rq *rq); + +/* + * balance_push_callback is a right abuse of the callback interface and plays + * by significantly different rules. + * + * Where the normal balance_callback's purpose is to be ran in the same context + * that queued it (only later, when it's safe to drop rq->lock again), + * balance_push_callback is specifically targeted at __schedule(). + * + * This abuse is tolerated because it places all the unlikely/odd cases behind + * a single test, namely: rq->balance_callback == NULL. + */ +struct balance_callback balance_push_callback = { + .next = NULL, + .func = balance_push, +}; + +static inline struct balance_callback * +__splice_balance_callbacks(struct rq *rq, bool split) +{ + struct balance_callback *head = rq->balance_callback; + + if (likely(!head)) + return NULL; + + lockdep_assert_rq_held(rq); + /* + * Must not take balance_push_callback off the list when + * splice_balance_callbacks() and balance_callbacks() are not + * in the same rq->lock section. + * + * In that case it would be possible for __schedule() to interleave + * and observe the list empty. + */ + if (split && head == &balance_push_callback) + head = NULL; + else + rq->balance_callback = NULL; + + return head; +} + +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) +{ + return __splice_balance_callbacks(rq, true); +} + +static void __balance_callbacks(struct rq *rq) +{ + do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); +} + +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) +{ + unsigned long flags; + + if (unlikely(head)) { + raw_spin_rq_lock_irqsave(rq, flags); + do_balance_callbacks(rq, head); + raw_spin_rq_unlock_irqrestore(rq, flags); + } +} + +#else + +static inline void __balance_callbacks(struct rq *rq) +{ +} + +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) +{ + return NULL; +} + +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) +{ +} + +#endif + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { @@ -3101,10 +4931,10 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf * do an early lockdep release here: */ rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, _THIS_IP_); + spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = next; + rq_lockp(rq)->owner = next; #endif } @@ -3115,8 +4945,9 @@ static inline void finish_lock_switch(struct rq *rq) * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - raw_spin_unlock_irq(&rq->lock); + spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); + __balance_callbacks(rq); + raw_spin_rq_unlock_irq(rq); } /* @@ -3131,6 +4962,22 @@ static inline void finish_lock_switch(struct rq *rq) # define finish_arch_post_lock_switch() do { } while (0) #endif +static inline void kmap_local_sched_out(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_out(); +#endif +} + +static inline void kmap_local_sched_in(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_in(); +#endif +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -3153,6 +5000,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, perf_event_task_sched_out(prev, next); rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); + kmap_local_sched_out(); prepare_task(next); prepare_arch_switch(next); } @@ -3181,7 +5029,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; - long prev_state; + unsigned int prev_state; /* * The previous task will have left us with a preempt_count of 2 @@ -3212,13 +5060,22 @@ static struct rq *finish_task_switch(struct task_struct *prev) * running on another CPU and we could rave with its RUNNING -> DEAD * transition, resulting in a double drop. */ - prev_state = prev->state; + prev_state = READ_ONCE(prev->__state); vtime_task_switch(prev); perf_event_task_sched_in(prev, current); finish_task(prev); + tick_nohz_task_switch(); finish_lock_switch(rq); finish_arch_post_lock_switch(); kcov_finish_switch(current); + /* + * kmap_local_sched_out() is invoked with rq::lock held and + * interrupts disabled. There is no requirement for that, but the + * sched out code does not have an interrupt enabled section. + * Restoring the maps on sched in does not require interrupts being + * disabled either. + */ + kmap_local_sched_in(); fire_sched_in_preempt_notifiers(current); /* @@ -3235,65 +5092,21 @@ static struct rq *finish_task_switch(struct task_struct *prev) */ if (mm) { membarrier_mm_sync_core_before_usermode(mm); - mmdrop(mm); + mmdrop_sched(mm); } if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - /* Task is done with its stack. */ put_task_stack(prev); put_task_struct_rcu_user(prev); } - tick_nohz_task_switch(); return rq; } -#ifdef CONFIG_SMP - -/* rq->lock is NOT held, but preemption is disabled */ -static void __balance_callback(struct rq *rq) -{ - struct callback_head *head, *next; - void (*func)(struct rq *rq); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - head = rq->balance_callback; - rq->balance_callback = NULL; - while (head) { - func = (void (*)(struct rq *))head->func; - next = head->next; - head->next = NULL; - head = next; - - func(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -static inline void balance_callback(struct rq *rq) -{ - if (unlikely(rq->balance_callback)) - __balance_callback(rq); -} - -#else - -static inline void balance_callback(struct rq *rq) -{ -} - -#endif - /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. @@ -3301,8 +5114,6 @@ static inline void balance_callback(struct rq *rq) asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq; - /* * New tasks start with FORK_PREEMPT_COUNT, see there and * finish_task_switch() for details. @@ -3312,8 +5123,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) * PREEMPT_COUNT kernels). */ - rq = finish_task_switch(prev); - balance_callback(rq); + finish_task_switch(prev); preempt_enable(); if (current->set_child_tid) @@ -3364,6 +5174,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * finish_task_switch()'s mmdrop(). */ switch_mm_irqs_off(prev->active_mm, next->mm, next); + lru_gen_use_mm(next->mm); if (!prev->mm) { // from kernel /* will mmdrop() in finish_task_switch(). */ @@ -3389,9 +5200,9 @@ context_switch(struct rq *rq, struct task_struct *prev, * externally visible scheduler statistics: current number of runnable * threads, total number of context switches performed since bootup. */ -unsigned long nr_running(void) +unsigned int nr_running(void) { - unsigned long i, sum = 0; + unsigned int i, sum = 0; for_each_online_cpu(i) sum += cpu_rq(i)->nr_running; @@ -3436,13 +5247,13 @@ unsigned long long nr_context_switches(void) * it does become runnable. */ -unsigned long nr_iowait_cpu(int cpu) +unsigned int nr_iowait_cpu(int cpu) { return atomic_read(&cpu_rq(cpu)->nr_iowait); } /* - * IO-wait accounting, and how its mostly bollocks (on SMP). + * IO-wait accounting, and how it's mostly bollocks (on SMP). * * The idea behind IO-wait account is to account the idle time that we could * have spend running if it were not for IO. That is, if we were to improve the @@ -3471,9 +5282,9 @@ unsigned long nr_iowait_cpu(int cpu) * Task CPU affinities can make all that even more 'interesting'. */ -unsigned long nr_iowait(void) +unsigned int nr_iowait(void) { - unsigned long i, sum = 0; + unsigned int i, sum = 0; for_each_possible_cpu(i) sum += nr_iowait_cpu(i); @@ -3494,7 +5305,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); if (dest_cpu == smp_processor_id()) goto unlock; @@ -3578,6 +5389,55 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_SCHED_DEBUG +static u64 cpu_resched_latency(struct rq *rq) +{ + int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); + u64 resched_latency, now = rq_clock(rq); + static bool warned_once; + + if (sysctl_resched_latency_warn_once && warned_once) + return 0; + + if (!need_resched() || !latency_warn_ms) + return 0; + + if (system_state == SYSTEM_BOOTING) + return 0; + + if (!rq->last_seen_need_resched_ns) { + rq->last_seen_need_resched_ns = now; + rq->ticks_without_resched = 0; + return 0; + } + + rq->ticks_without_resched++; + resched_latency = now - rq->last_seen_need_resched_ns; + if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC) + return 0; + + warned_once = true; + + return resched_latency; +} + +static int __init setup_resched_latency_warn_ms(char *str) +{ + long val; + + if ((kstrtol(str, 0, &val))) { + pr_warn("Unable to set resched_latency_warn_ms\n"); + return 1; + } + + sysctl_resched_latency_warn_ms = val; + return 1; +} +__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); +#else +static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } +#endif /* CONFIG_SCHED_DEBUG */ + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3588,18 +5448,28 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; + unsigned long thermal_pressure; + u64 resched_latency; + arch_scale_freq_tick(); sched_clock_tick(); rq_lock(rq, &rf); update_rq_clock(rq); + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); + if (sched_feat(LATENCY_WARN)) + resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); - psi_task_tick(rq); + sched_core_tick(rq); rq_unlock(rq, &rf); + if (sched_feat(LATENCY_WARN) && resched_latency) + resched_latency_warn(cpu, resched_latency); + perf_event_task_tick(); #ifdef CONFIG_SMP @@ -3671,7 +5541,6 @@ static void sched_tick_remote(struct work_struct *work) if (cpu_is_offline(cpu)) goto out_unlock; - curr = rq->curr; update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -3706,7 +5575,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -3727,7 +5596,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -3858,8 +5727,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } if (panic_on_warn) panic("scheduling while atomic\n"); @@ -3876,10 +5744,13 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); + + if (task_scs_end_corrupted(prev)) + panic("corrupted shadow stack detected inside scheduler\n"); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP - if (!preempt && prev->state && prev->non_block_count) { + if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) { printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", prev->comm, prev->pid, prev->non_block_count); dump_stack(); @@ -3892,17 +5763,40 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); + SCHED_WARN_ON(ct_state() == CONTEXT_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq()->sched_count); } +static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + const struct sched_class *class; + /* + * We must do the balancing pass before put_prev_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. + */ + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); +} + /* * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; @@ -3910,18 +5804,17 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a - * higher scheduling class, because otherwise those loose the + * higher scheduling class, because otherwise those lose the * opportunity to pull in more work from other CPUs. */ - if (likely((prev->sched_class == &idle_sched_class || - prev->sched_class == &fair_sched_class) && + if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto restart; - /* Assumes fair_sched_class->next == idle_sched_class */ + /* Assume the next prioritized class is idle_sched_class */ if (!p) { put_prev_task(rq, prev); p = pick_next_task_idle(rq); @@ -3931,33 +5824,524 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } restart: -#ifdef CONFIG_SMP + put_prev_task_balance(rq, prev, rf); + + for_each_class(class) { + p = class->pick_next_task(rq); + if (p) + return p; + } + + BUG(); /* The idle class should always have a runnable task. */ +} + +#ifdef CONFIG_SCHED_CORE +static inline bool is_task_rq_idle(struct task_struct *t) +{ + return (task_rq(t)->idle == t); +} + +static inline bool cookie_equals(struct task_struct *a, unsigned long cookie) +{ + return is_task_rq_idle(a) || (a->core_cookie == cookie); +} + +static inline bool cookie_match(struct task_struct *a, struct task_struct *b) +{ + if (is_task_rq_idle(a) || is_task_rq_idle(b)) + return true; + + return a->core_cookie == b->core_cookie; +} + +static inline struct task_struct *pick_task(struct rq *rq) +{ + const struct sched_class *class; + struct task_struct *p; + + for_each_class(class) { + p = class->pick_task(rq); + if (p) + return p; + } + + BUG(); /* The idle class should always have a runnable task. */ +} + +extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); + +static void queue_core_balance(struct rq *rq); + +static struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + struct task_struct *next, *p, *max = NULL; + const struct cpumask *smt_mask; + bool fi_before = false; + bool core_clock_updated = (rq == rq->core); + unsigned long cookie; + int i, cpu, occ = 0; + struct rq *rq_i; + bool need_sync; + + if (!sched_core_enabled(rq)) + return __pick_next_task(rq, prev, rf); + + cpu = cpu_of(rq); + + /* Stopper task is switching into idle, no need core-wide selection. */ + if (cpu_is_offline(cpu)) { + /* + * Reset core_pick so that we don't enter the fastpath when + * coming online. core_pick would already be migrated to + * another cpu during offline. + */ + rq->core_pick = NULL; + return __pick_next_task(rq, prev, rf); + } + /* - * We must do the balancing pass before put_next_task(), such - * that when we release the rq->lock the task is in the same - * state as before we took rq->lock. + * If there were no {en,de}queues since we picked (IOW, the task + * pointers are all still valid), and we haven't scheduled the last + * pick yet, do so now. * - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. + * rq->core_pick can be NULL if no selection was made for a CPU because + * it was either offline or went offline during a sibling's core-wide + * selection. In this case, do a core-wide selection. */ - for_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) + if (rq->core->core_pick_seq == rq->core->core_task_seq && + rq->core->core_pick_seq != rq->core_sched_seq && + rq->core_pick) { + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); + + next = rq->core_pick; + if (next != prev) { + put_prev_task(rq, prev); + set_next_task(rq, next); + } + + rq->core_pick = NULL; + goto out; + } + + put_prev_task_balance(rq, prev, rf); + + smt_mask = cpu_smt_mask(cpu); + need_sync = !!rq->core->core_cookie; + + /* reset state */ + rq->core->core_cookie = 0UL; + if (rq->core->core_forceidle_count) { + if (!core_clock_updated) { + update_rq_clock(rq->core); + core_clock_updated = true; + } + sched_core_account_forceidle(rq); + /* reset after accounting force idle */ + rq->core->core_forceidle_start = 0; + rq->core->core_forceidle_count = 0; + rq->core->core_forceidle_occupation = 0; + need_sync = true; + fi_before = true; + } + + /* + * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq + * + * @task_seq guards the task state ({en,de}queues) + * @pick_seq is the @task_seq we did a selection on + * @sched_seq is the @pick_seq we scheduled + * + * However, preemptions can cause multiple picks on the same task set. + * 'Fix' this by also increasing @task_seq for every pick. + */ + rq->core->core_task_seq++; + + /* + * Optimize for common case where this CPU has no cookies + * and there are no cookied tasks running on siblings. + */ + if (!need_sync) { + next = pick_task(rq); + if (!next->core_cookie) { + rq->core_pick = NULL; + /* + * For robustness, update the min_vruntime_fi for + * unconstrained picks as well. + */ + WARN_ON_ONCE(fi_before); + task_vruntime_update(rq, next, false); + goto out_set_next; + } + } + + /* + * For each thread: do the regular task pick and find the max prio task + * amongst them. + * + * Tie-break prio towards the current CPU + */ + for_each_cpu_wrap(i, smt_mask, cpu) { + rq_i = cpu_rq(i); + + /* + * Current cpu always has its clock updated on entrance to + * pick_next_task(). If the current cpu is not the core, + * the core may also have been updated above. + */ + if (i != cpu && (rq_i != rq->core || !core_clock_updated)) + update_rq_clock(rq_i); + + p = rq_i->core_pick = pick_task(rq_i); + if (!max || prio_less(max, p, fi_before)) + max = p; + } + + cookie = rq->core->core_cookie = max->core_cookie; + + /* + * For each thread: try and find a runnable task that matches @max or + * force idle. + */ + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + p = rq_i->core_pick; + + if (!cookie_equals(p, cookie)) { + p = NULL; + if (cookie) + p = sched_core_find(rq_i, cookie); + if (!p) + p = idle_sched_class.pick_task(rq_i); + } + + rq_i->core_pick = p; + + if (p == rq_i->idle) { + if (rq_i->nr_running) { + rq->core->core_forceidle_count++; + if (!fi_before) + rq->core->core_forceidle_seq++; + } + } else { + occ++; + } + } + + if (schedstat_enabled() && rq->core->core_forceidle_count) { + rq->core->core_forceidle_start = rq_clock(rq->core); + rq->core->core_forceidle_occupation = occ; + } + + rq->core->core_pick_seq = rq->core->core_task_seq; + next = rq->core_pick; + rq->core_sched_seq = rq->core->core_pick_seq; + + /* Something should have been selected for current CPU */ + WARN_ON_ONCE(!next); + + /* + * Reschedule siblings + * + * NOTE: L1TF -- at this point we're no longer running the old task and + * sending an IPI (below) ensures the sibling will no longer be running + * their task. This ensures there is no inter-sibling overlap between + * non-matching user state. + */ + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + + /* + * An online sibling might have gone offline before a task + * could be picked for it, or it might be offline but later + * happen to come online, but its too late and nothing was + * picked for it. That's Ok - it will pick tasks for itself, + * so ignore it. + */ + if (!rq_i->core_pick) + continue; + + /* + * Update for new !FI->FI transitions, or if continuing to be in !FI: + * fi_before fi update? + * 0 0 1 + * 0 1 1 + * 1 0 1 + * 1 1 0 + */ + if (!(fi_before && rq->core->core_forceidle_count)) + task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count); + + rq_i->core_pick->core_occupation = occ; + + if (i == cpu) { + rq_i->core_pick = NULL; + continue; + } + + /* Did we break L1TF mitigation requirements? */ + WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick)); + + if (rq_i->curr == rq_i->core_pick) { + rq_i->core_pick = NULL; + continue; + } + + resched_curr(rq_i); + } + +out_set_next: + set_next_task(rq, next); +out: + if (rq->core->core_forceidle_count && next == rq->idle) + queue_core_balance(rq); + + return next; +} + +static bool try_steal_cookie(int this, int that) +{ + struct rq *dst = cpu_rq(this), *src = cpu_rq(that); + struct task_struct *p; + unsigned long cookie; + bool success = false; + + local_irq_disable(); + double_rq_lock(dst, src); + + cookie = dst->core->core_cookie; + if (!cookie) + goto unlock; + + if (dst->curr != dst->idle) + goto unlock; + + p = sched_core_find(src, cookie); + if (p == src->idle) + goto unlock; + + do { + if (p == src->core_pick || p == src->curr) + goto next; + + if (!is_cpu_allowed(p, this)) + goto next; + + if (p->core_occupation > dst->idle->core_occupation) + goto next; + + deactivate_task(src, p, 0); + set_task_cpu(p, this); + activate_task(dst, p, 0); + + resched_curr(dst); + + success = true; + break; + +next: + p = sched_core_next(p, cookie); + } while (p); + +unlock: + double_rq_unlock(dst, src); + local_irq_enable(); + + return success; +} + +static bool steal_cookie_task(int cpu, struct sched_domain *sd) +{ + int i; + + for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { + if (i == cpu) + continue; + + if (need_resched()) break; + + if (try_steal_cookie(cpu, i)) + return true; } -#endif - put_prev_task(rq, prev); + return false; +} - for_each_class(class) { - p = class->pick_next_task(rq); - if (p) - return p; +static void sched_core_balance(struct rq *rq) +{ + struct sched_domain *sd; + int cpu = cpu_of(rq); + + preempt_disable(); + rcu_read_lock(); + raw_spin_rq_unlock_irq(rq); + for_each_domain(cpu, sd) { + if (need_resched()) + break; + + if (steal_cookie_task(cpu, sd)) + break; } + raw_spin_rq_lock_irq(rq); + rcu_read_unlock(); + preempt_enable(); +} - /* The idle class should always have a runnable task: */ - BUG(); +static DEFINE_PER_CPU(struct balance_callback, core_balance_head); + +static void queue_core_balance(struct rq *rq) +{ + if (!sched_core_enabled(rq)) + return; + + if (!rq->core->core_cookie) + return; + + if (!rq->nr_running) /* not forced idle */ + return; + + queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); +} + +static void sched_core_cpu_starting(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t; + + sched_core_lock(cpu, &flags); + + WARN_ON_ONCE(rq->core != rq); + + /* if we're the first, we'll be our own leader */ + if (cpumask_weight(smt_mask) == 1) + goto unlock; + + /* find the leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + rq = cpu_rq(t); + if (rq->core == rq) { + core_rq = rq; + break; + } + } + + if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ + goto unlock; + + /* install and validate core_rq */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); + + if (t == cpu) + rq->core = core_rq; + + WARN_ON_ONCE(rq->core != core_rq); + } + +unlock: + sched_core_unlock(cpu, &flags); +} + +static void sched_core_cpu_deactivate(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t; + + sched_core_lock(cpu, &flags); + + /* if we're the last man standing, nothing to do */ + if (cpumask_weight(smt_mask) == 1) { + WARN_ON_ONCE(rq->core != rq); + goto unlock; + } + + /* if we're not the leader, nothing to do */ + if (rq->core != rq) + goto unlock; + + /* find a new leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + core_rq = cpu_rq(t); + break; + } + + if (WARN_ON_ONCE(!core_rq)) /* impossible */ + goto unlock; + + /* copy the shared state to the new leader */ + core_rq->core_task_seq = rq->core_task_seq; + core_rq->core_pick_seq = rq->core_pick_seq; + core_rq->core_cookie = rq->core_cookie; + core_rq->core_forceidle_count = rq->core_forceidle_count; + core_rq->core_forceidle_seq = rq->core_forceidle_seq; + core_rq->core_forceidle_occupation = rq->core_forceidle_occupation; + + /* + * Accounting edge for forced idle is handled in pick_next_task(). + * Don't need another one here, since the hotplug thread shouldn't + * have a cookie. + */ + core_rq->core_forceidle_start = 0; + + /* install new leader */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); + rq->core = core_rq; + } + +unlock: + sched_core_unlock(cpu, &flags); +} + +static inline void sched_core_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->core != rq) + rq->core = rq; } +#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_cpu_starting(unsigned int cpu) {} +static inline void sched_core_cpu_deactivate(unsigned int cpu) {} +static inline void sched_core_cpu_dying(unsigned int cpu) {} + +static struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return __pick_next_task(rq, prev, rf); +} + +#endif /* CONFIG_SCHED_CORE */ + +/* + * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a + * preemption from blocking on an 'sleeping' spin/rwlock. Note that + * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to + * optimize the AND operation out and just check for zero. + */ +#define SM_NONE 0x0 +#define SM_PREEMPT 0x1 +#define SM_RTLOCK_WAIT 0x2 + +#ifndef CONFIG_PREEMPT_RT +# define SM_MASK_PREEMPT (~0U) +#else +# define SM_MASK_PREEMPT SM_PREEMPT +#endif + /* * __schedule() is the main scheduler function. * @@ -3997,10 +6381,11 @@ restart: * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(bool preempt) +static void __sched notrace __schedule(unsigned int sched_mode) { struct task_struct *prev, *next; unsigned long *switch_count; + unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; @@ -4009,20 +6394,27 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev, preempt); + schedule_debug(prev, !!sched_mode); - if (sched_feat(HRTICK)) + if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(preempt); + rcu_note_context_switch(!!sched_mode); /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). + * done by the caller to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) * - * The membarrier system call requires a full memory barrier + * Also, the membarrier system call requires a full memory barrier * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); @@ -4033,10 +6425,35 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (signal_pending_state(prev->state, prev)) { - prev->state = TASK_RUNNING; + + /* + * We must load prev->state once (task_struct::state is volatile), such + * that we form a control dependency vs deactivate_task() below. + */ + prev_state = READ_ONCE(prev->__state); + if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { + if (signal_pending_state(prev_state, prev)) { + WRITE_ONCE(prev->__state, TASK_RUNNING); } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev_state & TASK_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); if (prev->in_iowait) { @@ -4050,6 +6467,9 @@ static void __sched notrace __schedule(bool preempt) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); +#ifdef CONFIG_SCHED_DEBUG + rq->last_seen_need_resched_ns = 0; +#endif if (likely(prev != next)) { rq->nr_switches++; @@ -4074,16 +6494,20 @@ static void __sched notrace __schedule(bool preempt) */ ++*switch_count; - trace_sched_switch(preempt, prev, next); + migrate_disable_switch(rq, prev); + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unlock_irq(rq, &rf); - } - balance_callback(rq); + rq_unpin_lock(rq, &rf); + __balance_callbacks(rq); + raw_spin_rq_unlock_irq(rq); + } } void __noreturn do_task_dead(void) @@ -4094,7 +6518,7 @@ void __noreturn do_task_dead(void) /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; - __schedule(false); + __schedule(SM_NONE); BUG(); /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ @@ -4104,34 +6528,35 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state) + unsigned int task_flags; + + if (task_is_running(tsk)) return; + task_flags = tsk->flags; /* - * If a worker went to sleep, notify and ask workqueue whether - * it wants to wake up a task to maintain concurrency. - * As this function is called inside the schedule() context, - * we disable preemption to avoid it calling schedule() again - * in the possible wakeup of a kworker. + * If a worker goes to sleep, notify and ask workqueue whether it + * wants to wake up a task to maintain concurrency. */ - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { - preempt_disable(); - if (tsk->flags & PF_WQ_WORKER) + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (task_flags & PF_WQ_WORKER) wq_worker_sleeping(tsk); else io_wq_worker_sleeping(tsk); - preempt_enable_no_resched(); } - if (tsk_is_pi_blocked(tsk)) - return; + /* + * spinlock and rwlock must not flush block requests. This will + * deadlock if the callback attempts to acquire a lock which is + * already acquired. + */ + SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. */ - if (blk_needs_flush_plug(tsk)) - blk_schedule_flush_plug(tsk); + blk_flush_plug(tsk->plug, true); } static void sched_update_worker(struct task_struct *tsk) @@ -4151,7 +6576,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); - __schedule(false); + __schedule(SM_NONE); sched_preempt_enable_no_resched(); } while (need_resched()); sched_update_worker(tsk); @@ -4177,13 +6602,13 @@ void __sched schedule_idle(void) * current task can be in any other state. Note, idle is always in the * TASK_RUNNING state. */ - WARN_ON_ONCE(current->state); + WARN_ON_ONCE(current->__state); do { - __schedule(false); + __schedule(SM_NONE); } while (need_resched()); } -#ifdef CONFIG_CONTEXT_TRACKING +#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) asmlinkage __visible void __sched schedule_user(void) { /* @@ -4214,6 +6639,18 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +#ifdef CONFIG_PREEMPT_RT +void __sched notrace schedule_rtlock(void) +{ + do { + preempt_disable(); + __schedule(SM_RTLOCK_WAIT); + sched_preempt_enable_no_resched(); + } while (need_resched()); +} +NOKPROBE_SYMBOL(schedule_rtlock); +#endif + static void __sched notrace preempt_schedule_common(void) { do { @@ -4232,7 +6669,7 @@ static void __sched notrace preempt_schedule_common(void) */ preempt_disable_notrace(); preempt_latency_start(1); - __schedule(true); + __schedule(SM_PREEMPT); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); @@ -4256,12 +6693,32 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; - preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); +#ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#ifndef preempt_schedule_dynamic_enabled +#define preempt_schedule_dynamic_enabled preempt_schedule +#define preempt_schedule_dynamic_disabled NULL +#endif +DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); +void __sched notrace dynamic_preempt_schedule(void) +{ + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule)) + return; + preempt_schedule(); +} +NOKPROBE_SYMBOL(dynamic_preempt_schedule); +EXPORT_SYMBOL(dynamic_preempt_schedule); +#endif +#endif + /** * preempt_schedule_notrace - preempt_schedule called by tracing * @@ -4305,7 +6762,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) * an infinite recursion. */ prev_ctx = exception_enter(); - __schedule(true); + __schedule(SM_PREEMPT); exception_exit(prev_ctx); preempt_latency_stop(1); @@ -4314,6 +6771,27 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); +#ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#ifndef preempt_schedule_notrace_dynamic_enabled +#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +#define preempt_schedule_notrace_dynamic_disabled NULL +#endif +DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); +void __sched notrace dynamic_preempt_schedule_notrace(void) +{ + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace)) + return; + preempt_schedule_notrace(); +} +NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); +EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); +#endif +#endif + #endif /* CONFIG_PREEMPTION */ /* @@ -4334,7 +6812,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) do { preempt_disable(); local_irq_enable(); - __schedule(true); + __schedule(SM_PREEMPT); local_irq_disable(); sched_preempt_enable_no_resched(); } while (need_resched()); @@ -4345,10 +6823,23 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); +static void __setscheduler_prio(struct task_struct *p, int prio) +{ + if (dl_prio(prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + + p->prio = prio; +} + #ifdef CONFIG_RT_MUTEXES static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) @@ -4404,7 +6895,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to * ensure a task is de-boosted (pi_task is set to NULL) before the * task is allowed to run again (and can exit). This ensures the pointer - * points to a blocked task -- which guaratees the task is present. + * points to a blocked task -- which guarantees the task is present. */ p->pi_top_task = pi_task; @@ -4457,27 +6948,26 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) */ if (dl_prio(prio)) { if (!dl_prio(p->normal_prio) || - (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.dl_boosted = 1; + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.pi_se = pi_task->dl.pi_se; queue_flag |= ENQUEUE_REPLENISH; - } else - p->dl.dl_boosted = 0; - p->sched_class = &dl_sched_class; + } else { + p->dl.pi_se = &p->dl; + } } else if (rt_prio(prio)) { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; - p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) p->rt.timeout = 0; - p->sched_class = &fair_sched_class; } - p->prio = prio; + __setscheduler_prio(p, prio); if (queued) enqueue_task(rq, p, queue_flag); @@ -4488,9 +6978,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) out_unlock: /* Avoid rq from going away on us: */ preempt_disable(); - __task_rq_unlock(rq, &rf); - balance_callback(rq); + rq_unpin_lock(rq, &rf); + __balance_callbacks(rq); + raw_spin_rq_unlock(rq); + preempt_enable(); } #else @@ -4519,7 +7011,7 @@ void set_user_nice(struct task_struct *p, long nice) /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected - * it wont have any effect on scheduling until the task is + * it won't have any effect on scheduling until the task is * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: */ if (task_has_dl_policy(p) || task_has_rt_policy(p)) { @@ -4555,17 +7047,29 @@ out_unlock: EXPORT_SYMBOL(set_user_nice); /* - * can_nice - check if a task can reduce its nice value + * is_nice_reduction - check if nice value is an actual reduction + * + * Similar to can_nice() but does not perform a capability check. + * * @p: task * @nice: nice value */ -int can_nice(const struct task_struct *p, const int nice) +static bool is_nice_reduction(const struct task_struct *p, const int nice) { /* Convert nice value [19,-20] to rlimit style value [1,40]: */ int nice_rlim = nice_to_rlimit(nice); - return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || - capable(CAP_SYS_NICE)); + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); +} + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ + return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE); } #ifdef __ARCH_WANT_SYS_NICE @@ -4608,8 +7112,12 @@ SYSCALL_DEFINE1(nice, int, increment) * @p: the task in question. * * Return: The priority value as seen by users in /proc. - * RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from -16 to +15. + * + * sched policy return value kernel prio user prio/nice + * + * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19] + * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99] + * deadline -101 -1 0 */ int task_prio(const struct task_struct *p) { @@ -4633,7 +7141,7 @@ int idle_cpu(int cpu) return 0; #ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) + if (rq->ttwu_pending) return 0; #endif @@ -4668,6 +7176,121 @@ struct task_struct *idle_task(int cpu) return cpu_rq(cpu)->idle; } +#ifdef CONFIG_SMP +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the irq utilization. + * + * The DL bandwidth number otoh is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + enum cpu_util_type type, + struct task_struct *p) +{ + unsigned long dl_util, util, irq, max; + struct rq *rq = cpu_rq(cpu); + + max = arch_scale_cpu_capacity(cpu); + + if (!uclamp_is_used() && + type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { + return max; + } + + /* + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). + */ + irq = cpu_util_irq(rq); + if (unlikely(irq >= max)) + return max; + + /* + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + * + * CFS and RT utilization can be boosted or capped, depending on + * utilization clamp constraints requested by currently RUNNABLE + * tasks. + * When there are no CFS RUNNABLE tasks, clamps are released and + * frequency will be gracefully reduced with the utilization decay. + */ + util = util_cfs + cpu_util_rt(rq); + if (type == FREQUENCY_UTIL) + util = uclamp_rq_util_with(rq, util, p); + + dl_util = cpu_util_dl(rq); + + /* + * For frequency selection we do not make cpu_util_dl() a permanent part + * of this sum because we want to use cpu_bw_dl() later on, but we need + * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such + * that we select f_max when there is no idle time. + * + * NOTE: numerical errors or stop class might cause us to not quite hit + * saturation when we should -- something for later. + */ + if (util + dl_util >= max) + return max; + + /* + * OTOH, for energy computation we need the estimated running time, so + * include util_dl and ignore dl_bw. + */ + if (type == ENERGY_UTIL) + util += dl_util; + + /* + * There is still idle time; further improve the number by using the + * irq metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: + * + * max - irq + * U' = irq + --------- * U + * max + */ + util = scale_irq_capacity(util, irq, max); + util += irq; + + /* + * Bandwidth required by DEADLINE must always be granted while, for + * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism + * to gracefully reduce the frequency when no tasks show up for longer + * periods of time. + * + * Ideally we would like to set bw_dl as min/guaranteed freq and util + + * bw_dl as requested freq. However, cpufreq is not yet ready for such + * an interface. So, we only do the latter for now. + */ + if (type == FREQUENCY_UTIL) + util += cpu_bw_dl(rq); + + return min(max, util); +} + +unsigned long sched_cpu_util(int cpu) +{ + return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL); +} +#endif /* CONFIG_SMP */ + /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. @@ -4710,35 +7333,6 @@ static void __setscheduler_params(struct task_struct *p, set_load_weight(p, true); } -/* Actually do priority change: must hold pi & rq lock. */ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr, bool keep_boost) -{ - /* - * If params can't change scheduling class changes aren't allowed - * either. - */ - if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) - return; - - __setscheduler_params(p, attr); - - /* - * Keep a potential priority boosting if called from - * sched_setscheduler(). - */ - p->prio = normal_prio(p); - if (keep_boost) - p->prio = rt_effective_prio(p, p->prio); - - if (dl_prio(p->prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(p->prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; -} - /* * Check the target process has a UID that matches the current process's: */ @@ -4755,15 +7349,77 @@ static bool check_same_owner(struct task_struct *p) return match; } +/* + * Allow unprivileged RT tasks to decrease priority. + * Only issue a capable test if needed and only once to avoid an audit + * event on permitted non-privileged operations: + */ +static int user_check_sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + int policy, int reset_on_fork) +{ + if (fair_policy(policy)) { + if (attr->sched_nice < task_nice(p) && + !is_nice_reduction(p, attr->sched_nice)) + goto req_priv; + } + + if (rt_policy(policy)) { + unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); + + /* Can't set/change the rt policy: */ + if (policy != p->policy && !rlim_rtprio) + goto req_priv; + + /* Can't increase priority: */ + if (attr->sched_priority > p->rt_priority && + attr->sched_priority > rlim_rtprio) + goto req_priv; + } + + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + goto req_priv; + + /* + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. + */ + if (task_has_idle_policy(p) && !idle_policy(policy)) { + if (!is_nice_reduction(p, task_nice(p))) + goto req_priv; + } + + /* Can't change other user's priorities: */ + if (!check_same_owner(p)) + goto req_priv; + + /* Normal users shall not reset the sched_reset_on_fork flag: */ + if (p->sched_reset_on_fork && !reset_on_fork) + goto req_priv; + + return 0; + +req_priv: + if (!capable(CAP_SYS_NICE)) + return -EPERM; + + return 0; +} + static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi) { - int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : - MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, queued, running; - int new_effective_prio, policy = attr->sched_policy; + int oldpolicy = -1, policy = attr->sched_policy; + int retval, oldprio, newprio, queued, running; const struct sched_class *prev_class; + struct balance_callback *head; struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; @@ -4788,68 +7444,20 @@ recheck: /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. */ - if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) + if (attr->sched_priority > MAX_RT_PRIO-1) return -EINVAL; if ((dl_policy(policy) && !__checkparam_dl(attr)) || (rt_policy(policy) != (attr->sched_priority != 0))) return -EINVAL; - /* - * Allow unprivileged RT tasks to decrease priority: - */ - if (user && !capable(CAP_SYS_NICE)) { - if (fair_policy(policy)) { - if (attr->sched_nice < task_nice(p) && - !can_nice(p, attr->sched_nice)) - return -EPERM; - } - - if (rt_policy(policy)) { - unsigned long rlim_rtprio = - task_rlimit(p, RLIMIT_RTPRIO); - - /* Can't set/change the rt policy: */ - if (policy != p->policy && !rlim_rtprio) - return -EPERM; - - /* Can't increase priority: */ - if (attr->sched_priority > p->rt_priority && - attr->sched_priority > rlim_rtprio) - return -EPERM; - } - - /* - * Can't set/change SCHED_DEADLINE policy at all for now - * (safest behavior); in the future we would like to allow - * unprivileged DL tasks to increase their relative deadline - * or reduce their runtime (both ways reducing utilization) - */ - if (dl_policy(policy)) - return -EPERM; - - /* - * Treat SCHED_IDLE as nice 20. Only allow a switch to - * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. - */ - if (task_has_idle_policy(p) && !idle_policy(policy)) { - if (!can_nice(p, task_nice(p))) - return -EPERM; - } - - /* Can't change other user's priorities: */ - if (!check_same_owner(p)) - return -EPERM; - - /* Normal users shall not reset the sched_reset_on_fork flag: */ - if (p->sched_reset_on_fork && !reset_on_fork) - return -EPERM; - } - if (user) { + retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork); + if (retval) + return retval; + if (attr->sched_flags & SCHED_FLAG_SUGOV) return -EINVAL; @@ -4960,6 +7568,7 @@ change: p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; + newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice); if (pi) { /* * Take priority boosted tasks into account. If the new @@ -4968,8 +7577,8 @@ change: * the runqueue. This will be done when the task deboost * itself. */ - new_effective_prio = rt_effective_prio(p, newprio); - if (new_effective_prio == oldprio) + newprio = rt_effective_prio(p, newprio); + if (newprio == oldprio) queue_flags &= ~DEQUEUE_MOVE; } @@ -4982,7 +7591,10 @@ change: prev_class = p->sched_class; - __setscheduler(rq, p, attr, pi); + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } __setscheduler_uclamp(p, attr); if (queued) { @@ -5002,6 +7614,7 @@ change: /* Avoid rq from going away on us: */ preempt_disable(); + head = splice_balance_callbacks(rq); task_rq_unlock(rq, p, &rf); if (pi) { @@ -5010,7 +7623,7 @@ change: } /* Run balance callbacks after we've adjusted the PI chain: */ - balance_callback(rq); + balance_callbacks(rq, head); preempt_enable(); return 0; @@ -5046,6 +7659,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, * @policy: new policy. * @param: structure containing the new RT priority. * + * Use sched_set_fifo(), read its comment. + * * Return: 0 on success. An error code otherwise. * * NOTE that the task may be already dead. @@ -5055,18 +7670,17 @@ int sched_setscheduler(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, true); } -EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setattr(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, true, true); } -EXPORT_SYMBOL_GPL(sched_setattr); int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. @@ -5086,7 +7700,51 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); + +/* + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally + * incapable of resource management, which is the one thing an OS really should + * be doing. + * + * This is of course the reason it is limited to privileged users only. + * + * Worse still; it is fundamentally impossible to compose static priority + * workloads. You cannot take two correctly working static prio workloads + * and smash them together and still expect them to work. + * + * For this reason 'all' FIFO tasks the kernel creates are basically at: + * + * MAX_RT_PRIO / 2 + * + * The administrator _MUST_ configure the system, the kernel simply doesn't + * know enough information to make a sensible choice. + */ +void sched_set_fifo(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo); + +/* + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. + */ +void sched_set_fifo_low(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo_low); + +void sched_set_normal(struct task_struct *p, int nice) +{ + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + .sched_nice = nice, + }; + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_normal); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) @@ -5160,6 +7818,16 @@ err_size: return -E2BIG; } +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ + if (task_has_dl_policy(p)) + __getparam_dl(p, attr); + else if (task_has_rt_policy(p)) + attr->sched_priority = p->rt_priority; + else + attr->sched_nice = task_nice(p); +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. @@ -5221,6 +7889,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, rcu_read_unlock(); if (likely(p)) { + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); retval = sched_setattr(p, &attr); put_task_struct(p); } @@ -5369,14 +8039,15 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - if (task_has_dl_policy(p)) - __getparam_dl(p, &kattr); - else if (task_has_rt_policy(p)) - kattr.sched_priority = p->rt_priority; - else - kattr.sched_nice = task_nice(p); + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif @@ -5390,9 +8061,76 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +#ifdef CONFIG_SMP +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) +{ + int ret = 0; + + /* + * If the task isn't a deadline task or admission control is + * disabled then we don't care about affinity changes. + */ + if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) + return 0; + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, mask)) + ret = -EBUSY; + rcu_read_unlock(); + return ret; +} +#endif + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) { + int retval; cpumask_var_t cpus_allowed, new_mask; + + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) + return -ENOMEM; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, mask, cpus_allowed); + + retval = dl_task_check_affinity(p, new_mask); + if (retval) + goto out_free_new_mask; +again: + retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); + if (retval) + goto out_free_new_mask; + + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset update. + * Just reset the cpumask to the cpuset's cpus_allowed. + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } + +out_free_new_mask: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ struct task_struct *p; int retval; @@ -5412,68 +8150,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) retval = -EINVAL; goto out_put_task; } - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; - } - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - retval = -EPERM; + if (!check_same_owner(p)) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); - goto out_free_new_mask; + retval = -EPERM; + goto out_put_task; } rcu_read_unlock(); } retval = security_task_setscheduler(p); if (retval) - goto out_free_new_mask; - - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); - - /* - * Since bandwidth control happens on root_domain basis, - * if admission test is enabled, we only admit -deadline - * tasks allowed to run on all the CPUs in the task's - * root_domain. - */ -#ifdef CONFIG_SMP - if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { - rcu_read_lock(); - if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { - retval = -EBUSY; - rcu_read_unlock(); - goto out_free_new_mask; - } - rcu_read_unlock(); - } -#endif -again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); + goto out_put_task; - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } - } -out_free_new_mask: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); + retval = __sched_setaffinity(p, in_mask); out_put_task: put_task_struct(p); return retval; @@ -5578,14 +8270,6 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, return ret; } -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. - * - * Return: 0. - */ static void do_sched_yield(void) { struct rq_flags rf; @@ -5596,34 +8280,83 @@ static void do_sched_yield(void) schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ preempt_disable(); - rq_unlock(rq, &rf); + rq_unlock_irq(rq, &rf); sched_preempt_enable_no_resched(); schedule(); } +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. + * + * Return: 0. + */ SYSCALL_DEFINE0(sched_yield) { do_sched_yield(); return 0; } -#ifndef CONFIG_PREEMPTION -int __sched _cond_resched(void) +#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) +int __sched __cond_resched(void) { if (should_resched(0)) { preempt_schedule_common(); return 1; } + /* + * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * whether the current CPU is in an RCU read-side critical section, + * so the tick can report quiescent states even for CPUs looping + * in kernel context. In contrast, in non-preemptible kernels, + * RCU readers leave no in-memory hints, which means that CPU-bound + * processes executing in kernel context might never report an + * RCU quiescent state. Therefore, the following code causes + * cond_resched() to report a quiescent state, but only when RCU + * is in urgent need of one. + */ +#ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); +#endif return 0; } -EXPORT_SYMBOL(_cond_resched); +EXPORT_SYMBOL(__cond_resched); +#endif + +#ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define cond_resched_dynamic_enabled __cond_resched +#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) +DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); +EXPORT_STATIC_CALL_TRAMP(cond_resched); + +#define might_resched_dynamic_enabled __cond_resched +#define might_resched_dynamic_disabled ((void *)&__static_call_return0) +DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); +EXPORT_STATIC_CALL_TRAMP(might_resched); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); +int __sched dynamic_cond_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_cond_resched)) + return 0; + return __cond_resched(); +} +EXPORT_SYMBOL(dynamic_cond_resched); + +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched); +int __sched dynamic_might_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_might_resched)) + return 0; + return __cond_resched(); +} +EXPORT_SYMBOL(dynamic_might_resched); +#endif #endif /* @@ -5643,9 +8376,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; spin_lock(lock); @@ -5654,6 +8385,202 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); +int __cond_resched_rwlock_read(rwlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held_read(lock); + + if (rwlock_needbreak(lock) || resched) { + read_unlock(lock); + if (!_cond_resched()) + cpu_relax(); + ret = 1; + read_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_rwlock_read); + +int __cond_resched_rwlock_write(rwlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held_write(lock); + + if (rwlock_needbreak(lock) || resched) { + write_unlock(lock); + if (!_cond_resched()) + cpu_relax(); + ret = 1; + write_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_rwlock_write); + +#ifdef CONFIG_PREEMPT_DYNAMIC + +#ifdef CONFIG_GENERIC_ENTRY +#include <linux/entry-common.h> +#endif + +/* + * SC:cond_resched + * SC:might_resched + * SC:preempt_schedule + * SC:preempt_schedule_notrace + * SC:irqentry_exit_cond_resched + * + * + * NONE: + * cond_resched <- __cond_resched + * might_resched <- RET0 + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * VOLUNTARY: + * cond_resched <- __cond_resched + * might_resched <- __cond_resched + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * FULL: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + */ + +enum { + preempt_dynamic_undefined = -1, + preempt_dynamic_none, + preempt_dynamic_voluntary, + preempt_dynamic_full, +}; + +int preempt_dynamic_mode = preempt_dynamic_undefined; + +int sched_dynamic_mode(const char *str) +{ + if (!strcmp(str, "none")) + return preempt_dynamic_none; + + if (!strcmp(str, "voluntary")) + return preempt_dynamic_voluntary; + + if (!strcmp(str, "full")) + return preempt_dynamic_full; + + return -EINVAL; +} + +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) +#else +#error "Unsupported PREEMPT_DYNAMIC mechanism" +#endif + +void sched_dynamic_update(int mode) +{ + /* + * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in + * the ZERO state, which is invalid. + */ + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + + switch (mode) { + case preempt_dynamic_none: + preempt_dynamic_enable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: none\n"); + break; + + case preempt_dynamic_voluntary: + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: voluntary\n"); + break; + + case preempt_dynamic_full: + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: full\n"); + break; + } + + preempt_dynamic_mode = mode; +} + +static int __init setup_preempt_mode(char *str) +{ + int mode = sched_dynamic_mode(str); + if (mode < 0) { + pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); + return 0; + } + + sched_dynamic_update(mode); + return 1; +} +__setup("preempt=", setup_preempt_mode); + +static void __init preempt_dynamic_init(void) +{ + if (preempt_dynamic_mode == preempt_dynamic_undefined) { + if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { + sched_dynamic_update(preempt_dynamic_none); + } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { + sched_dynamic_update(preempt_dynamic_voluntary); + } else { + /* Default static call setting, nothing to do */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); + preempt_dynamic_mode = preempt_dynamic_full; + pr_info("Dynamic Preempt: full\n"); + } + } +} + +#define PREEMPT_MODEL_ACCESSOR(mode) \ + bool preempt_model_##mode(void) \ + { \ + WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ + return preempt_dynamic_mode == preempt_dynamic_##mode; \ + } \ + EXPORT_SYMBOL_GPL(preempt_model_##mode) + +PREEMPT_MODEL_ACCESSOR(none); +PREEMPT_MODEL_ACCESSOR(voluntary); +PREEMPT_MODEL_ACCESSOR(full); + +#else /* !CONFIG_PREEMPT_DYNAMIC */ + +static inline void preempt_dynamic_init(void) { } + +#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ + /** * yield - yield the current processor to other threads. * @@ -5661,7 +8588,7 @@ EXPORT_SYMBOL(__cond_resched_lock); * * The scheduler is at all times free to pick the calling task as the most * eligible task to run, if removing the yield() call from your code breaks - * it, its already broken. + * it, it's already broken. * * Typical broken usage is: * @@ -5731,10 +8658,10 @@ again: if (curr->sched_class != p->sched_class) goto out_unlock; - if (task_running(p_rq, p) || p->state) + if (task_on_cpu(p_rq, p) || !task_is_running(p)) goto out_unlock; - yielded = curr->sched_class->yield_to_task(rq, p, preempt); + yielded = curr->sched_class->yield_to_task(rq, p); if (yielded) { schedstat_inc(rq->yld_count); /* @@ -5762,8 +8689,7 @@ int io_schedule_prepare(void) int old_iowait = current->in_iowait; current->in_iowait = 1; - blk_schedule_flush_plug(current); - + blk_flush_plug(current->plug, true); return old_iowait; } @@ -5814,7 +8740,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) switch (policy) { case SCHED_FIFO: case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; + ret = MAX_RT_PRIO-1; break; case SCHED_DEADLINE: case SCHED_NORMAL: @@ -5932,10 +8858,10 @@ void sched_show_task(struct task_struct *p) if (!try_get_task_stack(p)) return; - printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); - if (p->state == TASK_RUNNING) - printk(KERN_CONT " running task "); + if (task_is_running(p)) + pr_cont(" running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif @@ -5944,12 +8870,13 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), ppid, - (unsigned long)task_thread_info(p)->flags); + pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", + free, task_pid_nr(p), ppid, + read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); - show_stack(p, NULL); + print_stop_info(KERN_INFO, p); + show_stack(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); @@ -5957,36 +8884,31 @@ EXPORT_SYMBOL_GPL(sched_show_task); static inline bool state_filter_match(unsigned long state_filter, struct task_struct *p) { + unsigned int state = READ_ONCE(p->__state); + /* no filter, everything matches */ if (!state_filter) return true; /* filter, but doesn't match */ - if (!(p->state & state_filter)) + if (!(state & state_filter)) return false; /* * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows * TASK_KILLABLE). */ - if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) + if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD)) return false; return true; } -void show_state_filter(unsigned long state_filter) +void show_state_filter(unsigned int state_filter) { struct task_struct *g, *p; -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif rcu_read_lock(); for_each_process_thread(g, p) { /* @@ -6022,7 +8944,7 @@ void show_state_filter(unsigned long state_filter) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void init_idle(struct task_struct *idle, int cpu) +void __init init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -6030,22 +8952,25 @@ void init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); raw_spin_lock_irqsave(&idle->pi_lock, flags); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); - idle->state = TASK_RUNNING; + idle->__state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - idle->flags |= PF_IDLE; - - kasan_unpoison_task_stack(idle); + /* + * PF_KTHREAD should already be set at this point; regardless, make it + * look like a proper per-CPU kthread. + */ + idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; + kthread_set_per_cpu(idle, cpu); #ifdef CONFIG_SMP /* - * Its possible that init_idle() gets called multiple times on a task, + * It's possible that init_idle() gets called multiple times on a task, * in that case do_set_cpus_allowed() will not do the right thing. * * And since this is boot we can forgo the serialization. */ - set_cpus_allowed_common(idle, cpumask_of(cpu)); + set_cpus_allowed_common(idle, cpumask_of(cpu), 0); #endif /* * We're having a chicken and egg problem, even though we are @@ -6067,7 +8992,7 @@ void init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP idle->on_cpu = 1; #endif - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -6091,7 +9016,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, { int ret = 1; - if (!cpumask_weight(cur)) + if (cpumask_empty(cur)) return ret; ret = dl_cpuset_cpumask_can_shrink(cur, trial); @@ -6100,7 +9025,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, } int task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed) + const struct cpumask *cs_effective_cpus) { int ret = 0; @@ -6119,8 +9044,13 @@ int task_can_attach(struct task_struct *p, } if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) - ret = dl_task_can_attach(p, cs_cpus_allowed); + cs_effective_cpus)) { + int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus); + + if (unlikely(cpu >= nr_cpu_ids)) + return -EINVAL; + ret = dl_cpu_busy(cpu, p); + } out: return ret; @@ -6186,128 +9116,154 @@ void idle_task_exit(void) struct mm_struct *mm = current->active_mm; BUG_ON(cpu_online(smp_processor_id())); + BUG_ON(current != this_rq()->idle); if (mm != &init_mm) { switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; finish_arch_post_lock_switch(); } - mmdrop(mm); -} -/* - * Since this CPU is going 'away' for a while, fold any nr_active delta - * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. We need to take the teardown thread which - * is calling this into account, so we hand in adjust = 1 to the load - * calculation. - * - * Also see the comment "Global load-average calculations". - */ -static void calc_load_migrate(struct rq *rq) -{ - long delta = calc_load_fold_active(rq, 1); - if (delta) - atomic_long_add(delta, &calc_load_tasks); + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } -static struct task_struct *__pick_migrate_task(struct rq *rq) +static int __balance_push_cpu_stop(void *arg) { - const struct sched_class *class; - struct task_struct *next; + struct task_struct *p = arg; + struct rq *rq = this_rq(); + struct rq_flags rf; + int cpu; - for_each_class(class) { - next = class->pick_next_task(rq); - if (next) { - next->sched_class->put_prev_task(rq, next); - return next; - } + raw_spin_lock_irq(&p->pi_lock); + rq_lock(rq, &rf); + + update_rq_clock(rq); + + if (task_rq(p) == rq && task_on_rq_queued(p)) { + cpu = select_fallback_rq(rq->cpu, p); + rq = __migrate_task(rq, &rf, p, cpu); } - /* The idle class should always have a runnable task */ - BUG(); + rq_unlock(rq, &rf); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + return 0; } +static DEFINE_PER_CPU(struct cpu_stop_work, push_work); + /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Ensure we only run per-cpu kthreads once the CPU goes !active. * - * Called with rq->lock held even though we'er in stop_machine() and - * there's no concurrency possible, we hold the required locks anyway - * because of lock validation efforts. + * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only + * effective when the hotplug motion is down. */ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) +static void balance_push(struct rq *rq) { - struct rq *rq = dead_rq; - struct task_struct *next, *stop = rq->stop; - struct rq_flags orf = *rf; - int dest_cpu; + struct task_struct *push_task = rq->curr; + + lockdep_assert_rq_held(rq); /* - * Fudge the rq selection such that the below task selection loop - * doesn't get stuck on the currently eligible stop task. - * - * We're currently inside stop_machine() and the rq is either stuck - * in the stop_machine_cpu_stop() loop, or we're executing this code, - * either way we should never end up calling schedule() until we're - * done here. + * Ensure the thing is persistent until balance_push_set(.on = false); */ - rq->stop = NULL; + rq->balance_callback = &balance_push_callback; /* - * put_prev_task() and pick_next_task() sched - * class method both need to have an up-to-date - * value of rq->clock[_task] + * Only active while going offline and when invoked on the outgoing + * CPU. */ - update_rq_clock(rq); - - for (;;) { - /* - * There's this thread running, bail when that's the only - * remaining thread: - */ - if (rq->nr_running == 1) - break; + if (!cpu_dying(rq->cpu) || rq != this_rq()) + return; - next = __pick_migrate_task(rq); + /* + * Both the cpu-hotplug and stop task are in this case and are + * required to complete the hotplug process. + */ + if (kthread_is_per_cpu(push_task) || + is_migration_disabled(push_task)) { /* - * Rules for changing task_struct::cpus_mask are holding - * both pi_lock and rq->lock, such that holding either - * stabilizes the mask. + * If this is the idle task on the outgoing CPU try to wake + * up the hotplug control thread which might wait for the + * last task to vanish. The rcuwait_active() check is + * accurate here because the waiter is pinned on this CPU + * and can't obviously be running in parallel. * - * Drop rq->lock is not quite as disastrous as it usually is - * because !cpu_active at this point, which means load-balance - * will not interfere. Also, stop-machine. + * On RT kernels this also has to check whether there are + * pinned and scheduled out tasks on the runqueue. They + * need to leave the migrate disabled section first. */ - rq_unlock(rq, rf); - raw_spin_lock(&next->pi_lock); - rq_relock(rq, rf); - - /* - * Since we're inside stop-machine, _nothing_ should have - * changed the task, WARN if weird stuff happened, because in - * that case the above rq->lock drop is a fail too. - */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { - raw_spin_unlock(&next->pi_lock); - continue; + if (!rq->nr_running && !rq_has_pinned_tasks(rq) && + rcuwait_active(&rq->hotplug_wait)) { + raw_spin_rq_unlock(rq); + rcuwait_wake_up(&rq->hotplug_wait); + raw_spin_rq_lock(rq); } + return; + } - /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); - rq = __migrate_task(rq, rf, next, dest_cpu); - if (rq != dead_rq) { - rq_unlock(rq, rf); - rq = dead_rq; - *rf = orf; - rq_relock(rq, rf); - } - raw_spin_unlock(&next->pi_lock); + get_task_struct(push_task); + /* + * Temporarily drop rq->lock such that we can wake-up the stop task. + * Both preemption and IRQs are still disabled. + */ + raw_spin_rq_unlock(rq); + stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, + this_cpu_ptr(&push_work)); + /* + * At this point need_resched() is true and we'll take the loop in + * schedule(). The next pick is obviously going to be the stop task + * which kthread_is_per_cpu() and will push this task away. + */ + raw_spin_rq_lock(rq); +} + +static void balance_push_set(int cpu, bool on) +{ + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (on) { + WARN_ON_ONCE(rq->balance_callback); + rq->balance_callback = &balance_push_callback; + } else if (rq->balance_callback == &balance_push_callback) { + rq->balance_callback = NULL; } + rq_unlock_irqrestore(rq, &rf); +} + +/* + * Invoked from a CPUs hotplug control thread after the CPU has been marked + * inactive. All tasks which are not per CPU kernel threads are either + * pushed off this CPU now via balance_push() or placed on a different CPU + * during wakeup. Wait until the CPU is quiescent. + */ +static void balance_hotplug_wait(void) +{ + struct rq *rq = this_rq(); - rq->stop = stop; + rcuwait_wait_event(&rq->hotplug_wait, + rq->nr_running == 1 && !rq_has_pinned_tasks(rq), + TASK_UNINTERRUPTIBLE); } + +#else + +static inline void balance_push(struct rq *rq) +{ +} + +static inline void balance_push_set(int cpu, bool on) +{ +} + +static inline void balance_hotplug_wait(void) +{ +} + #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) @@ -6378,8 +9334,10 @@ static void cpuset_cpu_active(void) static int cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - if (dl_cpu_busy(cpu)) - return -EBUSY; + int ret = dl_cpu_busy(cpu, NULL); + + if (ret) + return ret; cpuset_update_active_cpus(); } else { num_cpus_frozen++; @@ -6393,6 +9351,12 @@ int sched_cpu_activate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; + /* + * Clear the balance_push callback and prepare to schedule + * regular tasks. + */ + balance_push_set(cpu, false); + #ifdef CONFIG_SCHED_SMT /* * When going up, increment the number of cores with SMT present. @@ -6403,6 +9367,7 @@ int sched_cpu_activate(unsigned int cpu) set_cpu_active(cpu, true); if (sched_smp_initialized) { + sched_update_numa(cpu, true); sched_domains_numa_masks_set(cpu); cpuset_cpu_active(); } @@ -6428,32 +9393,65 @@ int sched_cpu_activate(unsigned int cpu) int sched_cpu_deactivate(unsigned int cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; int ret; + /* + * Remove CPU from nohz.idle_cpus_mask to prevent participating in + * load balancing when not active + */ + nohz_balance_exit_idle(rq); + set_cpu_active(cpu, false); + /* - * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU - * users of this state to go away such that all new such users will - * observe it. + * From this point forward, this CPU will refuse to run any task that + * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively + * push those tasks away until this gets cleared, see + * sched_cpu_dying(). + */ + balance_push_set(cpu, true); + + /* + * We've cleared cpu_active_mask / set balance_push, wait for all + * preempt-disabled and RCU users of this state to go away such that + * all new such users will observe it. + * + * Specifically, we rely on ttwu to no longer target this CPU, see + * ttwu_queue_cond() and is_cpu_allowed(). * * Do sync before park smpboot threads to take care the rcu boost case. */ synchronize_rcu(); + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + update_rq_clock(rq); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + rq_unlock_irqrestore(rq, &rf); + #ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. */ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_present); + + sched_core_cpu_deactivate(cpu); #endif if (!sched_smp_initialized) return 0; + sched_update_numa(cpu, false); ret = cpuset_cpu_inactive(cpu); if (ret) { + balance_push_set(cpu, false); set_cpu_active(cpu, true); + sched_update_numa(cpu, true); return ret; } sched_domains_numa_masks_clear(cpu); @@ -6470,41 +9468,93 @@ static void sched_rq_cpu_starting(unsigned int cpu) int sched_cpu_starting(unsigned int cpu) { + sched_core_cpu_starting(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); return 0; } #ifdef CONFIG_HOTPLUG_CPU + +/* + * Invoked immediately before the stopper thread is invoked to bring the + * CPU down completely. At this point all per CPU kthreads except the + * hotplug thread (current) and the stopper thread (inactive) have been + * either parked or have been unbound from the outgoing CPU. Ensure that + * any of those which might be on the way out are gone. + * + * If after this point a bound task is being woken on this CPU then the + * responsible hotplug callback has failed to do it's job. + * sched_cpu_dying() will catch it with the appropriate fireworks. + */ +int sched_cpu_wait_empty(unsigned int cpu) +{ + balance_hotplug_wait(); + return 0; +} + +/* + * Since this CPU is going 'away' for a while, fold any nr_active delta we + * might have. Called from the CPU stopper task after ensuring that the + * stopper is the last running task on the CPU, so nr_active count is + * stable. We need to take the teardown thread which is calling this into + * account, so we hand in adjust = 1 to the load calculation. + * + * Also see the comment "Global load-average calculations". + */ +static void calc_load_migrate(struct rq *rq) +{ + long delta = calc_load_fold_active(rq, 1); + + if (delta) + atomic_long_add(delta, &calc_load_tasks); +} + +static void dump_rq_tasks(struct rq *rq, const char *loglvl) +{ + struct task_struct *g, *p; + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); + for_each_process_thread(g, p) { + if (task_cpu(p) != cpu) + continue; + + if (!task_on_rq_queued(p)) + continue; + + printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm); + } +} + int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ - sched_ttwu_pending(); sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); + if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { + WARN(true, "Dying CPU not properly vacated!"); + dump_rq_tasks(rq, KERN_WARNING); } - migrate_tasks(rq, &rf); - BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); update_max_interval(); - nohz_balance_exit_idle(rq); hrtick_clear(rq); + sched_core_cpu_dying(cpu); return 0; } #endif void __init sched_init_smp(void) { - sched_init_numa(); + sched_init_numa(NUMA_NO_NODE); /* * There's no userspace yet to cause hotplug operations; hence all the @@ -6516,8 +9566,9 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) BUG(); + current->flags &= ~PF_NO_SETAFFINITY; sched_init_granularity(); init_sched_rt_class(); @@ -6559,14 +9610,19 @@ LIST_HEAD(task_groups); static struct kmem_cache *task_group_cache __read_mostly; #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); -DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); - void __init sched_init(void) { unsigned long ptr = 0; int i; + /* Make sure the linker didn't screw up */ + BUG_ON(&idle_sched_class != &fair_sched_class + 1 || + &fair_sched_class != &rt_sched_class + 1 || + &rt_sched_class != &dl_sched_class + 1); +#ifdef CONFIG_SMP + BUG_ON(&dl_sched_class != &stop_sched_class + 1); +#endif + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6585,6 +9641,8 @@ void __init sched_init(void) root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -6595,17 +9653,8 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } -#ifdef CONFIG_CPUMASK_OFFSTACK - for_each_possible_cpu(i) { - per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - } -#endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); - init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); #ifdef CONFIG_SMP init_defrootdomain(); @@ -6629,7 +9678,7 @@ void __init sched_init(void) struct rq *rq; rq = cpu_rq(i); - raw_spin_lock_init(&rq->lock); + raw_spin_lock_init(&rq->__lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; @@ -6637,7 +9686,6 @@ void __init sched_init(void) init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* @@ -6659,7 +9707,6 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -6671,7 +9718,7 @@ void __init sched_init(void) rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; - rq->balance_callback = NULL; + rq->balance_callback = &balance_push_callback; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -6679,19 +9726,37 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + rq->wake_stamp = jiffies; + rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON - rq->last_load_update_tick = jiffies; rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); + + INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); +#endif +#ifdef CONFIG_HOTPLUG_CPU + rcuwait_init(&rq->hotplug_wait); #endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); + +#ifdef CONFIG_SCHED_CORE + rq->core = rq; + rq->core_pick = NULL; + rq->core_enabled = 0; + rq->core_tree = RB_ROOT; + rq->core_forceidle_count = 0; + rq->core_forceidle_occupation = 0; + rq->core_forceidle_start = 0; + + rq->core_cookie = 0UL; +#endif } set_load_weight(&init_task, false); @@ -6703,6 +9768,14 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current); /* + * The idle task doesn't need the kthread struct to function, but it + * is dressed up as a per-CPU kthread and thus needs to play the part + * if we want to avoid special-casing it in code that deals with per-CPU + * kthreads. + */ + WARN_ON(!set_kthread_struct(current)); + + /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again @@ -6714,45 +9787,61 @@ void __init sched_init(void) #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); + balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); - init_schedstats(); - psi_init(); init_uclamp(); + preempt_dynamic_init(); + scheduler_running = 1; } #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -static inline int preempt_count_equals(int preempt_offset) -{ - int nested = preempt_count() + rcu_preempt_depth(); - - return (nested == preempt_offset); -} -void __might_sleep(const char *file, int line, int preempt_offset) +void __might_sleep(const char *file, int line) { + unsigned int state = get_current_state(); /* * Blocking primitives will set (and therefore destroy) current->state, * since we will exit with TASK_RUNNING make sure we enter with it, * otherwise we will destroy state. */ - WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, + WARN_ONCE(state != TASK_RUNNING && current->task_state_change, "do not call blocking ops when !TASK_RUNNING; " - "state=%lx set at [<%p>] %pS\n", - current->state, + "state=%x set at [<%p>] %pS\n", state, (void *)current->task_state_change, (void *)current->task_state_change); - ___might_sleep(file, line, preempt_offset); + __might_resched(file, line, 0); } EXPORT_SYMBOL(__might_sleep); -void ___might_sleep(const char *file, int line, int preempt_offset) +static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) +{ + if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) + return; + + if (preempt_count() == preempt_offset) + return; + + pr_err("Preemption disabled at:"); + print_ip_sym(KERN_ERR, ip); +} + +static inline bool resched_offsets_ok(unsigned int offsets) +{ + unsigned int nested = preempt_count(); + + nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; + + return nested == offsets; +} + +void __might_resched(const char *file, int line, unsigned int offsets) { /* Ratelimiting timestamp: */ static unsigned long prev_jiffy; @@ -6762,7 +9851,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) /* WARN_ON_ONCE() by default, no rate limit required: */ rcu_sleep_check(); - if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + if ((resched_offsets_ok(offsets) && !irqs_disabled() && !is_idle_task(current) && !current->non_block_count) || system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || oops_in_progress) @@ -6775,30 +9864,33 @@ void ___might_sleep(const char *file, int line, int preempt_offset) /* Save this before calling printk(), since that will clobber it: */ preempt_disable_ip = get_preempt_disable_ip(current); - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), current->non_block_count, - current->pid, current->comm); + pr_err("BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->non_block_count, + current->pid, current->comm); + pr_err("preempt_count: %x, expected: %x\n", preempt_count(), + offsets & MIGHT_RESCHED_PREEMPT_MASK); + + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { + pr_err("RCU nest depth: %d, expected: %u\n", + rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); + } if (task_stack_end_corrupted(current)) - printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + pr_emerg("Thread overran stack, or stack corrupted\n"); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); - if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) - && !preempt_count_equals(preempt_offset)) { - pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); - } + + print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, + preempt_disable_ip); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } -EXPORT_SYMBOL(___might_sleep); +EXPORT_SYMBOL(__might_resched); void __cant_sleep(const char *file, int line, int preempt_offset) { @@ -6827,6 +9919,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_sleep); + +#ifdef CONFIG_SMP +void __cant_migrate(const char *file, int line) +{ + static unsigned long prev_jiffy; + + if (irqs_disabled()) + return; + + if (is_migration_disabled(current)) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + if (preempt_count() > 0) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), is_migration_disabled(current), + current->pid, current->comm); + + debug_show_held_locks(current); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_migrate); +#endif #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -6846,9 +9971,9 @@ void normalize_rt_tasks(void) continue; p->se.exec_start = 0; - schedstat_set(p->se.statistics.wait_start, 0); - schedstat_set(p->se.statistics.sleep_start, 0); - schedstat_set(p->se.statistics.block_start, 0); + schedstat_set(p->stats.wait_start, 0); + schedstat_set(p->stats.sleep_start, 0); + schedstat_set(p->stats.block_start, 0); if (!dl_task(p) && !rt_task(p)) { /* @@ -6942,6 +10067,22 @@ static void sched_free_group(struct task_group *tg) kmem_cache_free(task_group_cache, tg); } +static void sched_free_group_rcu(struct rcu_head *rcu) +{ + sched_free_group(container_of(rcu, struct task_group, rcu)); +} + +static void sched_unregister_group(struct task_group *tg) +{ + unregister_fair_sched_group(tg); + unregister_rt_sched_group(tg); + /* + * We have to wait for yet another RCU grace period to expire, as + * print_cfs_stats() might run concurrently. + */ + call_rcu(&tg->rcu, sched_free_group_rcu); +} + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(struct task_group *parent) { @@ -6985,32 +10126,42 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) } /* rcu callback to free various structures associated with a task group */ -static void sched_free_group_rcu(struct rcu_head *rhp) +static void sched_unregister_group_rcu(struct rcu_head *rhp) { /* Now it should be safe to free those cfs_rqs: */ - sched_free_group(container_of(rhp, struct task_group, rcu)); + sched_unregister_group(container_of(rhp, struct task_group, rcu)); } void sched_destroy_group(struct task_group *tg) { /* Wait for possible concurrent references to cfs_rqs complete: */ - call_rcu(&tg->rcu, sched_free_group_rcu); + call_rcu(&tg->rcu, sched_unregister_group_rcu); } -void sched_offline_group(struct task_group *tg) +void sched_release_group(struct task_group *tg) { unsigned long flags; - /* End participation in shares distribution: */ - unregister_fair_sched_group(tg); - + /* + * Unlink first, to avoid walk_tg_tree_from() from finding us (via + * sched_cfs_period_timer()). + * + * For this to be effective, we have to wait for all pending users of + * this task group to leave their RCU critical section to ensure no new + * user will see our dying task group any more. Specifically ensure + * that tg_unthrottle_up() won't add decayed cfs_rq's to it. + * + * We therefore defer calling unregister_fair_sched_group() to + * sched_unregister_group() which is guarantied to get called only after the + * current RCU grace period has expired. + */ spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); } -static void sched_change_group(struct task_struct *tsk, int type) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -7026,7 +10177,7 @@ static void sched_change_group(struct task_struct *tsk, int type) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) - tsk->sched_class->task_change_group(tsk, type); + tsk->sched_class->task_change_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); @@ -7057,7 +10208,7 @@ void sched_move_task(struct task_struct *tsk) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, TASK_MOVE_GROUP); + sched_change_group(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -7108,7 +10259,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new group */ + mutex_lock(&uclamp_mutex); + rcu_read_lock(); cpu_util_update_eff(css); + rcu_read_unlock(); + mutex_unlock(&uclamp_mutex); #endif return 0; @@ -7118,7 +10273,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_offline_group(tg); + sched_release_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) @@ -7128,56 +10283,22 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) /* * Relies on the RCU grace period between css_released() and this. */ - sched_free_group(tg); -} - -/* - * This is called before wake_up_new_task(), therefore we really only - * have to set its group bits, all the other stuff does not apply. - */ -static void cpu_cgroup_fork(struct task_struct *task) -{ - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(task, &rf); - - update_rq_clock(rq); - sched_change_group(task, TASK_SET_GROUP); - - task_rq_unlock(rq, task, &rf); + sched_unregister_group(tg); } +#ifdef CONFIG_RT_GROUP_SCHED static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; - int ret = 0; cgroup_taskset_for_each(task, css, tset) { -#ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; -#endif - /* - * Serialize against wake_up_new_task() such that if its - * running, we're sure to observe its full state. - */ - raw_spin_lock_irq(&task->pi_lock); - /* - * Avoid calling sched_move_task() before wake_up_new_task() - * has happened. This would lead to problems with PELT, due to - * move wanting to detach+attach while we're not attached yet. - */ - if (task->state == TASK_NEW) - ret = -EINVAL; - raw_spin_unlock_irq(&task->pi_lock); - - if (ret) - break; } - return ret; + return 0; } +#endif static void cpu_cgroup_attach(struct cgroup_taskset *tset) { @@ -7198,6 +10319,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) enum uclamp_id clamp_id; unsigned int clamps; + lockdep_assert_held(&uclamp_mutex); + SCHED_WARN_ON(!rcu_read_lock_held()); + css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL; @@ -7230,7 +10354,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) } /* Immediately update descendants RUNNABLE tasks */ - uclamp_update_active_tasks(css, clamps); + uclamp_update_active_tasks(css); } } @@ -7288,6 +10412,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; + static_branch_enable(&sched_uclamp_used); + mutex_lock(&uclamp_mutex); rcu_read_lock(); @@ -7382,10 +10508,13 @@ static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); -static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, + u64 burst) { int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; @@ -7402,7 +10531,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) return -EINVAL; /* - * Likewise, bound things on the otherside by preventing insane quota + * Likewise, bound things on the other side by preventing insane quota * periods. This also allows us to normalize in computing quota * feasibility. */ @@ -7410,10 +10539,20 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) return -EINVAL; /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (quota != RUNTIME_INF && quota > max_cfs_runtime) + return -EINVAL; + + if (quota != RUNTIME_INF && (burst > quota || + burst + quota > max_cfs_runtime)) + return -EINVAL; + + /* * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cfs_constraints_mutex); ret = __cfs_schedulable(tg, period, quota); if (ret) @@ -7430,6 +10569,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; + cfs_b->burst = burst; __refill_cfs_bandwidth_runtime(cfs_b); @@ -7456,16 +10596,17 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); - put_online_cpus(); + cpus_read_unlock(); return ret; } static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { - u64 quota, period; + u64 quota, period, burst; period = ktime_to_ns(tg->cfs_bandwidth.period); + burst = tg->cfs_bandwidth.burst; if (cfs_quota_us < 0) quota = RUNTIME_INF; else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) @@ -7473,7 +10614,7 @@ static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) else return -EINVAL; - return tg_set_cfs_bandwidth(tg, period, quota); + return tg_set_cfs_bandwidth(tg, period, quota, burst); } static long tg_get_cfs_quota(struct task_group *tg) @@ -7491,15 +10632,16 @@ static long tg_get_cfs_quota(struct task_group *tg) static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { - u64 quota, period; + u64 quota, period, burst; if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) return -EINVAL; period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; + burst = tg->cfs_bandwidth.burst; - return tg_set_cfs_bandwidth(tg, period, quota); + return tg_set_cfs_bandwidth(tg, period, quota, burst); } static long tg_get_cfs_period(struct task_group *tg) @@ -7512,6 +10654,30 @@ static long tg_get_cfs_period(struct task_group *tg) return cfs_period_us; } +static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) +{ + u64 quota, period, burst; + + if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + + burst = (u64)cfs_burst_us * NSEC_PER_USEC; + period = ktime_to_ns(tg->cfs_bandwidth.period); + quota = tg->cfs_bandwidth.quota; + + return tg_set_cfs_bandwidth(tg, period, quota, burst); +} + +static long tg_get_cfs_burst(struct task_group *tg) +{ + u64 burst_us; + + burst_us = tg->cfs_bandwidth.burst; + do_div(burst_us, NSEC_PER_USEC); + + return burst_us; +} + static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -7536,6 +10702,18 @@ static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, return tg_set_cfs_period(css_tg(css), cfs_period_us); } +static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_cfs_burst(css_tg(css)); +} + +static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_burst_us) +{ + return tg_set_cfs_burst(css_tg(css), cfs_burst_us); +} + struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -7629,15 +10807,21 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); if (schedstat_enabled() && tg != &root_task_group) { + struct sched_statistics *stats; u64 ws = 0; int i; - for_each_possible_cpu(i) - ws += schedstat_val(tg->se[i]->statistics.wait_sum); + for_each_possible_cpu(i) { + stats = __schedstats_from_se(tg->se[i]); + ws += schedstat_val(stats->wait_sum); + } seq_printf(sf, "wait_sum %llu\n", ws); } + seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst); + seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time); + return 0; } #endif /* CONFIG_CFS_BANDWIDTH */ @@ -7669,6 +10853,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->idle; +} + +static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 idle) +{ + return sched_group_set_idle(css_tg(css), idle); +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -7676,6 +10874,11 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, + { + .name = "idle", + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -7689,6 +10892,11 @@ static struct cftype cpu_legacy_files[] = { .write_u64 = cpu_cfs_period_write_u64, }, { + .name = "cfs_burst_us", + .read_u64 = cpu_cfs_burst_read_u64, + .write_u64 = cpu_cfs_burst_write_u64, + }, + { .name = "stat", .seq_show = cpu_cfs_stat_show, }, @@ -7729,16 +10937,20 @@ static int cpu_extra_stat_show(struct seq_file *sf, { struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - u64 throttled_usec; + u64 throttled_usec, burst_usec; throttled_usec = cfs_b->throttled_time; do_div(throttled_usec, NSEC_PER_USEC); + burst_usec = cfs_b->burst_time; + do_div(burst_usec, NSEC_PER_USEC); seq_printf(sf, "nr_periods %d\n" "nr_throttled %d\n" - "throttled_usec %llu\n", + "throttled_usec %llu\n" + "nr_bursts %d\n" + "burst_usec %llu\n", cfs_b->nr_periods, cfs_b->nr_throttled, - throttled_usec); + throttled_usec, cfs_b->nr_burst, burst_usec); } #endif return 0; @@ -7853,12 +11065,13 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, { struct task_group *tg = css_tg(of_css(of)); u64 period = tg_get_cfs_period(tg); + u64 burst = tg_get_cfs_burst(tg); u64 quota; int ret; ret = cpu_period_quota_parse(buf, &period, "a); if (!ret) - ret = tg_set_cfs_bandwidth(tg, period, quota); + ret = tg_set_cfs_bandwidth(tg, period, quota, burst); return ret ?: nbytes; } #endif @@ -7877,6 +11090,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, + { + .name = "idle", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -7885,6 +11104,12 @@ static struct cftype cpu_files[] = { .seq_show = cpu_max_show, .write = cpu_max_write, }, + { + .name = "max.burst", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_cfs_burst_read_u64, + .write_u64 = cpu_cfs_burst_write_u64, + }, #endif #ifdef CONFIG_UCLAMP_TASK_GROUP { @@ -7909,8 +11134,9 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, - .fork = cpu_cgroup_fork, +#ifdef CONFIG_RT_GROUP_SCHED .can_attach = cpu_cgroup_can_attach, +#endif .attach = cpu_cgroup_attach, .legacy_cftypes = cpu_legacy_files, .dfl_cftypes = cpu_files, @@ -7922,6 +11148,19 @@ struct cgroup_subsys cpu_cgrp_subsys = { void dump_cpu_task(int cpu) { + if (cpu == smp_processor_id() && in_hardirq()) { + struct pt_regs *regs; + + regs = get_irq_regs(); + if (regs) { + show_regs(regs); + return; + } + } + + if (trigger_single_cpu_backtrace(cpu)) + return; + pr_info("Task dump for CPU %d:\n", cpu); sched_show_task(cpu_curr(cpu)); } @@ -7967,4 +11206,7 @@ const u32 sched_prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#undef CREATE_TRACE_POINTS +void call_trace_sched_update_nr_running(struct rq *rq, int count) +{ + trace_sched_update_nr_running_tp(rq, count); +} diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c new file mode 100644 index 000000000000..a57fd8f27498 --- /dev/null +++ b/kernel/sched/core_sched.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * A simple wrapper around refcount. An allocated sched_core_cookie's + * address is used to compute the cookie of the task. + */ +struct sched_core_cookie { + refcount_t refcnt; +}; + +static unsigned long sched_core_alloc_cookie(void) +{ + struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL); + if (!ck) + return 0; + + refcount_set(&ck->refcnt, 1); + sched_core_get(); + + return (unsigned long)ck; +} + +static void sched_core_put_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr && refcount_dec_and_test(&ptr->refcnt)) { + kfree(ptr); + sched_core_put(); + } +} + +static unsigned long sched_core_get_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr) + refcount_inc(&ptr->refcnt); + + return cookie; +} + +/* + * sched_core_update_cookie - replace the cookie on a task + * @p: the task to update + * @cookie: the new cookie + * + * Effectively exchange the task cookie; caller is responsible for lifetimes on + * both ends. + * + * Returns: the old cookie + */ +static unsigned long sched_core_update_cookie(struct task_struct *p, + unsigned long cookie) +{ + unsigned long old_cookie; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + + /* + * Since creating a cookie implies sched_core_get(), and we cannot set + * a cookie until after we've created it, similarly, we cannot destroy + * a cookie until after we've removed it, we must have core scheduling + * enabled here. + */ + SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); + + if (sched_core_enqueued(p)) + sched_core_dequeue(rq, p, DEQUEUE_SAVE); + + old_cookie = p->core_cookie; + p->core_cookie = cookie; + + /* + * Consider the cases: !prev_cookie and !cookie. + */ + if (cookie && task_on_rq_queued(p)) + sched_core_enqueue(rq, p); + + /* + * If task is currently running, it may not be compatible anymore after + * the cookie change, so enter the scheduler on its CPU to schedule it + * away. + * + * Note that it is possible that as a result of this cookie change, the + * core has now entered/left forced idle state. Defer accounting to the + * next scheduling edge, rather than always forcing a reschedule here. + */ + if (task_on_cpu(rq, p)) + resched_curr(rq); + + task_rq_unlock(rq, p, &rf); + + return old_cookie; +} + +static unsigned long sched_core_clone_cookie(struct task_struct *p) +{ + unsigned long cookie, flags; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + cookie = sched_core_get_cookie(p->core_cookie); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return cookie; +} + +void sched_core_fork(struct task_struct *p) +{ + RB_CLEAR_NODE(&p->core_node); + p->core_cookie = sched_core_clone_cookie(current); +} + +void sched_core_free(struct task_struct *p) +{ + sched_core_put_cookie(p->core_cookie); +} + +static void __sched_core_set(struct task_struct *p, unsigned long cookie) +{ + cookie = sched_core_get_cookie(cookie); + cookie = sched_core_update_cookie(p, cookie); + sched_core_put_cookie(cookie); +} + +/* Called from prctl interface: PR_SCHED_CORE */ +int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr) +{ + unsigned long cookie = 0, id = 0; + struct task_struct *task, *p; + struct pid *grp; + int err = 0; + + if (!static_branch_likely(&sched_smt_present)) + return -ENODEV; + + BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID); + BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD_GROUP != PIDTYPE_TGID); + BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_PROCESS_GROUP != PIDTYPE_PGID); + + if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 || + (cmd != PR_SCHED_CORE_GET && uaddr)) + return -EINVAL; + + rcu_read_lock(); + if (pid == 0) { + task = current; + } else { + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + return -ESRCH; + } + } + get_task_struct(task); + rcu_read_unlock(); + + /* + * Check if this process has the right to modify the specified + * process. Use the regular "ptrace_may_access()" checks. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out; + } + + switch (cmd) { + case PR_SCHED_CORE_GET: + if (type != PIDTYPE_PID || uaddr & 7) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + if (cookie) { + /* XXX improve ? */ + ptr_to_hashval((void *)cookie, &id); + } + err = put_user(id, (u64 __user *)uaddr); + goto out; + + case PR_SCHED_CORE_CREATE: + cookie = sched_core_alloc_cookie(); + if (!cookie) { + err = -ENOMEM; + goto out; + } + break; + + case PR_SCHED_CORE_SHARE_TO: + cookie = sched_core_clone_cookie(current); + break; + + case PR_SCHED_CORE_SHARE_FROM: + if (type != PIDTYPE_PID) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + __sched_core_set(current, cookie); + goto out; + + default: + err = -EINVAL; + goto out; + } + + if (type == PIDTYPE_PID) { + __sched_core_set(task, cookie); + goto out; + } + + read_lock(&tasklist_lock); + grp = task_pid_type(task, type); + + do_each_pid_thread(grp, type, p) { + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out_tasklist; + } + } while_each_pid_thread(grp, type, p); + + do_each_pid_thread(grp, type, p) { + __sched_core_set(p, cookie); + } while_each_pid_thread(grp, type, p); +out_tasklist: + read_unlock(&tasklist_lock); + +out: + sched_core_put_cookie(cookie); + put_task_struct(task); + return err; +} + +#ifdef CONFIG_SCHEDSTATS + +/* REQUIRES: rq->core's clock recently updated. */ +void __sched_core_account_forceidle(struct rq *rq) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); + u64 delta, now = rq_clock(rq->core); + struct rq *rq_i; + struct task_struct *p; + int i; + + lockdep_assert_rq_held(rq); + + WARN_ON_ONCE(!rq->core->core_forceidle_count); + + if (rq->core->core_forceidle_start == 0) + return; + + delta = now - rq->core->core_forceidle_start; + if (unlikely((s64)delta <= 0)) + return; + + rq->core->core_forceidle_start = now; + + if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) { + /* can't be forced idle without a running task */ + } else if (rq->core->core_forceidle_count > 1 || + rq->core->core_forceidle_occupation > 1) { + /* + * For larger SMT configurations, we need to scale the charged + * forced idle amount since there can be more than one forced + * idle sibling and more than one running cookied task. + */ + delta *= rq->core->core_forceidle_count; + delta = div_u64(delta, rq->core->core_forceidle_occupation); + } + + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + p = rq_i->core_pick ?: rq_i->curr; + + if (p == rq_i->idle) + continue; + + /* + * Note: this will account forceidle to the current cpu, even + * if it comes from our SMT sibling. + */ + __account_forceidle_time(p, delta); + } +} + +void __sched_core_tick(struct rq *rq) +{ + if (!rq->core->core_forceidle_count) + return; + + if (rq != rq->core) + update_rq_clock(rq->core); + + __sched_core_account_forceidle(rq); +} + +#endif /* CONFIG_SCHEDSTATS */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9fbb10383434..0de9dda09949 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 + /* * CPU accounting code for task groups. * * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ -#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { @@ -20,15 +20,11 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -struct cpuacct_usage { - u64 usages[CPUACCT_STAT_NSTATS]; -}; - /* track CPU usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every CPU */ - struct cpuacct_usage __percpu *cpuusage; + u64 __percpu *cpuusage; struct kernel_cpustat __percpu *cpustat; }; @@ -48,7 +44,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return css_ca(ca->css.parent); } -static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct = { .cpustat = &kernel_cpustat, .cpuusage = &root_cpuacct_cpuusage, @@ -67,7 +63,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) if (!ca) goto out; - ca->cpuusage = alloc_percpu(struct cpuacct_usage); + ca->cpuusage = alloc_percpu(u64); if (!ca->cpuusage) goto out_free_ca; @@ -98,56 +94,66 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, enum cpuacct_stat_index index) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; u64 data; /* * We allow index == CPUACCT_STAT_NSTATS here to read - * the sum of suages. + * the sum of usages. */ - BUG_ON(index > CPUACCT_STAT_NSTATS); + if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS)) + return 0; #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif - if (index == CPUACCT_STAT_NSTATS) { - int i = 0; - - data = 0; - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - data += cpuusage->usages[i]; - } else { - data = cpuusage->usages[index]; + switch (index) { + case CPUACCT_STAT_USER: + data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE]; + break; + case CPUACCT_STAT_SYSTEM: + data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] + + cpustat[CPUTIME_SOFTIRQ]; + break; + case CPUACCT_STAT_NSTATS: + data = *cpuusage; + break; } #ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_unlock_irq(cpu_rq(cpu)); #endif return data; } -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - int i; + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; + + /* Don't allow to reset global kernel_cpustat */ + if (ca == &root_cpuacct) + return; #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif - - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - cpuusage->usages[i] = val; + *cpuusage = 0; + cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0; + cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0; + cpustat[CPUTIME_SOFTIRQ] = 0; #ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_unlock_irq(cpu_rq(cpu)); #endif } @@ -195,7 +201,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, return -EINVAL; for_each_possible_cpu(cpu) - cpuacct_cpuusage_write(ca, cpu, 0); + cpuacct_cpuusage_write(ca, cpu); return 0; } @@ -242,25 +248,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) seq_puts(m, "\n"); for_each_possible_cpu(cpu) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - seq_printf(m, "%d", cpu); - - for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit - * platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); -#endif - - seq_printf(m, " %llu", cpuusage->usages[index]); - -#ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#endif - } + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %llu", + cpuacct_cpuusage_read(ca, cpu, index)); seq_puts(m, "\n"); } return 0; @@ -269,25 +260,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); - s64 val[CPUACCT_STAT_NSTATS]; + struct task_cputime cputime; + u64 val[CPUACCT_STAT_NSTATS]; int cpu; int stat; - memset(val, 0, sizeof(val)); + memset(&cputime, 0, sizeof(cputime)); for_each_possible_cpu(cpu) { u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; - val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; - val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; + cputime.utime += cpustat[CPUTIME_USER]; + cputime.utime += cpustat[CPUTIME_NICE]; + cputime.stime += cpustat[CPUTIME_SYSTEM]; + cputime.stime += cpustat[CPUTIME_IRQ]; + cputime.stime += cpustat[CPUTIME_SOFTIRQ]; + + cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu); } + cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime, + &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]); + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { - seq_printf(sf, "%s %lld\n", - cpuacct_stat_desc[stat], - (long long)nsec_to_clock_t(val[stat])); + seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat], + nsec_to_clock_t(val[stat])); } return 0; @@ -337,19 +333,13 @@ static struct cftype files[] = { */ void cpuacct_charge(struct task_struct *tsk, u64 cputime) { + unsigned int cpu = task_cpu(tsk); struct cpuacct *ca; - int index = CPUACCT_STAT_SYSTEM; - struct pt_regs *regs = task_pt_regs(tsk); - if (regs && user_mode(regs)) - index = CPUACCT_STAT_USER; - - rcu_read_lock(); + lockdep_assert_rq_held(cpu_rq(cpu)); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; - - rcu_read_unlock(); + *per_cpu_ptr(ca->cpuusage, cpu) += cputime; } /* @@ -361,10 +351,8 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { struct cpuacct *ca; - rcu_read_lock(); for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpustat)->cpustat[index] += val; - rcu_read_unlock(); + __this_cpu_add(ca->cpustat->cpustat[index], val); } struct cgroup_subsys cpuacct_cgrp_subsys = { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5cc4012572ec..57c92d751bcd 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * kernel/sched/cpudl.c + * kernel/sched/cpudeadline.c * * Global CPU deadline management * * Author: Juri Lelli <j.lelli@sssup.it> */ -#include "sched.h" static inline int parent(int i) { @@ -120,14 +119,38 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { + cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) { + unsigned long cap, max_cap = 0; + int cpu, max_cpu = -1; + + if (!sched_asym_cpucap_active()) + return 1; + + /* Ensure the capacity of the CPUs fits the task. */ + for_each_cpu(cpu, later_mask) { + if (!dl_task_fits_capacity(p, cpu)) { + cpumask_clear_cpu(cpu, later_mask); + + cap = capacity_orig_of(cpu); + + if (cap > max_cap || + (cpu == task_cpu(p) && cap == max_cap)) { + max_cap = cap; + max_cpu = cpu; + } + } + } + + if (cpumask_empty(later_mask)) + cpumask_set_cpu(max_cpu, later_mask); + return 1; } else { int best_cpu = cpudl_maximum(cp); WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); - if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && + if (cpumask_test_cpu(best_cpu, &p->cpus_mask) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { if (later_mask) cpumask_set_cpu(best_cpu, later_mask); diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 7c2fe50fd76d..5252fb191fae 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,9 +5,6 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ -#include <linux/cpufreq.h> - -#include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7fbaee24c824..9161d1136d01 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -6,13 +6,6 @@ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include "sched.h" - -#include <linux/sched/cpufreq.h> -#include <trace/events/power.h> - #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) struct sugov_tunables { @@ -26,12 +19,15 @@ struct sugov_policy { struct sugov_tunables *tunables; struct list_head tunables_hook; - raw_spinlock_t update_lock; /* For shared policies */ + raw_spinlock_t update_lock; u64 last_freq_update_time; s64 freq_update_delay_ns; unsigned int next_freq; unsigned int cached_raw_freq; + /* max CPU capacity, which is equal for all CPUs in freq. domain */ + unsigned long max; + /* The next fields are only needed if fast switch cannot be used: */ struct irq_work irq_work; struct kthread_work work; @@ -53,8 +49,8 @@ struct sugov_cpu { unsigned int iowait_boost; u64 last_update; + unsigned long util; unsigned long bw_dl; - unsigned long max; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -102,7 +98,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->next_freq == next_freq) + if (sg_policy->need_freq_update) + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + else if (sg_policy->next_freq == next_freq) return false; sg_policy->next_freq = next_freq; @@ -111,33 +109,8 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, return true; } -static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) -{ - struct cpufreq_policy *policy = sg_policy->policy; - int cpu; - - if (!sugov_update_next_freq(sg_policy, time, next_freq)) - return; - - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; - - policy->cur = next_freq; - - if (trace_cpu_frequency_enabled()) { - for_each_cpu(cpu, policy->cpus) - trace_cpu_frequency(next_freq, cpu); - } -} - -static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) +static void sugov_deferred_update(struct sugov_policy *sg_policy) { - if (!sugov_update_next_freq(sg_policy, time, next_freq)) - return; - if (!sg_policy->work_in_progress) { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); @@ -173,132 +146,23 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; + util = map_util_perf(util); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; - sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } -/* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. - * - * The scheduler tracks the following metrics: - * - * cpu_util_{cfs,rt,dl,irq}() - * cpu_bw_dl() - * - * Where the cfs,rt and dl util numbers are tracked with the same metric and - * synchronized windows and are thus directly comparable. - * - * The cfs,rt,dl utilization are the running times measured with rq->clock_task - * which excludes things like IRQ and steal-time. These latter are then accrued - * in the irq utilization. - * - * The DL bandwidth number otoh is not a measured metric but a value computed - * based on the task model parameters and gives the minimal utilization - * required to meet deadlines. - */ -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) -{ - unsigned long dl_util, util, irq; - struct rq *rq = cpu_rq(cpu); - - if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && - type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { - return max; - } - - /* - * Early check to see if IRQ/steal time saturates the CPU, can be - * because of inaccuracies in how we track these -- see - * update_irq_load_avg(). - */ - irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) - return max; - - /* - * Because the time spend on RT/DL tasks is visible as 'lost' time to - * CFS tasks and we use the same metric to track the effective - * utilization (PELT windows are synchronized) we can directly add them - * to obtain the CPU's actual utilization. - * - * CFS and RT utilization can be boosted or capped, depending on - * utilization clamp constraints requested by currently RUNNABLE - * tasks. - * When there are no CFS RUNNABLE tasks, clamps are released and - * frequency will be gracefully reduced with the utilization decay. - */ - util = util_cfs + cpu_util_rt(rq); - if (type == FREQUENCY_UTIL) - util = uclamp_rq_util_with(rq, util, p); - - dl_util = cpu_util_dl(rq); - - /* - * For frequency selection we do not make cpu_util_dl() a permanent part - * of this sum because we want to use cpu_bw_dl() later on, but we need - * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such - * that we select f_max when there is no idle time. - * - * NOTE: numerical errors or stop class might cause us to not quite hit - * saturation when we should -- something for later. - */ - if (util + dl_util >= max) - return max; - - /* - * OTOH, for energy computation we need the estimated running time, so - * include util_dl and ignore dl_bw. - */ - if (type == ENERGY_UTIL) - util += dl_util; - - /* - * There is still idle time; further improve the number by using the - * irq metric. Because IRQ/steal time is hidden from the task clock we - * need to scale the task numbers: - * - * max - irq - * U' = irq + --------- * U - * max - */ - util = scale_irq_capacity(util, irq, max); - util += irq; - - /* - * Bandwidth required by DEADLINE must always be granted while, for - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism - * to gracefully reduce the frequency when no tasks show up for longer - * periods of time. - * - * Ideally we would like to set bw_dl as min/guaranteed freq and util + - * bw_dl as requested freq. However, cpufreq is not yet ready for such - * an interface. So, we only do the latter for now. - */ - if (type == FREQUENCY_UTIL) - util += cpu_bw_dl(rq); - - return min(max, util); -} - -static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util = cpu_util_cfs(rq); - unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); - sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), + FREQUENCY_UTIL, NULL); } /** @@ -375,8 +239,6 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * sugov_iowait_apply() - Apply the IO boost to a CPU. * @sg_cpu: the sugov data for the cpu to boost * @time: the update time from the caller - * @util: the utilization to (eventually) boost - * @max: the maximum value the utilization can be boosted to * * A CPU running a task which woken up after an IO operation can have its * utilization boosted to speed up the completion of those IO operations. @@ -390,18 +252,18 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - unsigned long util, unsigned long max) +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long boost; /* No boost currently required */ if (!sg_cpu->iowait_boost) - return util; + return; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return util; + return; if (!sg_cpu->iowait_boost_pending) { /* @@ -410,18 +272,21 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return util; + return; } } sg_cpu->iowait_boost_pending = false; /* - * @util is already in capacity scale; convert iowait_boost + * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT; - return max(boost, util); + boost = sg_cpu->iowait_boost * sg_policy->max; + boost >>= SCHED_CAPACITY_SHIFT; + boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); + if (sg_cpu->util < boost) + sg_cpu->util = boost; } #ifdef CONFIG_NO_HZ_COMMON @@ -441,83 +306,126 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } * Make sugov_should_update_freq() ignore the rate limit when DL * has increased the utilization. */ -static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) +static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) - sg_policy->limits_changed = true; + sg_cpu->sg_policy->limits_changed = true; } -static void sugov_update_single(struct update_util_data *hook, u64 time, - unsigned int flags) +static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, + u64 time, unsigned int flags) { - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); - struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned long util, max; - unsigned int next_f; - bool busy; - sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; - ignore_dl_rate_limit(sg_cpu, sg_policy); + ignore_dl_rate_limit(sg_cpu); - if (!sugov_should_update_freq(sg_policy, time)) - return; + if (!sugov_should_update_freq(sg_cpu->sg_policy, time)) + return false; + + sugov_get_util(sg_cpu); + sugov_iowait_apply(sg_cpu, time); + + return true; +} - /* Limits may have changed, don't skip frequency update */ - busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu); +static void sugov_update_single_freq(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned int next_f; - util = sugov_get_util(sg_cpu); - max = sg_cpu->max; - util = sugov_iowait_apply(sg_cpu, time, util, max); - next_f = get_next_freq(sg_policy, util, max); + if (!sugov_update_single_common(sg_cpu, time, flags)) + return; + + next_f = get_next_freq(sg_policy, sg_cpu->util, sg_policy->max); /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. + * + * Except when the rq is capped by uclamp_max. */ - if (busy && next_f < sg_policy->next_freq) { + if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && + sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; - /* Reset cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = 0; + /* Restore cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = cached_freq; } + if (!sugov_update_next_freq(sg_policy, time, next_f)) + return; + /* * This code runs under rq->lock for the target CPU, so it won't run * concurrently on two different CPUs for the same target and it is not * necessary to acquire the lock in the fast switch case. */ if (sg_policy->policy->fast_switch_enabled) { - sugov_fast_switch(sg_policy, time, next_f); + cpufreq_driver_fast_switch(sg_policy->policy, next_f); } else { raw_spin_lock(&sg_policy->update_lock); - sugov_deferred_update(sg_policy, time, next_f); + sugov_deferred_update(sg_policy); raw_spin_unlock(&sg_policy->update_lock); } } +static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned long prev_util = sg_cpu->util; + + /* + * Fall back to the "frequency" path if frequency invariance is not + * supported, because the direct mapping between the utilization and + * the performance levels depends on the frequency invariance. + */ + if (!arch_scale_freq_invariant()) { + sugov_update_single_freq(hook, time, flags); + return; + } + + if (!sugov_update_single_common(sg_cpu, time, flags)) + return; + + /* + * Do not reduce the target performance level if the CPU has not been + * idle recently, as the reduction is likely to be premature then. + * + * Except when the rq is capped by uclamp_max. + */ + if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && + sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + sg_cpu->util = prev_util; + + cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), + map_util_perf(sg_cpu->util), + sg_policy->max); + + sg_cpu->sg_policy->last_freq_update_time = time; +} + static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned long util = 0, max = 1; + unsigned long util = 0; unsigned int j; for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long j_util, j_max; - j_util = sugov_get_util(j_sg_cpu); - j_max = j_sg_cpu->max; - j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max); + sugov_get_util(j_sg_cpu); + sugov_iowait_apply(j_sg_cpu, time); - if (j_util * max > j_max * util) { - util = j_util; - max = j_max; - } + util = max(j_sg_cpu->util, util); } - return get_next_freq(sg_policy, util, max); + return get_next_freq(sg_policy, util, sg_policy->max); } static void @@ -532,17 +440,20 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; - ignore_dl_rate_limit(sg_cpu, sg_policy); + ignore_dl_rate_limit(sg_cpu); if (sugov_should_update_freq(sg_policy, time)) { next_f = sugov_next_freq_shared(sg_cpu, time); + if (!sugov_update_next_freq(sg_policy, time, next_f)) + goto unlock; + if (sg_policy->policy->fast_switch_enabled) - sugov_fast_switch(sg_policy, time, next_f); + cpufreq_driver_fast_switch(sg_policy->policy, next_f); else - sugov_deferred_update(sg_policy, time, next_f); + sugov_deferred_update(sg_policy); } - +unlock: raw_spin_unlock(&sg_policy->update_lock); } @@ -554,7 +465,7 @@ static void sugov_work(struct kthread_work *work) /* * Hold sg_policy->update_lock shortly to handle the case where: - * incase sg_policy->next_freq is read here, and then updated by + * in case sg_policy->next_freq is read here, and then updated by * sugov_deferred_update() just before work_in_progress is set to false * here, we may miss queueing the new update. * @@ -624,9 +535,17 @@ static struct attribute *sugov_attrs[] = { }; ATTRIBUTE_GROUPS(sugov); +static void sugov_tunables_free(struct kobject *kobj) +{ + struct gov_attr_set *attr_set = to_gov_attr_set(kobj); + + kfree(to_sugov_tunables(attr_set)); +} + static struct kobj_type sugov_tunables_ktype = { .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, + .release = &sugov_tunables_free, }; /********************** cpufreq governor interface *********************/ @@ -726,12 +645,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_free(struct sugov_tunables *tunables) +static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; - - kfree(tunables); } static int sugov_init(struct cpufreq_policy *policy) @@ -794,7 +711,7 @@ out: fail: kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); stop_kthread: sugov_kthread_stop(sg_policy); @@ -821,7 +738,7 @@ static void sugov_exit(struct cpufreq_policy *policy) count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; if (!count) - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); mutex_unlock(&global_tunables_lock); @@ -833,15 +750,18 @@ static void sugov_exit(struct cpufreq_policy *policy) static int sugov_start(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; - unsigned int cpu; + void (*uu)(struct update_util_data *data, u64 time, unsigned int flags); + unsigned int cpu = cpumask_first(policy->cpus); sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; sg_policy->last_freq_update_time = 0; sg_policy->next_freq = 0; sg_policy->work_in_progress = false; sg_policy->limits_changed = false; - sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; + sg_policy->max = arch_scale_cpu_capacity(cpu); + + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); @@ -851,13 +771,17 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; } + if (policy_is_shared(policy)) + uu = sugov_update_shared; + else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf()) + uu = sugov_update_single_perf; + else + uu = sugov_update_single_freq; + for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - policy_is_shared(policy) ? - sugov_update_shared : - sugov_update_single); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu); } return 0; } @@ -894,7 +818,7 @@ static void sugov_limits(struct cpufreq_policy *policy) struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, - .dynamic_switching = true, + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, @@ -909,23 +833,12 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif -static int __init sugov_register(void) -{ - return cpufreq_register_governor(&schedutil_gov); -} -core_initcall(sugov_register); +cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL -extern bool sched_energy_update; -extern struct mutex sched_energy_mutex; - static void rebuild_sd_workfn(struct work_struct *work) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = true; - rebuild_sched_domains(); - sched_energy_update = false; - mutex_unlock(&sched_energy_mutex); + rebuild_sched_domains_energy(); } static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 1a2719e1350a..a286e726eb4b 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -11,7 +11,7 @@ * This code tracks the priority of each CPU so that global migration * decisions are easy to calculate. Each CPU can be in a state as follows: * - * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * (INVALID), NORMAL, RT1, ... RT99, HIGHER * * going from the lowest priority to the highest. CPUs in the INVALID state * are not eligible for routing. The system maintains this state with @@ -19,30 +19,112 @@ * in that class). Therefore a typical application without affinity * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit * searches). For tasks with affinity restrictions, the algorithm has a - * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. */ -#include "sched.h" -/* Convert between a 140 based task->prio, and our 102 based cpupri */ +/* + * p->rt_priority p->prio newpri cpupri + * + * -1 -1 (CPUPRI_INVALID) + * + * 99 0 (CPUPRI_NORMAL) + * + * 1 98 98 1 + * ... + * 49 50 50 49 + * 50 49 49 50 + * ... + * 99 0 0 99 + * + * 100 100 (CPUPRI_HIGHER) + */ static int convert_prio(int prio) { int cpupri; - if (prio == CPUPRI_INVALID) - cpupri = CPUPRI_INVALID; - else if (prio == MAX_PRIO) - cpupri = CPUPRI_IDLE; - else if (prio >= MAX_RT_PRIO) - cpupri = CPUPRI_NORMAL; - else - cpupri = MAX_RT_PRIO - prio + 1; + switch (prio) { + case CPUPRI_INVALID: + cpupri = CPUPRI_INVALID; /* -1 */ + break; + + case 0 ... 98: + cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 99 */ + break; + + case MAX_RT_PRIO-1: + cpupri = CPUPRI_NORMAL; /* 0 */ + break; + + case MAX_RT_PRIO: + cpupri = CPUPRI_HIGHER; /* 100 */ + break; + } return cpupri; } +static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, int idx) +{ + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + int skip = 0; + + if (!atomic_read(&(vec)->count)) + skip = 1; + /* + * When looking at the vector, we need to read the counter, + * do a memory barrier, then read the mask. + * + * Note: This is still all racy, but we can deal with it. + * Ideally, we only want to look at masks that are set. + * + * If a mask is not set, then the only thing wrong is that we + * did a little more work than necessary. + * + * If we read a zero count but the mask is set, because of the + * memory barriers, that can only happen when the highest prio + * task for a run queue has left the run queue, in which case, + * it will be followed by a pull. If the task we are processing + * fails to find a proper place to go, that pull request will + * pull this task if the run queue is running at a lower + * priority. + */ + smp_rmb(); + + /* Need to do the rmb for every iteration */ + if (skip) + return 0; + + if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids) + return 0; + + if (lowest_mask) { + cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); + + /* + * We have to ensure that we have at least one bit + * still set in the array, since the map could have + * been concurrently emptied between the first and + * second reads of vec->mask. If we hit this + * condition, simply act as though we never hit this + * priority level and continue on. + */ + if (cpumask_empty(lowest_mask)) + return 0; + } + + return 1; +} + +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask) +{ + return cpupri_find_fitness(cp, p, lowest_mask, NULL); +} + /** - * cpupri_find - find the best (lowest-pri) CPU in the system + * cpupri_find_fitness - find the best (lowest-pri) CPU in the system * @cp: The cpupri context * @p: The task * @lowest_mask: A mask to fill in with selected CPUs (or NULL) @@ -58,84 +140,59 @@ static int convert_prio(int prio) * * Return: (int)bool - CPUs were found */ -int cpupri_find(struct cpupri *cp, struct task_struct *p, +int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask, bool (*fitness_fn)(struct task_struct *p, int cpu)) { - int idx = 0; int task_pri = convert_prio(p->prio); + int idx, cpu; - BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); + WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { - struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; - int skip = 0; - if (!atomic_read(&(vec)->count)) - skip = 1; - /* - * When looking at the vector, we need to read the counter, - * do a memory barrier, then read the mask. - * - * Note: This is still all racey, but we can deal with it. - * Ideally, we only want to look at masks that are set. - * - * If a mask is not set, then the only thing wrong is that we - * did a little more work than necessary. - * - * If we read a zero count but the mask is set, because of the - * memory barriers, that can only happen when the highest prio - * task for a run queue has left the run queue, in which case, - * it will be followed by a pull. If the task we are processing - * fails to find a proper place to go, that pull request will - * pull this task if the run queue is running at a lower - * priority. - */ - smp_rmb(); - - /* Need to do the rmb for every iteration */ - if (skip) + if (!__cpupri_find(cp, p, lowest_mask, idx)) continue; - if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) - continue; + if (!lowest_mask || !fitness_fn) + return 1; - if (lowest_mask) { - int cpu; - - cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); - - /* - * We have to ensure that we have at least one bit - * still set in the array, since the map could have - * been concurrently emptied between the first and - * second reads of vec->mask. If we hit this - * condition, simply act as though we never hit this - * priority level and continue on. - */ - if (cpumask_empty(lowest_mask)) - continue; - - if (!fitness_fn) - return 1; - - /* Ensure the capacity of the CPUs fit the task */ - for_each_cpu(cpu, lowest_mask) { - if (!fitness_fn(p, cpu)) - cpumask_clear_cpu(cpu, lowest_mask); - } - - /* - * If no CPU at the current priority can fit the task - * continue looking - */ - if (cpumask_empty(lowest_mask)) - continue; + /* Ensure the capacity of the CPUs fit the task */ + for_each_cpu(cpu, lowest_mask) { + if (!fitness_fn(p, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); } + /* + * If no CPU at the current priority can fit the task + * continue looking + */ + if (cpumask_empty(lowest_mask)) + continue; + return 1; } + /* + * If we failed to find a fitting lowest_mask, kick off a new search + * but without taking into account any fitness criteria this time. + * + * This rule favours honouring priority over fitting the task in the + * correct CPU (Capacity Awareness being the only user now). + * The idea is that if a higher priority task can run, then it should + * run even if this ends up being on unfitting CPU. + * + * The cost of this trade-off is not entirely clear and will probably + * be good for some workloads and bad for others. + * + * The main idea here is that if some CPUs were over-committed, we try + * to spread which is what the scheduler traditionally did. Sys admins + * must do proper RT planning to avoid overloading the system if they + * really care. + */ + if (fitness_fn) + return cpupri_find(cp, p, lowest_mask); + return 0; } @@ -143,7 +200,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * cpupri_set - update the CPU priority setting * @cp: The cpupri context * @cpu: The target CPU - * @newpri: The priority (INVALID-RT99) to assign to this CPU + * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked * diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 32dd520db11f..d6cba0020064 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,11 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) #define CPUPRI_INVALID -1 -#define CPUPRI_IDLE 0 -#define CPUPRI_NORMAL 1 -/* values 2-101 are RT priorities 0-99 */ +#define CPUPRI_NORMAL 0 +/* values 1-99 are for RT1-RT99 priorities */ +#define CPUPRI_HIGHER 100 struct cpupri_vec { atomic_t count; @@ -19,8 +19,10 @@ struct cpupri { #ifdef CONFIG_SMP int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask, - bool (*fitness_fn)(struct task_struct *p, int cpu)); + struct cpumask *lowest_mask); +int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cff3e656566d..95fc77853743 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,7 +2,6 @@ /* * Simple CPU accounting cgroup controller */ -#include "sched.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -44,12 +43,13 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, } /* - * Called before incrementing preempt_count on {soft,}irq_enter + * Called after incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void irqtime_account_irq(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr, unsigned int offset) { struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); + unsigned int pc; s64 delta; int cpu; @@ -59,6 +59,7 @@ void irqtime_account_irq(struct task_struct *curr) cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; + pc = irq_count() - offset; /* * We do not account for softirq time from ksoftirqd here. @@ -66,12 +67,11 @@ void irqtime_account_irq(struct task_struct *curr) * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (hardirq_count()) + if (pc & HARDIRQ_MASK) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } -EXPORT_SYMBOL_GPL(irqtime_account_irq); static u64 irqtime_tick_accounted(u64 maxtime) { @@ -147,10 +147,10 @@ void account_guest_time(struct task_struct *p, u64 cputime) /* Add guest time to cpustat. */ if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += cputime; + task_group_account_field(p, CPUTIME_NICE, cputime); cpustat[CPUTIME_GUEST_NICE] += cputime; } else { - cpustat[CPUTIME_USER] += cputime; + task_group_account_field(p, CPUTIME_USER, cputime); cpustat[CPUTIME_GUEST] += cputime; } } @@ -226,6 +226,21 @@ void account_idle_time(u64 cputime) cpustat[CPUTIME_IDLE] += cputime; } + +#ifdef CONFIG_SCHED_CORE +/* + * Account for forceidle time due to core scheduling. + * + * REQUIRES: schedstat is enabled. + */ +void __account_forceidle_time(struct task_struct *p, u64 delta) +{ + __schedstat_add(p->stats.core_forceidle_sum, delta); + + task_group_account_field(p, CPUTIME_FORCEIDLE, delta); +} +#endif + /* * When a guest is interrupted for a longer amount of time, missed clock * ticks are not redelivered later. Due to that, this function may on @@ -418,24 +433,21 @@ void vtime_task_switch(struct task_struct *prev) } # endif -/* - * Archs that account the whole time spent in the idle task - * (outside irq) as idle time can rely on this and just implement - * vtime_account_kernel() and vtime_account_idle(). Archs that - * have other meaning of the idle time (s390 only includes the - * time spent by the CPU when it's in low power mode) must override - * vtime_account(). - */ -#ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { - if (!in_interrupt() && is_idle_task(tsk)) + unsigned int pc = irq_count() - offset; + + if (pc & HARDIRQ_OFFSET) { + vtime_account_hardirq(tsk); + } else if (pc & SOFTIRQ_OFFSET) { + vtime_account_softirq(tsk); + } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && + is_idle_task(tsk)) { vtime_account_idle(tsk); - else + } else { vtime_account_kernel(tsk); + } } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, u64 *ut, u64 *st) @@ -520,50 +532,6 @@ void account_idle_ticks(unsigned long ticks) } /* - * Perform (stime * rtime) / total, but avoid multiplication overflow by - * losing precision when the numbers are big. - */ -static u64 scale_stime(u64 stime, u64 rtime, u64 total) -{ - u64 scaled; - - for (;;) { - /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) - swap(rtime, stime); - - /* Make sure 'total' fits in 32 bits */ - if (total >> 32) - goto drop_precision; - - /* Does rtime (and thus stime) fit in 32 bits? */ - if (!(rtime >> 32)) - break; - - /* Can we just balance rtime/stime rather than dropping bits? */ - if (stime >> 31) - goto drop_precision; - - /* We can grow stime and shrink rtime and try to make them both fit */ - stime <<= 1; - rtime >>= 1; - continue; - -drop_precision: - /* We drop from rtime, it has more bits than stime */ - rtime >>= 1; - total >>= 1; - } - - /* - * Make sure gcc understands that this is a 32x32->64 multiply, - * followed by a 64/32->64 divide. - */ - scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return scaled; -} - -/* * Adjust tick based cputime random precision against scheduler runtime * accounting. * @@ -609,7 +577,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, /* * If either stime or utime are 0, assume all runtime is userspace. - * Once a task gets some ticks, the monotonicy code at 'update:' + * Once a task gets some ticks, the monotonicity code at 'update:' * will ensure things converge to the observed ratio. */ if (stime == 0) { @@ -622,7 +590,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, goto update; } - stime = scale_stime(stime, rtime, stime + utime); + stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); update: /* @@ -661,7 +629,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) .sum_exec_runtime = p->se.sum_exec_runtime, }; - task_cputime(p, &cputime.utime, &cputime.stime); + if (task_cputime(p, &cputime.utime, &cputime.stime)) + cputime.sum_exec_runtime = task_sched_runtime(p); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } EXPORT_SYMBOL_GPL(task_cputime_adjusted); @@ -874,19 +843,21 @@ u64 task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) +bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { struct vtime *vtime = &t->vtime; unsigned int seq; u64 delta; + int ret; if (!vtime_accounting_enabled()) { *utime = t->utime; *stime = t->stime; - return; + return false; } do { + ret = false; seq = read_seqcount_begin(&vtime->seqcount); *utime = t->utime; @@ -896,6 +867,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) if (vtime->state < VTIME_SYS) continue; + ret = true; delta = vtime_delta(vtime); /* @@ -907,10 +879,14 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) else *utime += vtime->utime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return ret; } -static int vtime_state_check(struct vtime *vtime, int cpu) +static int vtime_state_fetch(struct vtime *vtime, int cpu) { + int state = READ_ONCE(vtime->state); + /* * We raced against a context switch, fetch the * kcpustat task again. @@ -927,10 +903,10 @@ static int vtime_state_check(struct vtime *vtime, int cpu) * * Case 1) is ok but 2) is not. So wait for a safe VTIME state. */ - if (vtime->state == VTIME_INACTIVE) + if (state == VTIME_INACTIVE) return -EAGAIN; - return 0; + return state; } static u64 kcpustat_user_vtime(struct vtime *vtime) @@ -949,14 +925,15 @@ static int kcpustat_field_vtime(u64 *cpustat, { struct vtime *vtime = &tsk->vtime; unsigned int seq; - int err; do { + int state; + seq = read_seqcount_begin(&vtime->seqcount); - err = vtime_state_check(vtime, cpu); - if (err < 0) - return err; + state = vtime_state_fetch(vtime, cpu); + if (state < 0) + return state; *val = cpustat[usage]; @@ -969,7 +946,7 @@ static int kcpustat_field_vtime(u64 *cpustat, */ switch (usage) { case CPUTIME_SYSTEM: - if (vtime->state == VTIME_SYS) + if (state == VTIME_SYS) *val += vtime->stime + vtime_delta(vtime); break; case CPUTIME_USER: @@ -981,11 +958,11 @@ static int kcpustat_field_vtime(u64 *cpustat, *val += kcpustat_user_vtime(vtime); break; case CPUTIME_GUEST: - if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0) + if (state == VTIME_GUEST && task_nice(tsk) <= 0) *val += vtime->gtime + vtime_delta(vtime); break; case CPUTIME_GUEST_NICE: - if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0) + if (state == VTIME_GUEST && task_nice(tsk) > 0) *val += vtime->gtime + vtime_delta(vtime); break; default: @@ -1000,12 +977,12 @@ u64 kcpustat_field(struct kernel_cpustat *kcpustat, enum cpu_usage_stat usage, int cpu) { u64 *cpustat = kcpustat->cpustat; + u64 val = cpustat[usage]; struct rq *rq; - u64 val; int err; if (!vtime_accounting_enabled_cpu(cpu)) - return cpustat[usage]; + return val; rq = cpu_rq(cpu); @@ -1036,23 +1013,23 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, { struct vtime *vtime = &tsk->vtime; unsigned int seq; - int err; do { u64 *cpustat; u64 delta; + int state; seq = read_seqcount_begin(&vtime->seqcount); - err = vtime_state_check(vtime, cpu); - if (err < 0) - return err; + state = vtime_state_fetch(vtime, cpu); + if (state < 0) + return state; *dst = *src; cpustat = dst->cpustat; /* Task is sleeping, dead or idle, nothing to add */ - if (vtime->state < VTIME_SYS) + if (state < VTIME_SYS) continue; delta = vtime_delta(vtime); @@ -1061,15 +1038,15 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, * Task runs either in user (including guest) or kernel space, * add pending nohz time to the right place. */ - if (vtime->state == VTIME_SYS) { + if (state == VTIME_SYS) { cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; - } else if (vtime->state == VTIME_USER) { + } else if (state == VTIME_USER) { if (task_nice(tsk) > 0) cpustat[CPUTIME_NICE] += vtime->utime + delta; else cpustat[CPUTIME_USER] += vtime->utime + delta; } else { - WARN_ON_ONCE(vtime->state != VTIME_GUEST); + WARN_ON_ONCE(state != VTIME_GUEST); if (task_nice(tsk) > 0) { cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; cpustat[CPUTIME_NICE] += vtime->gtime + delta; @@ -1080,7 +1057,7 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, } } while (read_seqcount_retry(&vtime->seqcount, seq)); - return err; + return 0; } void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 43323f875cb9..9ae8f41e3372 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -15,10 +15,42 @@ * Michael Trimarchi <michael@amarulasolutions.com>, * Fabio Checconi <fchecconi@gmail.com> */ -#include "sched.h" -#include "pelt.h" -struct dl_bandwidth def_dl_bandwidth; +/* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting ridiculously long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_dl_sysctls[] = { + { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)&sysctl_sched_dl_period_min, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra2 = (void *)&sysctl_sched_dl_period_max, + }, + {} +}; + +static int __init sched_dl_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_dl_sysctls); + return 0; +} +late_initcall(sched_dl_sysctl_init); +#endif static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { @@ -43,6 +75,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_RT_MUTEXES +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se->pi_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return pi_of(dl_se) != dl_se; +} +#else +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return false; +} +#endif + #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { @@ -54,15 +108,75 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; + int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); + + if (cpumask_subset(rd->span, cpu_active_mask)) + return cpumask_weight(rd->span); + + cpus = 0; + for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; return cpus; } + +static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) +{ + unsigned long cap = 0; + int i; + + for_each_cpu_and(i, mask, cpu_active_mask) + cap += capacity_orig_of(i); + + return cap; +} + +/* + * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity + * of the CPU the task is running on rather rd's \Sum CPU capacity. + */ +static inline unsigned long dl_bw_capacity(int i) +{ + if (!sched_asym_cpucap_active() && + capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { + return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; + } else { + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + return __dl_bw_capacity(cpu_rq(i)->rd->span); + } +} + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + struct root_domain *rd = cpu_rq(cpu)->rd; + + if (rd->visit_gen == gen) + return true; + + rd->visit_gen = gen; + return false; +} + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -73,14 +187,53 @@ static inline int dl_bw_cpus(int i) { return 1; } + +static inline unsigned long dl_bw_capacity(int i) +{ + return SCHED_CAPACITY_SCALE; +} + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + return false; +} + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} #endif static inline +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); +} + +static inline bool +__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; +} + +static inline void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); @@ -93,7 +246,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) @@ -107,7 +260,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ } @@ -117,7 +270,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) @@ -153,11 +306,11 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) __sub_running_bw(dl_se->dl_bw, dl_rq); } -void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; - BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); if (task_on_rq_queued(p)) return; @@ -168,7 +321,7 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw) p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. @@ -190,7 +343,7 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw) * fires. * * If the task wakes up again before the inactive timer fires, - * the timer is cancelled, whereas if the task wakes up after the + * the timer is canceled, whereas if the task wakes up after the * inactive timer fired (and running_bw has been decreased) the * task's utilization has to be added to running_bw again. * A flag in the deadline scheduling entity (dl_non_contending) @@ -271,15 +424,15 @@ static void task_non_contending(struct task_struct *p) if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || p->state == TASK_DEAD) { + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - if (p->state == TASK_DEAD) + if (READ_ONCE(p->__state) == TASK_DEAD) sub_rq_bw(&p->dl, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); - __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); + __dl_clear_params(p); } return; @@ -308,7 +461,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) dl_se->dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. @@ -331,9 +484,11 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; - return dl_rq->root.rb_leftmost == &dl_se->rb_node; + return rb_first_cached(&dl_rq->root) == &dl_se->rb_node; } +static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); + void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) { raw_spin_lock_init(&dl_b->dl_runtime_lock); @@ -344,12 +499,10 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); - raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); if (global_rt_runtime() == RUNTIME_INF) dl_b->bw = -1; else dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); - raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); dl_b->total_bw = 0; } @@ -438,58 +591,44 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) update_dl_migration(dl_rq); } +#define __node_2_pdl(node) \ + rb_entry((node), struct task_struct, pushable_dl_tasks) + +static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); +} + /* * The list of pushable -deadline task is not a plist, like in * sched_rt.c, it is an rb-tree with tasks ordered by deadline. */ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { - struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct task_struct *entry; - bool leftmost = true; - - BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct task_struct, - pushable_dl_tasks); - if (dl_entity_preempt(&p->dl, &entry->dl)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = false; - } - } + struct rb_node *leftmost; - if (leftmost) - dl_rq->earliest_dl.next = p->dl.deadline; + WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); - rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color_cached(&p->pushable_dl_tasks, - &dl_rq->pushable_dl_tasks_root, leftmost); + leftmost = rb_add_cached(&p->pushable_dl_tasks, + &rq->dl.pushable_dl_tasks_root, + __pushable_less); + if (leftmost) + rq->dl.earliest_dl.next = p->dl.deadline; } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; + struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root; + struct rb_node *leftmost; if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { - struct rb_node *next_node; - - next_node = rb_next(&p->pushable_dl_tasks); - if (next_node) { - dl_rq->earliest_dl.next = rb_entry(next_node, - struct task_struct, pushable_dl_tasks)->dl.deadline; - } - } + leftmost = rb_erase_cached(&p->pushable_dl_tasks, root); + if (leftmost) + dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; - rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); } @@ -502,11 +641,11 @@ static int push_dl_task(struct rq *rq); static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) { - return dl_task(prev); + return rq->online && dl_task(prev); } -static DEFINE_PER_CPU(struct callback_head, dl_push_head); -static DEFINE_PER_CPU(struct callback_head, dl_pull_head); +static DEFINE_PER_CPU(struct balance_callback, dl_push_head); +static DEFINE_PER_CPU(struct balance_callback, dl_pull_head); static void push_dl_tasks(struct rq *); static void pull_dl_task(struct rq *); @@ -545,7 +684,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * Failed to find any suitable CPU. * The task will never come back! */ - BUG_ON(dl_bandwidth_enabled()); + WARN_ON_ONCE(dl_bandwidth_enabled()); /* * If admission control is disabled we @@ -618,15 +757,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { } -static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) -{ - return false; -} - -static inline void pull_dl_task(struct rq *rq) -{ -} - static inline void deadline_queue_push_tasks(struct rq *rq) { } @@ -640,6 +770,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + struct rq *rq) +{ + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; +} + /* * We are being explicitly informed that a new instance is starting, * and this means that: @@ -657,7 +795,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(dl_se->dl_boosted); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -673,8 +811,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; - dl_se->runtime = dl_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } /* @@ -695,22 +832,19 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * could happen are, typically, a entity voluntarily trying to overcome its * runtime, or it just underestimated it during sched_setattr(). */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_se->dl_runtime <= 0); + WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. */ - if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; - } + if (dl_se->dl_deadline == 0) + replenish_dl_new_period(dl_se, rq); if (dl_se->dl_yielded && dl_se->runtime > 0) dl_se->runtime = 0; @@ -722,8 +856,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * arbitrary large. */ while (dl_se->runtime <= 0) { - dl_se->deadline += pi_se->dl_period; - dl_se->runtime += pi_se->dl_runtime; + dl_se->deadline += pi_of(dl_se)->dl_period; + dl_se->runtime += pi_of(dl_se)->dl_runtime; } /* @@ -737,8 +871,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } if (dl_se->dl_yielded) @@ -771,8 +904,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * task with deadline equal to period this is the same of using * dl_period instead of dl_deadline in the equation above. */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) { u64 left, right; @@ -794,9 +926,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * - (pi_se->dl_runtime >> DL_SCALE); + (pi_of(dl_se)->dl_runtime >> DL_SCALE); return dl_time_before(right, left); } @@ -881,24 +1013,22 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * Please refer to the comments update_dl_revised_wakeup() function to find * more about the Revised CBS rule. */ -static void update_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void update_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_entity_overflow(dl_se, rq_clock(rq))) { if (unlikely(!dl_is_implicit(dl_se) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && - !dl_se->dl_boosted)){ + !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); return; } - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } } @@ -925,7 +1055,7 @@ static int start_dl_timer(struct task_struct *p) ktime_t now, act; s64 delta; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); /* * We want the timer to fire at the deadline, but considering @@ -997,7 +1127,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ - if (dl_se->dl_boosted) + if (is_dl_boosted(dl_se)) goto unlock; /* @@ -1025,7 +1155,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * but do not enqueue -- wait for our wakeup to do that. */ if (!task_on_rq_queued(p)) { - replenish_dl_entity(dl_se, dl_se); + replenish_dl_entity(dl_se); goto unlock; } @@ -1035,9 +1165,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * If the runqueue is no longer available, migrate the * task elsewhere. This necessarily changes rq. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + lockdep_unpin_lock(__rq_lockp(rq), rf.cookie); rq = dl_task_offline_migration(rq, p); - rf.cookie = lockdep_pin_lock(&rq->lock); + rf.cookie = lockdep_pin_lock(__rq_lockp(rq)); update_rq_clock(rq); /* @@ -1096,7 +1226,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) * cannot use the runtime, and so it replenishes the task. This rule * works fine for implicit deadline tasks (deadline == period), and the * CBS was designed for implicit deadline tasks. However, a task with - * constrained deadline (deadine < period) might be awakened after the + * constrained deadline (deadline < period) might be awakened after the * deadline, but before the next period. In this case, replenishing the * task would allow it to run for runtime / deadline. As in this case * deadline < period, CBS enables a task to run for more than the @@ -1115,7 +1245,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1129,8 +1259,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) return (dl_se->runtime <= 0); } -extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); - /* * This function implements the GRUB accounting rule: * according to the GRUB reclaiming algorithm, the runtime is @@ -1144,7 +1272,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations * multiplied by 2^BW_SHIFT, the result has to be shifted right by * BW_SHIFT. - * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT, + * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. * Since delta is a 64 bit variable, to have an overflow its value * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. @@ -1203,14 +1331,12 @@ static void update_curr_dl(struct rq *rq) return; } - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->stats.exec_max, + max(curr->stats.exec_max, delta_exec)); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); + trace_sched_stat_runtime(curr, delta_exec, 0); - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); if (dl_entity_is_special(dl_se)) return; @@ -1246,7 +1372,7 @@ throttle: dl_se->dl_overrun = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -1293,10 +1419,10 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); - if (!dl_task(p) || p->state == TASK_DEAD) { + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - if (p->state == TASK_DEAD && dl_se->dl_non_contending) { + if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) { sub_running_bw(&p->dl, dl_rq_of_se(&p->dl)); sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; @@ -1329,6 +1455,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) timer->function = inactive_task_timer; } +#define __node_2_dle(node) \ + rb_entry((node), struct sched_dl_entity, rb_node) + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -1337,6 +1466,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + if (dl_rq->earliest_dl.curr == 0) + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER); dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } @@ -1354,11 +1485,11 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { - struct rb_node *leftmost = dl_rq->root.rb_leftmost; - struct sched_dl_entity *entry; + struct rb_node *leftmost = rb_first_cached(&dl_rq->root); + struct sched_dl_entity *entry = __node_2_dle(leftmost); - entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); } @@ -1399,29 +1530,94 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) dec_dl_migration(dl_se, dl_rq); } +static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); +} + +static inline struct sched_statistics * +__schedstats_from_dl_se(struct sched_dl_entity *dl_se) +{ + return &dl_task_of(dl_se)->stats; +} + +static inline void +update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats; + + if (!schedstat_enabled()) + return; + + stats = __schedstats_from_dl_se(dl_se); + __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats; + + if (!schedstat_enabled()) + return; + + stats = __schedstats_from_dl_se(dl_se); + __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats; + + if (!schedstat_enabled()) + return; + + stats = __schedstats_from_dl_se(dl_se); + __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags) +{ + if (!schedstat_enabled()) + return; + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper_dl(dl_rq, dl_se); +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags) +{ + struct task_struct *p = dl_task_of(dl_se); + + if (!schedstat_enabled()) + return; + + if ((flags & DEQUEUE_SLEEP)) { + unsigned int state; + + state = READ_ONCE(p->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(p->stats.sleep_start, + rq_clock(rq_of_dl_rq(dl_rq))); + + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(p->stats.block_start, + rq_clock(rq_of_dl_rq(dl_rq))); + } +} + static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_dl_entity *entry; - int leftmost = 1; - - BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_dl_entity, rb_node); - if (dl_time_before(dl_se->deadline, entry->deadline)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = 0; - } - } - rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); + WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node)); + + rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less); inc_dl_tasks(dl_se, dl_rq); } @@ -1434,16 +1630,18 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) return; rb_erase_cached(&dl_se->rb_node, &dl_rq->root); + RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); } static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { - BUG_ON(on_dl_rq(dl_se)); + WARN_ON_ONCE(on_dl_rq(dl_se)); + + update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); /* * If this is a wakeup or a new instance, the scheduling @@ -1452,9 +1650,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, */ if (flags & ENQUEUE_WAKEUP) { task_contending(dl_se, flags); - update_dl_entity(dl_se, pi_se); + update_dl_entity(dl_se); } else if (flags & ENQUEUE_REPLENISH) { - replenish_dl_entity(dl_se, pi_se); + replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { @@ -1471,28 +1669,43 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); - struct sched_dl_entity *pi_se = &p->dl; - - /* - * Use the scheduling parameters of the top pi-waiter task if: - * - we have a top pi-waiter which is a SCHED_DEADLINE task AND - * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is - * smaller than our deadline OR we are a !SCHED_DEADLINE task getting - * boosted due to a SCHED_DEADLINE pi-waiter). - * Otherwise we keep our runtime and deadline. - */ - if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { - pi_se = &pi_task->dl; + if (is_dl_boosted(&p->dl)) { + /* + * Because of delays in the detection of the overrun of a + * thread's runtime, it might be the case that a thread + * goes to sleep in a rt mutex with negative runtime. As + * a consequence, the thread will be throttled. + * + * While waiting for the mutex, this thread can also be + * boosted via PI, resulting in a thread that is throttled + * and boosted at the same time. + * + * In this case, the boost overrides the throttle. + */ + if (p->dl.dl_throttled) { + /* + * The replenish timer needs to be canceled. No + * problem if it fires concurrently: boosted threads + * are ignored in dl_task_timer(). + */ + hrtimer_try_to_cancel(&p->dl.dl_timer); + p->dl.dl_throttled = 0; + } } else if (!dl_prio(p->normal_prio)) { /* - * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceeds its - * runtime while doing so. No point in replenishing - * it, as it's going to return back to its original - * scheduling class after this. + * Special case in which we have a !SCHED_DEADLINE task that is going + * to be deboosted, but exceeds its runtime while doing so. No point in + * replenishing it, as it's going to return back to its original + * scheduling class after this. If it has been throttled, we need to + * clear the flag, otherwise the task may wake up as throttled after + * being boosted again with no means to replenish the runtime and clear + * the throttle. */ - BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + p->dl.dl_throttled = 0; + if (!(flags & ENQUEUE_REPLENISH)) + printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n", + task_pid_nr(p)); + return; } @@ -1529,7 +1742,10 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - enqueue_dl_entity(&p->dl, pi_se, flags); + check_schedstat_required(); + update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl); + + enqueue_dl_entity(&p->dl, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); @@ -1537,6 +1753,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { + update_stats_dequeue_dl(&rq->dl, &p->dl, flags); dequeue_dl_entity(&p->dl); dequeue_pushable_dl_task(rq, p); } @@ -1596,15 +1813,24 @@ static void yield_task_dl(struct rq *rq) #ifdef CONFIG_SMP +static inline bool dl_task_is_earliest_deadline(struct task_struct *p, + struct rq *rq) +{ + return (!rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + rq->dl.earliest_dl.curr)); +} + static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int flags) { struct task_struct *curr; + bool select_rq; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE) + if (!(flags & WF_TTWU)) goto out; rq = cpu_rq(cpu); @@ -1621,16 +1847,23 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + select_rq = unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + p->nr_cpus_allowed > 1; + + /* + * Take the capacity of the CPU into account to + * ensure it fits the requirement of the task. + */ + if (sched_asym_cpucap_active()) + select_rq |= !dl_task_fits_capacity(p, cpu); + + if (select_rq) { int target = find_later_rq(p); if (target != -1 && - (dl_time_before(p->dl.deadline, - cpu_rq(target)->dl.earliest_dl.curr) || - (cpu_rq(target)->dl.dl_nr_running == 0))) + dl_task_is_earliest_deadline(p, cpu_rq(target))) cpu = target; } rcu_read_unlock(); @@ -1641,9 +1874,10 @@ out: static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { + struct rq_flags rf; struct rq *rq; - if (p->state != TASK_WAKING) + if (READ_ONCE(p->__state) != TASK_WAKING) return; rq = task_rq(p); @@ -1652,13 +1886,14 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * from try_to_wake_up(). Hence, p->pi_lock is locked, but * rq->lock is not... So, lock it */ - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); if (p->dl.dl_non_contending) { + update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. @@ -1667,7 +1902,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused put_task_struct(p); } sub_rq_bw(&p->dl, &rq->dl); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) @@ -1745,7 +1980,12 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + p->se.exec_start = rq_clock_task(rq); + if (on_dl_rq(&p->dl)) + update_stats_wait_end_dl(dl_rq, dl_se); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); @@ -1753,7 +1993,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, p); if (rq->curr->sched_class != &dl_sched_class) @@ -1762,18 +2002,17 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) deadline_queue_push_tasks(rq); } -static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, - struct dl_rq *dl_rq) +static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) { struct rb_node *left = rb_first_cached(&dl_rq->root); if (!left) return NULL; - return rb_entry(left, struct sched_dl_entity, rb_node); + return __node_2_dle(left); } -static struct task_struct *pick_next_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -1782,15 +2021,32 @@ static struct task_struct *pick_next_task_dl(struct rq *rq) if (!sched_dl_runnable(rq)) return NULL; - dl_se = pick_next_dl_entity(rq, dl_rq); - BUG_ON(!dl_se); + dl_se = pick_next_dl_entity(dl_rq); + WARN_ON_ONCE(!dl_se); p = dl_task_of(dl_se); - set_next_task_dl(rq, p, true); + + return p; +} + +static struct task_struct *pick_next_task_dl(struct rq *rq) +{ + struct task_struct *p; + + p = pick_task_dl(rq); + if (p) + set_next_task_dl(rq, p, true); + return p; } static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + + if (on_dl_rq(&p->dl)) + update_stats_wait_start_dl(dl_rq, dl_se); + update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); @@ -1816,7 +2072,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * not being the leftmost task anymore. In that case NEED_RESCHED will * be set and schedule() will start a new hrtick for the next task. */ - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && + if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && is_leftmost(p, &rq->dl)) start_hrtick_dl(rq, p); } @@ -1836,8 +2092,8 @@ static void task_fork_dl(struct task_struct *p) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { - if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + if (!task_on_cpu(rq, p) && + cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; } @@ -1848,15 +2104,17 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) */ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) { - struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost; struct task_struct *p = NULL; + struct rb_node *next_node; if (!has_pushable_dl_tasks(rq)) return NULL; + next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); + next_node: if (next_node) { - p = rb_entry(next_node, struct task_struct, pushable_dl_tasks); + p = __node_2_pdl(next_node); if (pick_dl_task(rq, p, cpu)) return p; @@ -1927,8 +2185,8 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(later_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(later_mask, + sched_domain_span(sd)); /* * Last chance: if a CPU being in both later_mask * and current sd span is valid, that becomes our @@ -1950,7 +2208,7 @@ static int find_later_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(later_mask); + cpu = cpumask_any_distribute(later_mask); if (cpu < nr_cpu_ids) return cpu; @@ -1972,9 +2230,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); - if (later_rq->dl.dl_nr_running && - !dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) { + if (!dl_task_is_earliest_deadline(task, later_rq)) { /* * Target rq has tasks of equal or earlier deadline, * retrying does not release any lock and is unlikely @@ -1987,8 +2243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || - task_running(rq, task) || + !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || + task_on_cpu(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); @@ -2002,9 +2258,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) * its earliest one has a later deadline than our * task, the rq is a good one. */ - if (!later_rq->dl.dl_nr_running || - dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) + if (dl_task_is_earliest_deadline(task, later_rq)) break; /* Otherwise we try again. */ @@ -2022,15 +2276,14 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) if (!has_pushable_dl_tasks(rq)) return NULL; - p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, - struct task_struct, pushable_dl_tasks); + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); return p; } @@ -2054,9 +2307,6 @@ static int push_dl_task(struct rq *rq) return 0; retry: - if (WARN_ON(next_task == rq->curr)) - return 0; - /* * If next_task preempts rq->curr, and rq->curr * can move away, it makes sense to just reschedule @@ -2069,6 +2319,12 @@ retry: return 0; } + if (is_migration_disabled(next_task)) + return 0; + + if (WARN_ON(next_task == rq->curr)) + return 0; + /* We might release rq lock */ get_task_struct(next_task); @@ -2102,13 +2358,7 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); - - /* - * Update the later_rq clock here, because the clock is used - * by the cpufreq_update_util() inside __add_running_bw(). - */ - update_rq_clock(later_rq); - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); + activate_task(later_rq, next_task, 0); ret = 1; resched_curr(later_rq); @@ -2131,7 +2381,7 @@ static void push_dl_tasks(struct rq *rq) static void pull_dl_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; - struct task_struct *p; + struct task_struct *p, *push_task; bool resched = false; struct rq *src_rq; u64 dmin = LONG_MAX; @@ -2161,6 +2411,7 @@ static void pull_dl_task(struct rq *this_rq) continue; /* Might drop this_rq->lock */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2178,9 +2429,7 @@ static void pull_dl_task(struct rq *this_rq) * - it will preempt the last one we pulled (if any). */ if (p && dl_time_before(p->dl.deadline, dmin) && - (!this_rq->dl.dl_nr_running || - dl_time_before(p->dl.deadline, - this_rq->dl.earliest_dl.curr))) { + dl_task_is_earliest_deadline(p, this_rq)) { WARN_ON(p == src_rq->curr); WARN_ON(!task_on_rq_queued(p)); @@ -2192,17 +2441,27 @@ static void pull_dl_task(struct rq *this_rq) src_rq->curr->dl.deadline)) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - dmin = p->dl.deadline; + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + dmin = p->dl.deadline; + resched = true; + } /* Is there any other task even earlier? */ } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + raw_spin_rq_unlock(this_rq); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + raw_spin_rq_lock(this_rq); + } } if (resched) @@ -2215,7 +2474,7 @@ skip: */ static void task_woken_dl(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && dl_task(rq->curr) && @@ -2226,12 +2485,13 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) } static void set_cpus_allowed_dl(struct task_struct *p, - const struct cpumask *new_mask) + const struct cpumask *new_mask, + u32 flags) { struct root_domain *src_rd; struct rq *rq; - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); src_rd = rq->rd; @@ -2255,7 +2515,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - set_cpus_allowed_common(p, new_mask); + set_cpus_allowed_common(p, new_mask, flags); } /* Assumes rq->lock is held */ @@ -2294,9 +2554,13 @@ void dl_add_task_root_domain(struct task_struct *p) struct rq *rq; struct dl_bw *dl_b; - rq = task_rq_lock(p, &rf); - if (!dl_task(p)) - goto unlock; + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + if (!dl_task(p)) { + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); + return; + } + + rq = __task_rq_lock(p, &rf); dl_b = &rq->rd->dl_bw; raw_spin_lock(&dl_b->lock); @@ -2305,7 +2569,6 @@ void dl_add_task_root_domain(struct task_struct *p) raw_spin_unlock(&dl_b->lock); -unlock: task_rq_unlock(rq, p, &rf); } @@ -2389,6 +2652,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); + } else { + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); } } @@ -2399,7 +2664,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (task_on_rq_queued(p) || rq->curr == p) { + if (task_on_rq_queued(p) || task_current(rq, p)) { #ifdef CONFIG_SMP /* * This might be too much, but unfortunately @@ -2428,8 +2693,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } } -const struct sched_class dl_sched_class = { - .next = &rt_sched_class, +DEFINE_SCHED_CLASS(dl) = { + .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -2442,12 +2707,14 @@ const struct sched_class dl_sched_class = { #ifdef CONFIG_SMP .balance = balance_dl, + .pick_task = pick_task_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, + .find_lock_rq = find_lock_later_rq, #endif .task_tick = task_tick_dl, @@ -2460,33 +2727,39 @@ const struct sched_class dl_sched_class = { .update_curr = update_curr_dl, }; +/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ +static u64 dl_generation; + int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + u64 gen = ++dl_generation; struct dl_bw *dl_b; - int cpu, ret = 0; + int cpu, cpus, ret = 0; unsigned long flags; /* * Here we want to check the bandwidth not being set to some * value smaller than the currently allocated bandwidth in * any of the root_domains. - * - * FIXME: Cycling on all the CPUs is overdoing, but simpler than - * cycling on root_domains... Discussion on different/better - * solutions is welcome! */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) + goto next; + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (new_bw < dl_b->total_bw) + if (new_bw * cpus < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); +next: rcu_read_unlock_sched(); if (ret) @@ -2496,7 +2769,7 @@ int sched_dl_global_validate(void) return ret; } -void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) { if (global_rt_runtime() == RUNTIME_INF) { dl_rq->bw_ratio = 1 << RATIO_SHIFT; @@ -2512,21 +2785,22 @@ void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) void sched_dl_do_global(void) { u64 new_bw = -1; + u64 gen = ++dl_generation; struct dl_bw *dl_b; int cpu; unsigned long flags; - def_dl_bandwidth.dl_period = global_rt_period(); - def_dl_bandwidth.dl_runtime = global_rt_runtime(); - if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - /* - * FIXME: As above... - */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) { + rcu_read_unlock_sched(); + continue; + } + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); @@ -2549,11 +2823,12 @@ void sched_dl_do_global(void) int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus, err = -1; + int cpus, err = -1, cpu = task_cpu(p); + struct dl_bw *dl_b = dl_bw_of(cpu); + unsigned long cap; if (attr->sched_flags & SCHED_FLAG_SUGOV) return 0; @@ -2568,15 +2843,17 @@ int sched_dl_overflow(struct task_struct *p, int policy, * allocated bandwidth of the container. */ raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { + !__dl_overflow(dl_b, cap, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) { /* * XXX this is slightly incorrect: when the task * utilization decreases, we should delay the total @@ -2616,7 +2893,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; + dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } @@ -2629,7 +2906,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_runtime = dl_se->dl_runtime; attr->sched_deadline = dl_se->dl_deadline; attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; + attr->sched_flags &= ~SCHED_DL_FLAGS; + attr->sched_flags |= dl_se->flags; } /* @@ -2644,6 +2922,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + u64 period, max, min; + /* special dl tasks don't actually use any parameter */ if (attr->sched_flags & SCHED_FLAG_SUGOV) return true; @@ -2667,12 +2947,21 @@ bool __checkparam_dl(const struct sched_attr *attr) attr->sched_period & (1ULL << 63)) return false; + period = attr->sched_period; + if (!period) + period = attr->sched_deadline; + /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || + if (period < attr->sched_deadline || attr->sched_deadline < attr->sched_runtime) return false; + max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC; + min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC; + + if (period < min || period > max) + return false; + return true; } @@ -2694,6 +2983,10 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + +#ifdef CONFIG_RT_MUTEXES + dl_se->pi_se = dl_se; +#endif } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) @@ -2703,60 +2996,25 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) if (dl_se->dl_runtime != attr->sched_runtime || dl_se->dl_deadline != attr->sched_deadline || dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) + dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS)) return true; return false; } #ifdef CONFIG_SMP -int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) -{ - unsigned int dest_cpu; - struct dl_bw *dl_b; - bool overflow; - int cpus, ret; - unsigned long flags; - - dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); - - rcu_read_lock_sched(); - dl_b = dl_bw_of(dest_cpu); - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); - if (overflow) { - ret = -EBUSY; - } else { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). - */ - __dl_add(dl_b, p->dl.dl_bw, cpus); - ret = 0; - } - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - rcu_read_unlock_sched(); - - return ret; -} - int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { - int ret = 1, trial_cpus; + unsigned long flags, cap; struct dl_bw *cur_dl_b; - unsigned long flags; + int ret = 1; rcu_read_lock_sched(); cur_dl_b = dl_bw_of(cpumask_any(cur)); - trial_cpus = cpumask_weight(trial); - + cap = __dl_bw_capacity(trial); raw_spin_lock_irqsave(&cur_dl_b->lock, flags); - if (cur_dl_b->bw != -1 && - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + if (__dl_overflow(cur_dl_b, cap, 0, 0)) ret = 0; raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); rcu_read_unlock_sched(); @@ -2764,22 +3022,32 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -bool dl_cpu_busy(unsigned int cpu) +int dl_cpu_busy(int cpu, struct task_struct *p) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow; - int cpus; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); + cap = dl_bw_capacity(cpu); + overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0); + + if (!overflow && p) { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu)); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - return overflow; + return overflow ? -EBUSY : 0; } #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 879d3ccf3806..1637b65ba07a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,9 +6,6 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ -#include "sched.h" - -static DEFINE_SPINLOCK(sched_debug_lock); /* * This allows printing both to /proc/sched_debug and @@ -169,191 +166,273 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; -__read_mostly bool sched_debug_enabled; +#ifdef CONFIG_SMP -static __init int sched_init_debug(void) +static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); + char buf[16]; + unsigned int scaling; + + if (cnt > 15) + cnt = 15; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + buf[cnt] = '\0'; + + if (kstrtouint(buf, 10, &scaling)) + return -EINVAL; - debugfs_create_bool("sched_debug", 0644, NULL, - &sched_debug_enabled); + if (scaling >= SCHED_TUNABLESCALING_END) + return -EINVAL; + sysctl_sched_tunable_scaling = scaling; + if (sched_update_scaling()) + return -EINVAL; + + *ppos += cnt; + return cnt; +} + +static int sched_scaling_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", sysctl_sched_tunable_scaling); return 0; } -late_initcall(sched_init_debug); -#ifdef CONFIG_SMP +static int sched_scaling_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_scaling_show, NULL); +} -#ifdef CONFIG_SYSCTL +static const struct file_operations sched_scaling_fops = { + .open = sched_scaling_open, + .write = sched_scaling_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} +#endif /* SMP */ + +#ifdef CONFIG_PREEMPT_DYNAMIC + +static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[16]; + int mode; + + if (cnt > 15) + cnt = 15; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + mode = sched_dynamic_mode(strstrip(buf)); + if (mode < 0) + return mode; + + sched_dynamic_update(mode); + + *ppos += cnt; + + return cnt; +} + +static int sched_dynamic_show(struct seq_file *m, void *v) +{ + static const char * preempt_modes[] = { + "none", "voluntary", "full" + }; + int i; + + for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { + if (preempt_dynamic_mode == i) + seq_puts(m, "("); + seq_puts(m, preempt_modes[i]); + if (preempt_dynamic_mode == i) + seq_puts(m, ")"); + + seq_puts(m, " "); + } + + seq_puts(m, "\n"); + return 0; +} + +static int sched_dynamic_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_dynamic_show, NULL); +} + +static const struct file_operations sched_dynamic_fops = { + .open = sched_dynamic_open, + .write = sched_dynamic_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} +#endif /* CONFIG_PREEMPT_DYNAMIC */ + +__read_mostly bool sched_debug_verbose; + +static const struct seq_operations sched_debug_sops; + +static int sched_debug_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &sched_debug_sops); +} + +static const struct file_operations sched_debug_fops = { + .open = sched_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; -static struct ctl_table *sd_alloc_ctl_entry(int n) +static struct dentry *debugfs_sched; + +static __init int sched_init_debug(void) { - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + struct dentry __maybe_unused *numa; + + debugfs_sched = debugfs_create_dir("sched", NULL); + + debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops); + debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose); +#ifdef CONFIG_PREEMPT_DYNAMIC + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); +#endif + + debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); + debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); + debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); + debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); - return entry; + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); + +#ifdef CONFIG_SMP + debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); + debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); + debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + + mutex_lock(&sched_domains_mutex); + update_sched_domain_debugfs(); + mutex_unlock(&sched_domains_mutex); +#endif + +#ifdef CONFIG_NUMA_BALANCING + numa = debugfs_create_dir("numa_balancing", debugfs_sched); + + debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay); + debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min); + debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); + debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); +#endif + + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + return 0; } +late_initcall(sched_init_debug); + +#ifdef CONFIG_SMP -static void sd_free_ctl_entry(struct ctl_table **tablep) +static cpumask_var_t sd_sysctl_cpus; +static struct dentry *sd_dentry; + +static int sd_flags_show(struct seq_file *m, void *v) { - struct ctl_table *entry; + unsigned long flags = *(unsigned int *)m->private; + int idx; - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + seq_puts(m, sd_flag_debug[idx].name); + seq_puts(m, " "); } + seq_puts(m, "\n"); - kfree(*tablep); - *tablep = NULL; + return 0; } -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(9); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); - /* &table[8] is terminator */ - - return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; +static int sd_flags_open(struct inode *inode, struct file *file) +{ + return single_open(file, sd_flags_show, inode->i_private); } -static cpumask_var_t sd_sysctl_cpus; -static struct ctl_table_header *sd_sysctl_header; +static const struct file_operations sd_flags_fops = { + .open = sd_flags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; -void register_sched_domain_sysctl(void) +static void register_sd(struct sched_domain *sd, struct dentry *parent) { - static struct ctl_table *cpu_entries; - static struct ctl_table **cpu_idx; - static bool init_done = false; - char buf[32]; - int i; +#define SDM(type, mode, member) \ + debugfs_create_##type(#member, mode, parent, &sd->member) - if (!cpu_entries) { - cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); - if (!cpu_entries) - return; + SDM(ulong, 0644, min_interval); + SDM(ulong, 0644, max_interval); + SDM(u64, 0644, max_newidle_lb_cost); + SDM(u32, 0644, busy_factor); + SDM(u32, 0644, imbalance_pct); + SDM(u32, 0644, cache_nice_tries); + SDM(str, 0444, name); - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = cpu_entries; - } +#undef SDM - if (!cpu_idx) { - struct ctl_table *e = cpu_entries; + debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); +} - cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); - if (!cpu_idx) - return; +void update_sched_domain_debugfs(void) +{ + int cpu, i; - /* deal with sparse possible map */ - for_each_possible_cpu(i) { - cpu_idx[i] = e; - e++; - } - } + /* + * This can unfortunately be invoked before sched_debug_init() creates + * the debug directory. Don't touch sd_sysctl_cpus until then. + */ + if (!debugfs_sched) + return; if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; - } - - if (!init_done) { - init_done = true; - /* init to possible to not have holes in @cpu_entries */ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); } - for_each_cpu(i, sd_sysctl_cpus) { - struct ctl_table *e = cpu_idx[i]; + if (!sd_dentry) + sd_dentry = debugfs_create_dir("domains", debugfs_sched); + + for_each_cpu(cpu, sd_sysctl_cpus) { + struct sched_domain *sd; + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%d", cpu); + debugfs_lookup_and_remove(buf, sd_dentry); + d_cpu = debugfs_create_dir(buf, sd_dentry); + + i = 0; + for_each_domain(cpu, sd) { + struct dentry *d_sd; - if (e->child) - sd_free_ctl_entry(&e->child); + snprintf(buf, sizeof(buf), "domain%d", i); + d_sd = debugfs_create_dir(buf, d_cpu); - if (!e->procname) { - snprintf(buf, 32, "cpu%d", i); - e->procname = kstrdup(buf, GFP_KERNEL); + register_sd(sd, d_sd); + i++; } - e->mode = 0555; - e->child = sd_alloc_ctl_cpu_table(i); - __cpumask_clear_cpu(i, sd_sysctl_cpus); + __cpumask_clear_cpu(cpu, sd_sysctl_cpus); } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); } void dirty_sched_domain_sysctl(int cpu) @@ -362,13 +441,6 @@ void dirty_sched_domain_sysctl(int cpu) __cpumask_set_cpu(cpu, sd_sysctl_cpus); } -/* may be called multiple times per register */ -void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; -} -#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -377,9 +449,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group struct sched_entity *se = tg->se[cpu]; #define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) -#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) +#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \ + #F, (long long)schedstat_val(stats->F)) #define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) +#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", \ + #F, SPLIT_NS((long long)schedstat_val(stats->F))) if (!se) return; @@ -389,24 +463,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->sum_exec_runtime); if (schedstat_enabled()) { - PN_SCHEDSTAT(se->statistics.wait_start); - PN_SCHEDSTAT(se->statistics.sleep_start); - PN_SCHEDSTAT(se->statistics.block_start); - PN_SCHEDSTAT(se->statistics.sleep_max); - PN_SCHEDSTAT(se->statistics.block_max); - PN_SCHEDSTAT(se->statistics.exec_max); - PN_SCHEDSTAT(se->statistics.slice_max); - PN_SCHEDSTAT(se->statistics.wait_max); - PN_SCHEDSTAT(se->statistics.wait_sum); - P_SCHEDSTAT(se->statistics.wait_count); + struct sched_statistics *stats; + stats = __schedstats_from_se(se); + + PN_SCHEDSTAT(wait_start); + PN_SCHEDSTAT(sleep_start); + PN_SCHEDSTAT(block_start); + PN_SCHEDSTAT(sleep_max); + PN_SCHEDSTAT(block_max); + PN_SCHEDSTAT(exec_max); + PN_SCHEDSTAT(slice_max); + PN_SCHEDSTAT(wait_max); + PN_SCHEDSTAT(wait_sum); + P_SCHEDSTAT(wait_count); } P(se->load.weight); - P(se->runnable_weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); - P(se->avg.runnable_load_avg); + P(se->avg.runnable_avg); #endif #undef PN_SCHEDSTAT @@ -417,43 +493,65 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #endif #ifdef CONFIG_CGROUP_SCHED +static DEFINE_SPINLOCK(sched_debug_lock); static char group_path[PATH_MAX]; -static char *task_group_path(struct task_group *tg) +static void task_group_path(struct task_group *tg, char *path, int plen) { - if (autogroup_path(tg, group_path, PATH_MAX)) - return group_path; + if (autogroup_path(tg, path, plen)) + return; - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + cgroup_path(tg->css.cgroup, path, plen); +} - return group_path; +/* + * Only 1 SEQ_printf_task_group_path() caller can use the full length + * group_path[] for cgroup path. Other simultaneous callers will have + * to use a shorter stack buffer. A "..." suffix is appended at the end + * of the stack buffer so that it will show up in case the output length + * matches the given buffer size to indicate possible path name truncation. + */ +#define SEQ_printf_task_group_path(m, tg, fmt...) \ +{ \ + if (spin_trylock(&sched_debug_lock)) { \ + task_group_path(tg, group_path, sizeof(group_path)); \ + SEQ_printf(m, fmt, group_path); \ + spin_unlock(&sched_debug_lock); \ + } else { \ + char buf[128]; \ + char *bufend = buf + sizeof(buf) - 3; \ + task_group_path(tg, buf, bufend - buf); \ + strcpy(bufend - 1, "..."); \ + SEQ_printf(m, fmt, buf); \ + } \ } #endif static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { - if (rq->curr == p) + if (task_current(rq, p)) SEQ_printf(m, ">R"); else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), + SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", + SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif #ifdef CONFIG_CGROUP_SCHED - SEQ_printf(m, " %s", task_group_path(task_group(p))); + SEQ_printf_task_group_path(m, task_group(p), " %s") #endif SEQ_printf(m, "\n"); @@ -465,10 +563,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" + SEQ_printf(m, " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n"); SEQ_printf(m, "-------------------------------------------------------" - "----------------------------------------------------\n"); + "------------------------------------------------------\n"); rcu_read_lock(); for_each_process_thread(g, p) { @@ -490,7 +588,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, "\n"); - SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); + SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu); #else SEQ_printf(m, "\n"); SEQ_printf(m, "cfs_rq[%d]:\n", cpu); @@ -498,7 +596,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); if (rb_first_cached(&cfs_rq->tasks_timeline)) MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); @@ -506,7 +604,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) max_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", @@ -522,13 +620,17 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", + cfs_rq->idle_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", + cfs_rq->idle_h_nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); - SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", - cfs_rq->avg.runnable_load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", + cfs_rq->avg.runnable_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", @@ -537,8 +639,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", cfs_rq->removed.util_avg); - SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", - cfs_rq->removed.runnable_sum); + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg", + cfs_rq->removed.runnable_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); @@ -562,7 +664,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #ifdef CONFIG_RT_GROUP_SCHED SEQ_printf(m, "\n"); - SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); + SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu); #else SEQ_printf(m, "\n"); SEQ_printf(m, "rt_rq[%d]:\n", cpu); @@ -614,7 +716,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; #ifdef CONFIG_X86 { @@ -640,7 +741,6 @@ do { \ P(nr_running); P(nr_switches); - P(nr_load_updates); P(nr_uninterruptible); PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); @@ -666,13 +766,11 @@ do { \ } #undef P - spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); print_dl_stats(m, cpu); print_rq(m, rq, cpu); - spin_unlock_irqrestore(&sched_debug_lock, flags); SEQ_printf(m, "\n"); } @@ -721,6 +819,7 @@ static void sched_debug_header(struct seq_file *m) SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); + PN(sysctl_sched_idle_min_granularity); PN(sysctl_sched_wakeup_granularity); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); @@ -764,7 +863,7 @@ void sysrq_sched_debug_show(void) } /* - * This itererator needs some explanation. + * This iterator needs some explanation. * It returns 1 for the header position. * This means 2 is CPU 0. * In a hotplugged system some CPUs, including CPU 0, may be missing so we have @@ -809,19 +908,13 @@ static const struct seq_operations sched_debug_sops = { .show = sched_debug_show, }; -static int __init init_sched_debug_procfs(void) -{ - if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops)) - return -ENOMEM; - return 0; -} - -__initcall(init_sched_debug_procfs); - -#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F)) +#define __P(F) __PS(#F, F) +#define P(F) __PS(#F, p->F) +#define PM(F, M) __PS(#F, p->F & (M)) +#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F))) +#define __PN(F) __PSN(#F, F) +#define PN(F) __PSN(#F, p->F) #ifdef CONFIG_NUMA_BALANCING @@ -838,25 +931,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, static void sched_show_numa(struct task_struct *p, struct seq_file *m) { #ifdef CONFIG_NUMA_BALANCING - struct mempolicy *pol; - if (p->mm) P(mm->numa_scan_seq); - task_lock(p); - pol = p->mempolicy; - if (pol && !(pol->flags & MPOL_F_MORON)) - pol = NULL; - mpol_get(pol); - task_unlock(p); - P(numa_pages_migrated); P(numa_preferred_nid); P(total_numa_faults); SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); - mpol_put(pol); #endif } @@ -870,18 +953,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, SEQ_printf(m, "---------------------------------------------------------" "----------\n"); -#define __P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define P_SCHEDSTAT(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) -#define __PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) -#define PN_SCHEDSTAT(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) + +#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F)) +#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F)) PN(se.exec_start); PN(se.vruntime); @@ -894,33 +968,34 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; - PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); - PN_SCHEDSTAT(se.statistics.wait_start); - PN_SCHEDSTAT(se.statistics.sleep_start); - PN_SCHEDSTAT(se.statistics.block_start); - PN_SCHEDSTAT(se.statistics.sleep_max); - PN_SCHEDSTAT(se.statistics.block_max); - PN_SCHEDSTAT(se.statistics.exec_max); - PN_SCHEDSTAT(se.statistics.slice_max); - PN_SCHEDSTAT(se.statistics.wait_max); - PN_SCHEDSTAT(se.statistics.wait_sum); - P_SCHEDSTAT(se.statistics.wait_count); - PN_SCHEDSTAT(se.statistics.iowait_sum); - P_SCHEDSTAT(se.statistics.iowait_count); - P_SCHEDSTAT(se.statistics.nr_migrations_cold); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); - P_SCHEDSTAT(se.statistics.nr_forced_migrations); - P_SCHEDSTAT(se.statistics.nr_wakeups); - P_SCHEDSTAT(se.statistics.nr_wakeups_sync); - P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); - P_SCHEDSTAT(se.statistics.nr_wakeups_local); - P_SCHEDSTAT(se.statistics.nr_wakeups_remote); - P_SCHEDSTAT(se.statistics.nr_wakeups_affine); - P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); - P_SCHEDSTAT(se.statistics.nr_wakeups_passive); - P_SCHEDSTAT(se.statistics.nr_wakeups_idle); + PN_SCHEDSTAT(sum_sleep_runtime); + PN_SCHEDSTAT(sum_block_runtime); + PN_SCHEDSTAT(wait_start); + PN_SCHEDSTAT(sleep_start); + PN_SCHEDSTAT(block_start); + PN_SCHEDSTAT(sleep_max); + PN_SCHEDSTAT(block_max); + PN_SCHEDSTAT(exec_max); + PN_SCHEDSTAT(slice_max); + PN_SCHEDSTAT(wait_max); + PN_SCHEDSTAT(wait_sum); + P_SCHEDSTAT(wait_count); + PN_SCHEDSTAT(iowait_sum); + P_SCHEDSTAT(iowait_count); + P_SCHEDSTAT(nr_migrations_cold); + P_SCHEDSTAT(nr_failed_migrations_affine); + P_SCHEDSTAT(nr_failed_migrations_running); + P_SCHEDSTAT(nr_failed_migrations_hot); + P_SCHEDSTAT(nr_forced_migrations); + P_SCHEDSTAT(nr_wakeups); + P_SCHEDSTAT(nr_wakeups_sync); + P_SCHEDSTAT(nr_wakeups_migrate); + P_SCHEDSTAT(nr_wakeups_local); + P_SCHEDSTAT(nr_wakeups_remote); + P_SCHEDSTAT(nr_wakeups_affine); + P_SCHEDSTAT(nr_wakeups_affine_attempts); + P_SCHEDSTAT(nr_wakeups_passive); + P_SCHEDSTAT(nr_wakeups_idle); avg_atom = p->se.sum_exec_runtime; if (nr_switches) @@ -938,26 +1013,33 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PN(avg_atom); __PN(avg_per_cpu); + +#ifdef CONFIG_SCHED_CORE + PN_SCHEDSTAT(core_forceidle_sum); +#endif } __P(nr_switches); - SEQ_printf(m, "%-45s:%21Ld\n", - "nr_voluntary_switches", (long long)p->nvcsw); - SEQ_printf(m, "%-45s:%21Ld\n", - "nr_involuntary_switches", (long long)p->nivcsw); + __PS("nr_voluntary_switches", p->nvcsw); + __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); - P(se.runnable_weight); #ifdef CONFIG_SMP P(se.avg.load_sum); - P(se.avg.runnable_load_sum); + P(se.avg.runnable_sum); P(se.avg.util_sum); P(se.avg.load_avg); - P(se.avg.runnable_load_avg); + P(se.avg.runnable_avg); P(se.avg.util_avg); P(se.avg.last_update_time); P(se.avg.util_est.ewma); - P(se.avg.util_est.enqueued); + PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED); +#endif +#ifdef CONFIG_UCLAMP_TASK + __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); + __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value); + __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); + __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); #endif P(policy); P(prio); @@ -966,11 +1048,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(dl.deadline); } #undef PN_SCHEDSTAT -#undef PN -#undef __PN #undef P_SCHEDSTAT -#undef P -#undef __P { unsigned int this_cpu = raw_smp_processor_id(); @@ -978,8 +1056,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, t0 = cpu_clock(this_cpu); t1 = cpu_clock(this_cpu); - SEQ_printf(m, "%-45s:%21Ld\n", - "clock-delta", (long long)(t1-t0)); + __PS("clock-delta", t1-t0); } sched_show_numa(p, m); @@ -988,6 +1065,16 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - memset(&p->se.statistics, 0, sizeof(p->se.statistics)); + memset(&p->stats, 0, sizeof(p->stats)); #endif } + +void resched_latency_warn(int cpu, u64 latency) +{ + static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1); + + WARN(__ratelimit(&latency_check_ratelimit), + "sched: CPU %d need_resched set for > %llu ns (%d ticks) " + "without schedule\n", + cpu, latency, cpu_rq(cpu)->ticks_without_resched); +} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c1217bfe5e81..e4a0b8bd941c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,9 +20,41 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include "sched.h" +#include <linux/energy_model.h> +#include <linux/mmap_lock.h> +#include <linux/hugetlb_inline.h> +#include <linux/jiffies.h> +#include <linux/mm_api.h> +#include <linux/highmem.h> +#include <linux/spinlock_api.h> +#include <linux/cpumask_api.h> +#include <linux/lockdep_api.h> +#include <linux/softirq.h> +#include <linux/refcount_api.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <linux/sched/cond_resched.h> +#include <linux/sched/cputime.h> +#include <linux/sched/isolation.h> +#include <linux/sched/nohz.h> + +#include <linux/cpuidle.h> +#include <linux/interrupt.h> +#include <linux/memory-tiers.h> +#include <linux/mempolicy.h> +#include <linux/mutex_api.h> +#include <linux/profile.h> +#include <linux/psi.h> +#include <linux/ratelimit.h> +#include <linux/task_work.h> + +#include <asm/switch_to.h> + +#include <linux/sched/cond_resched.h> -#include <trace/events/sched.h> +#include "sched.h" +#include "stats.h" +#include "autogroup.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -51,7 +83,7 @@ static unsigned int normalized_sysctl_sched_latency = 6000000ULL; * * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ -enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: @@ -62,6 +94,14 @@ unsigned int sysctl_sched_min_granularity = 750000ULL; static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* + * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. + * Applies only when SCHED_IDLE tasks compete with normal tasks. + * + * (default: 0.75 msec) + */ +unsigned int sysctl_sched_idle_min_granularity = 750000ULL; + +/* * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity */ static unsigned int sched_nr_latency = 8; @@ -86,6 +126,19 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +int sched_thermal_decay_shift; +static int __init setup_sched_thermal_decay_shift(char *str) +{ + int _shift = 0; + + if (kstrtoint(str, 0, &_shift)) + pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); + + sched_thermal_decay_shift = clamp(_shift, 0, 10); + return 1; +} +__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); + #ifdef CONFIG_SMP /* * For asym packing, by default the lower numbered CPU has higher priority. @@ -102,6 +155,13 @@ int __weak arch_asym_cpu_priority(int cpu) */ #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) +/* + * The margin used when comparing CPU capacities. + * is 'cap1' noticeably greater than 'cap2' + * + * (default: ~5%) + */ +#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -115,7 +175,37 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_fair_sysctls[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif + {} +}; + +static int __init sched_fair_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_fair_sysctls); + return 0; +} +late_initcall(sched_fair_sysctl_init); #endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) @@ -178,7 +268,7 @@ static void update_sysctl(void) #undef SET_SYSCTL } -void sched_init_granularity(void) +void __init sched_init_granularity(void) { update_sysctl(); } @@ -218,22 +308,25 @@ static void __update_inv_weight(struct load_weight *lw) static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) { u64 fact = scale_load_down(weight); + u32 fact_hi = (u32)(fact >> 32); int shift = WMULT_SHIFT; + int fs; __update_inv_weight(lw); - if (unlikely(fact >> 32)) { - while (fact >> 32) { - fact >>= 1; - shift--; - } + if (unlikely(fact_hi)) { + fs = fls(fact_hi); + shift -= fs; + fact >>= fs; } fact = mul_u32_u32(fact, lw->inv_weight); - while (fact >> 32) { - fact >>= 1; - shift--; + fact_hi = (u32)(fact >> 32); + if (fact_hi) { + fs = fls(fact_hi); + shift -= fs; + fact >>= fs; } return mul_u64_u32_shr(delta_exec, fact, shift); @@ -247,46 +340,11 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (!path) - return; - - if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) - autogroup_path(cfs_rq->tg, path, len); - else if (cfs_rq && cfs_rq->tg->css.cgroup) - cgroup_path(cfs_rq->tg->css.cgroup, path, len); - else - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -432,40 +490,27 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } -#else /* !CONFIG_FAIR_GROUP_SCHED */ - -static inline struct task_struct *task_of(struct sched_entity *se) +static int tg_is_idle(struct task_group *tg) { - return container_of(se, struct task_struct, se); + return tg->idle > 0; } -#define for_each_sched_entity(se) \ - for (; se; se = NULL) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) { - return &task_rq(p)->cfs; + return cfs_rq->idle > 0; } -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static int se_is_idle(struct sched_entity *se) { - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; + if (entity_is_task(se)) + return task_has_idle_policy(task_of(se)); + return cfs_rq_is_idle(group_cfs_rq(se)); } -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} +#else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (path) - strlcpy(path, "(null)", len); -} +#define for_each_sched_entity(se) \ + for (; se; se = NULL) static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { @@ -493,6 +538,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } +static inline int tg_is_idle(struct task_group *tg) +{ + return 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + return 0; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline @@ -520,12 +580,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline int entity_before(struct sched_entity *a, +static inline bool entity_before(struct sched_entity *a, struct sched_entity *b) { return (s64)(a->vruntime - b->vruntime) < 0; } +#define __node_2_se(node) \ + rb_entry((node), struct sched_entity, run_node) + static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; @@ -541,8 +604,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } if (leftmost) { /* non-empty tree */ - struct sched_entity *se; - se = rb_entry(leftmost, struct sched_entity, run_node); + struct sched_entity *se = __node_2_se(leftmost); if (!curr) vruntime = se->vruntime; @@ -551,11 +613,13 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif + u64_u32_store(cfs_rq->min_vruntime, + max_vruntime(cfs_rq->min_vruntime, vruntime)); +} + +static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +{ + return entity_before(__node_2_se(a), __node_2_se(b)); } /* @@ -563,32 +627,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - bool leftmost = true; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (entity_before(se, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&se->run_node, parent, link); - rb_insert_color_cached(&se->run_node, - &cfs_rq->tasks_timeline, leftmost); + rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -603,7 +642,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) if (!left) return NULL; - return rb_entry(left, struct sched_entity, run_node); + return __node_2_se(left); } static struct sched_entity *__pick_next_entity(struct sched_entity *se) @@ -613,7 +652,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) if (!next) return NULL; - return rb_entry(next, struct sched_entity, run_node); + return __node_2_se(next); } #ifdef CONFIG_SCHED_DEBUG @@ -624,23 +663,17 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) if (!last) return NULL; - return rb_entry(last, struct sched_entity, run_node); + return __node_2_se(last); } /************************************************************** * Scheduling class statistics methods: */ -int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_update_scaling(void) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); - if (ret || !write) - return ret; - sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity); @@ -682,6 +715,8 @@ static u64 __sched_period(unsigned long nr_running) return sysctl_sched_latency; } +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq); + /* * We calculate the wall-time slice from the period by taking a part * proportional to the weight. @@ -690,23 +725,42 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); + unsigned int nr_running = cfs_rq->nr_running; + struct sched_entity *init_se = se; + unsigned int min_gran; + u64 slice; + + if (sched_feat(ALT_PERIOD)) + nr_running = rq_of(cfs_rq)->cfs.h_nr_running; + + slice = __sched_period(nr_running + !se->on_rq); for_each_sched_entity(se) { struct load_weight *load; struct load_weight lw; + struct cfs_rq *qcfs_rq; - cfs_rq = cfs_rq_of(se); - load = &cfs_rq->load; + qcfs_rq = cfs_rq_of(se); + load = &qcfs_rq->load; if (unlikely(!se->on_rq)) { - lw = cfs_rq->load; + lw = qcfs_rq->load; update_load_add(&lw, se->load.weight); load = &lw; } slice = __calc_delta(slice, se->load.weight, load); } + + if (sched_feat(BASE_SLICE)) { + if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq)) + min_gran = sysctl_sched_idle_min_granularity; + else + min_gran = sysctl_sched_min_granularity; + + slice = max_t(u64, slice, min_gran); + } + return slice; } @@ -741,15 +795,11 @@ void init_entity_runnable_average(struct sched_entity *se) * nothing has been attached to the task group yet. */ if (entity_is_task(se)) - sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); - - se->runnable_weight = se->load.weight; + sa->load_avg = scale_load_down(se->load.weight); /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static void attach_entity_cfs_rq(struct sched_entity *se); - /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -784,18 +834,6 @@ void post_init_entity_util_avg(struct task_struct *p) long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; - if (cap > 0) { - if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; - sa->util_avg /= (cfs_rq->avg.load_avg + 1); - - if (sa->util_avg > cap) - sa->util_avg = cap; - } else { - sa->util_avg = cap; - } - } - if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: @@ -811,7 +849,19 @@ void post_init_entity_util_avg(struct task_struct *p) return; } - attach_entity_cfs_rq(se); + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + } + + sa->runnable_avg = sa->util_avg; } #else /* !CONFIG_SMP */ @@ -821,7 +871,7 @@ void init_entity_runnable_average(struct sched_entity *se) void post_init_entity_util_avg(struct task_struct *p) { } -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static void update_tg_load_avg(struct cfs_rq *cfs_rq) { } #endif /* CONFIG_SMP */ @@ -844,8 +894,13 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->exec_start = now; - schedstat_set(curr->statistics.exec_max, - max(delta_exec, curr->statistics.exec_max)); + if (schedstat_enabled()) { + struct sched_statistics *stats; + + stats = __schedstats_from_se(curr); + __schedstat_set(stats->exec_max, + max(delta_exec, stats->exec_max)); + } curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); @@ -870,128 +925,70 @@ static void update_curr_fair(struct rq *rq) } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 wait_start, prev_wait_start; + struct sched_statistics *stats; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(se->statistics.wait_start); + stats = __schedstats_from_se(se); - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; + if (entity_is_task(se)) + p = task_of(se); - __schedstat_set(se->statistics.wait_start, wait_start); + __update_stats_wait_start(rq_of(cfs_rq), p, stats); } static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *p; - u64 delta; + struct sched_statistics *stats; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); + stats = __schedstats_from_se(se); - if (entity_is_task(se)) { + /* + * When the sched_schedstat changes from 0 to 1, some sched se + * maybe already in the runqueue, the se->statistics.wait_start + * will be 0.So it will let the delta wrong. We need to avoid this + * scenario. + */ + if (unlikely(!schedstat_val(stats->wait_start))) + return; + + if (entity_is_task(se)) p = task_of(se); - if (task_on_rq_migrating(p)) { - /* - * Preserve migrating task's wait time so wait_start - * time stamp can be adjusted to accumulate wait time - * prior to migration. - */ - __schedstat_set(se->statistics.wait_start, delta); - return; - } - trace_sched_stat_wait(p, delta); - } - __schedstat_set(se->statistics.wait_max, - max(schedstat_val(se->statistics.wait_max), delta)); - __schedstat_inc(se->statistics.wait_count); - __schedstat_add(se->statistics.wait_sum, delta); - __schedstat_set(se->statistics.wait_start, 0); + __update_stats_wait_end(rq_of(cfs_rq), p, stats); } static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { + struct sched_statistics *stats; struct task_struct *tsk = NULL; - u64 sleep_start, block_start; if (!schedstat_enabled()) return; - sleep_start = schedstat_val(se->statistics.sleep_start); - block_start = schedstat_val(se->statistics.block_start); + stats = __schedstats_from_se(se); if (entity_is_task(se)) tsk = task_of(se); - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - __schedstat_set(se->statistics.sleep_max, delta); - - __schedstat_set(se->statistics.sleep_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.block_max))) - __schedstat_set(se->statistics.block_max, delta); - - __schedstat_set(se->statistics.block_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - if (tsk->in_iowait) { - __schedstat_add(se->statistics.iowait_sum, delta); - __schedstat_inc(se->statistics.iowait_count); - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } + __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats); } /* * Task is being enqueued - update stats: */ static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { if (!schedstat_enabled()) return; @@ -1001,14 +998,14 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * a dequeue/enqueue event is a NOP) */ if (se != cfs_rq->curr) - update_stats_wait_start(cfs_rq, se); + update_stats_wait_start_fair(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) - update_stats_enqueue_sleeper(cfs_rq, se); + update_stats_enqueue_sleeper_fair(cfs_rq, se); } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { if (!schedstat_enabled()) @@ -1019,16 +1016,19 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * waiting task: */ if (se != cfs_rq->curr) - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end_fair(cfs_rq, se); if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { struct task_struct *tsk = task_of(se); + unsigned int state; - if (tsk->state & TASK_INTERRUPTIBLE) - __schedstat_set(se->statistics.sleep_start, + /* XXX racy against TTWU */ + state = READ_ONCE(tsk->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(tsk->stats.sleep_start, rq_clock(rq_of(cfs_rq))); - if (tsk->state & TASK_UNINTERRUPTIBLE) - __schedstat_set(se->statistics.block_start, + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(tsk->stats.block_start, rq_clock(rq_of(cfs_rq))); } } @@ -1049,6 +1049,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#ifdef CONFIG_NUMA +#define NUMA_IMBALANCE_MIN 2 + +static inline long +adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr) +{ + /* + * Allow a NUMA imbalance if busy CPUs is less than the maximum + * threshold. Above this threshold, individual tasks may be contending + * for both memory bandwidth and any shared HT resources. This is an + * approximation as the number of running tasks may not be related to + * the number of busy CPUs due to sched_setaffinity. + */ + if (dst_running > imb_numa_nr) + return imbalance; + + /* + * Allow a small imbalance based on a simple pair of communicating + * tasks that remain local when the destination is lightly loaded. + */ + if (imbalance <= NUMA_IMBALANCE_MIN) + return 0; + + return imbalance; +} +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_NUMA_BALANCING /* * Approximate time to scan a full NUMA task in ms. The task scan period is @@ -1064,6 +1091,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +/* The page with hint page fault latency < threshold in ms is considered hot */ +unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; + +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + struct numa_group { refcount_t refcount; @@ -1076,12 +1109,13 @@ struct numa_group { unsigned long total_faults; unsigned long max_faults_cpu; /* + * faults[] array is split into two regions: faults_mem and faults_cpu. + * * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted * more by CPU use than by memory faults. */ - unsigned long *faults_cpu; - unsigned long faults[0]; + unsigned long faults[]; }; /* @@ -1091,7 +1125,7 @@ struct numa_group { static struct numa_group *deref_task_numa_group(struct task_struct *p) { return rcu_dereference_check(p->numa_group, p == current || - (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); + (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu))); } static struct numa_group *deref_curr_numa_group(struct task_struct *p) @@ -1121,7 +1155,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) return rss / nr_scan_pages; } -/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ +/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ #define MAX_SCAN_WINDOW 2560 static unsigned int task_scan_min(struct task_struct *p) @@ -1254,8 +1288,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { - return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + - group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; + return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] + + group->faults[task_faults_idx(NUMA_CPU, nid, 1)]; } static inline unsigned long group_faults_priv(struct numa_group *ng) @@ -1296,10 +1330,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng) /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, - int maxdist, bool task) + int lim_dist, bool task) { unsigned long score = 0; - int node; + int node, max_dist; /* * All nodes are directly connected, and the same distance @@ -1308,6 +1342,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, if (sched_numa_topology_type == NUMA_DIRECT) return 0; + /* sched_max_numa_distance may be changed in parallel. */ + max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, * which should be ok given the number of nodes rarely exceeds 8. @@ -1320,7 +1356,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * The furthest away nodes in the system are not interesting * for placement; nid was already counted. */ - if (dist == sched_max_numa_distance || node == nid) + if (dist >= max_dist || node == nid) continue; /* @@ -1330,8 +1366,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * "hoplimit", only nodes closer by than "hoplimit" are part * of each group. Skip other nodes. */ - if (sched_numa_topology_type == NUMA_BACKPLANE && - dist >= maxdist) + if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) continue; /* Add up the faults from nearby nodes. */ @@ -1349,8 +1384,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * This seems to result in good task placement. */ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { - faults *= (sched_max_numa_distance - dist); - faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + faults *= (max_dist - dist); + faults /= (max_dist - LOCAL_DISTANCE); } score += faults; @@ -1404,6 +1439,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, return 1000 * faults / total_faults; } +/* + * If memory tiering mode is enabled, cpupid of slow memory page is + * used to record scan time instead of CPU and PID. When tiering mode + * is disabled at run time, the scan time (in cpupid) will be + * interpreted as CPU and PID. So CPU needs to be checked to avoid to + * access out of array bound. + */ +static inline bool cpupid_valid(int cpupid) +{ + return cpupid_to_cpu(cpupid) < nr_cpu_ids; +} + +/* + * For memory tiering mode, if there are enough free pages (more than + * enough watermark defined here) in fast memory node, to take full + * advantage of fast memory capacity, all recently accessed slow + * memory pages will be migrated to fast memory node without + * considering hot threshold. + */ +static bool pgdat_free_space_enough(struct pglist_data *pgdat) +{ + int z; + unsigned long enough_wmark; + + enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, + pgdat->node_present_pages >> 4); + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + if (zone_watermark_ok(zone, 0, + wmark_pages(zone, WMARK_PROMO) + enough_wmark, + ZONE_MOVABLE, 0)) + return true; + } + return false; +} + +/* + * For memory tiering mode, when page tables are scanned, the scan + * time will be recorded in struct page in addition to make page + * PROT_NONE for slow memory page. So when the page is accessed, in + * hint page fault handler, the hint page fault latency is calculated + * via, + * + * hint page fault latency = hint page fault time - scan time + * + * The smaller the hint page fault latency, the higher the possibility + * for the page to be hot. + */ +static int numa_hint_fault_latency(struct page *page) +{ + int last_time, time; + + time = jiffies_to_msecs(jiffies); + last_time = xchg_page_access_time(page, time); + + return (time - last_time) & PAGE_ACCESS_TIME_MASK; +} + +/* + * For memory tiering mode, too high promotion/demotion throughput may + * hurt application latency. So we provide a mechanism to rate limit + * the number of pages that are tried to be promoted. + */ +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, + unsigned long rate_limit, int nr) +{ + unsigned long nr_cand; + unsigned int now, start; + + now = jiffies_to_msecs(jiffies); + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + start = pgdat->nbp_rl_start; + if (now - start > MSEC_PER_SEC && + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) + pgdat->nbp_rl_nr_cand = nr_cand; + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) + return true; + return false; +} + +#define NUMA_MIGRATION_ADJUST_STEPS 16 + +static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, + unsigned long rate_limit, + unsigned int ref_th) +{ + unsigned int now, start, th_period, unit_th, th; + unsigned long nr_cand, ref_cand, diff_cand; + + now = jiffies_to_msecs(jiffies); + th_period = sysctl_numa_balancing_scan_period_max; + start = pgdat->nbp_th_start; + if (now - start > th_period && + cmpxchg(&pgdat->nbp_th_start, start, now) == start) { + ref_cand = rate_limit * + sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC; + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + diff_cand = nr_cand - pgdat->nbp_th_nr_cand; + unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS; + th = pgdat->nbp_threshold ? : ref_th; + if (diff_cand > ref_cand * 11 / 10) + th = max(th - unit_th, unit_th); + else if (diff_cand < ref_cand * 9 / 10) + th = min(th + unit_th, ref_th * 2); + pgdat->nbp_th_nr_cand = nr_cand; + pgdat->nbp_threshold = th; + } +} + bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { @@ -1411,9 +1560,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; + /* + * The pages in slow memory node should be migrated according + * to hot/cold instead of private/shared. + */ + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !node_is_toptier(src_nid)) { + struct pglist_data *pgdat; + unsigned long rate_limit; + unsigned int latency, th, def_th; + + pgdat = NODE_DATA(dst_nid); + if (pgdat_free_space_enough(pgdat)) { + /* workload changed, reset hot threshold */ + pgdat->nbp_threshold = 0; + return true; + } + + def_th = sysctl_numa_balancing_hot_threshold; + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ + (20 - PAGE_SHIFT); + numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); + + th = pgdat->nbp_threshold ? : def_th; + latency = numa_hint_fault_latency(page); + if (latency >= th) + return false; + + return !numa_promotion_rate_limit(pgdat, rate_limit, + thp_nr_pages(page)); + } + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) + return false; + /* * Allow first faults or private faults to migrate immediately early in * the lifetime of a task. The magic number 4 is based on waiting for @@ -1473,36 +1657,52 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); - -static unsigned long cpu_runnable_load(struct rq *rq) -{ - return cfs_rq_runnable_load_avg(&rq->cfs); -} +/* + * 'numa_type' describes the node at the moment of load balancing. + */ +enum numa_type { + /* The node has spare capacity that can be used to run more tasks. */ + node_has_spare = 0, + /* + * The node is fully used and the tasks don't compete for more CPU + * cycles. Nevertheless, some tasks might wait before running. + */ + node_fully_busy, + /* + * The node is overloaded and can't provide expected CPU cycles to all + * tasks. + */ + node_overloaded +}; /* Cached statistics for all CPUs within a node */ struct numa_stats { unsigned long load; - + unsigned long runnable; + unsigned long util; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; + unsigned int nr_running; + unsigned int weight; + enum numa_type node_type; + int idle_cpu; }; -/* - * XXX borrowed from update_sg_lb_stats - */ -static void update_numa_stats(struct numa_stats *ns, int nid) +static inline bool is_core_idle(int cpu) { - int cpu; +#ifdef CONFIG_SCHED_SMT + int sibling; - memset(ns, 0, sizeof(*ns)); - for_each_cpu(cpu, cpumask_of_node(nid)) { - struct rq *rq = cpu_rq(cpu); + for_each_cpu(sibling, cpu_smt_mask(cpu)) { + if (cpu == sibling) + continue; - ns->load += cpu_runnable_load(rq); - ns->compute_capacity += capacity_of(cpu); + if (!idle_cpu(sibling)) + return false; } +#endif + return true; } struct task_numa_env { @@ -1510,6 +1710,7 @@ struct task_numa_env { int src_cpu, src_nid; int dst_cpu, dst_nid; + int imb_numa_nr; struct numa_stats src_stats, dst_stats; @@ -1521,20 +1722,130 @@ struct task_numa_env { int best_cpu; }; +static unsigned long cpu_load(struct rq *rq); +static unsigned long cpu_runnable(struct rq *rq); + +static inline enum +numa_type numa_classify(unsigned int imbalance_pct, + struct numa_stats *ns) +{ + if ((ns->nr_running > ns->weight) && + (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || + ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) + return node_overloaded; + + if ((ns->nr_running < ns->weight) || + (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) && + ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100)))) + return node_has_spare; + + return node_fully_busy; +} + +#ifdef CONFIG_SCHED_SMT +/* Forward declarations of select_idle_sibling helpers */ +static inline bool test_idle_cores(int cpu); +static inline int numa_idle_core(int idle_core, int cpu) +{ + if (!static_branch_likely(&sched_smt_present) || + idle_core >= 0 || !test_idle_cores(cpu)) + return idle_core; + + /* + * Prefer cores instead of packing HT siblings + * and triggering future load balancing. + */ + if (is_core_idle(cpu)) + idle_core = cpu; + + return idle_core; +} +#else +static inline int numa_idle_core(int idle_core, int cpu) +{ + return idle_core; +} +#endif + +/* + * Gather all necessary information to make NUMA balancing placement + * decisions that are compatible with standard load balancer. This + * borrows code and logic from update_sg_lb_stats but sharing a + * common implementation is impractical. + */ +static void update_numa_stats(struct task_numa_env *env, + struct numa_stats *ns, int nid, + bool find_idle) +{ + int cpu, idle_core = -1; + + memset(ns, 0, sizeof(*ns)); + ns->idle_cpu = -1; + + rcu_read_lock(); + for_each_cpu(cpu, cpumask_of_node(nid)) { + struct rq *rq = cpu_rq(cpu); + + ns->load += cpu_load(rq); + ns->runnable += cpu_runnable(rq); + ns->util += cpu_util_cfs(cpu); + ns->nr_running += rq->cfs.h_nr_running; + ns->compute_capacity += capacity_of(cpu); + + if (find_idle && !rq->nr_running && idle_cpu(cpu)) { + if (READ_ONCE(rq->numa_migrate_on) || + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) + continue; + + if (ns->idle_cpu == -1) + ns->idle_cpu = cpu; + + idle_core = numa_idle_core(idle_core, cpu); + } + } + rcu_read_unlock(); + + ns->weight = cpumask_weight(cpumask_of_node(nid)); + + ns->node_type = numa_classify(env->imbalance_pct, ns); + + if (idle_core >= 0) + ns->idle_cpu = idle_core; +} + static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { struct rq *rq = cpu_rq(env->dst_cpu); - /* Bail out if run-queue part of active NUMA balance. */ - if (xchg(&rq->numa_migrate_on, 1)) + /* Check if run-queue part of active NUMA balance. */ + if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { + int cpu; + int start = env->dst_cpu; + + /* Find alternative idle CPU. */ + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { + if (cpu == env->best_cpu || !idle_cpu(cpu) || + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { + continue; + } + + env->dst_cpu = cpu; + rq = cpu_rq(env->dst_cpu); + if (!xchg(&rq->numa_migrate_on, 1)) + goto assign; + } + + /* Failed to find an alternative idle CPU */ return; + } +assign: /* * Clear previous best_cpu/rq numa-migrate flag, since task now * found a better CPU to move/swap. */ - if (env->best_cpu != -1) { + if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) { rq = cpu_rq(env->best_cpu); WRITE_ONCE(rq->numa_migrate_on, 0); } @@ -1590,7 +1901,7 @@ static bool load_too_imbalanced(long src_load, long dst_load, * into account that it might be best if task running on the dst_cpu should * be exchanged with the source task */ -static void task_numa_compare(struct task_numa_env *env, +static bool task_numa_compare(struct task_numa_env *env, long taskimp, long groupimp, bool maymove) { struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); @@ -1601,9 +1912,10 @@ static void task_numa_compare(struct task_numa_env *env, int dist = env->dist; long moveimp = imp; long load; + bool stopsearch = false; if (READ_ONCE(dst_rq->numa_migrate_on)) - return; + return false; rcu_read_lock(); cur = rcu_dereference(dst_rq->curr); @@ -1614,8 +1926,10 @@ static void task_numa_compare(struct task_numa_env *env, * Because we have preemption enabled we can get migrated around and * end try selecting ourselves (current == env->p) as a swap candidate. */ - if (cur == env->p) + if (cur == env->p) { + stopsearch = true; goto unlock; + } if (!cur) { if (maymove && moveimp >= env->best_imp) @@ -1624,23 +1938,41 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; } + /* Skip this swap candidate if cannot move to the source cpu. */ + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) + goto unlock; + + /* + * Skip this swap candidate if it is not moving to its preferred + * node and the best task is. + */ + if (env->best_task && + env->best_task->numa_preferred_nid == env->src_nid && + cur->numa_preferred_nid != env->src_nid) { + goto unlock; + } + /* * "imp" is the fault differential for the source task between the * source and destination node. Calculate the total differential for * the source task and potential destination task. The more negative * the value is, the more remote accesses that would be expected to * be incurred if the tasks were swapped. - */ - /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) - goto unlock; - - /* + * * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ cur_ng = rcu_dereference(cur->numa_group); if (cur_ng == p_ng) { + /* + * Do not swap within a group or between tasks that have + * no group if there is spare capacity. Swapping does + * not address the load imbalance and helps one task at + * the cost of punishing another. + */ + if (env->dst_stats.node_type == node_has_spare) + goto unlock; + imp = taskimp + task_weight(cur, env->src_nid, dist) - task_weight(cur, env->dst_nid, dist); /* @@ -1662,6 +1994,19 @@ static void task_numa_compare(struct task_numa_env *env, task_weight(cur, env->dst_nid, dist); } + /* Discourage picking a task already on its preferred node */ + if (cur->numa_preferred_nid == env->dst_nid) + imp -= imp / 16; + + /* + * Encourage picking a task that moves to its preferred node. + * This potentially makes imp larger than it's maximum of + * 1998 (see SMALLIMP and task_weight for why) but in this + * case, it does not matter. + */ + if (cur->numa_preferred_nid == env->src_nid) + imp += imp / 8; + if (maymove && moveimp > imp && moveimp > env->best_imp) { imp = moveimp; cur = NULL; @@ -1669,6 +2014,15 @@ static void task_numa_compare(struct task_numa_env *env, } /* + * Prefer swapping with a task moving to its preferred node over a + * task that is not. + */ + if (env->best_task && cur->numa_preferred_nid == env->src_nid && + env->best_task->numa_preferred_nid != env->src_nid) { + goto assign; + } + + /* * If the NUMA importance is less than SMALLIMP, * task migration might only result in ping pong * of tasks and also hurt performance due to cache @@ -1691,42 +2045,96 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; assign: - /* - * One idle CPU per node is evaluated for a task numa move. - * Call select_idle_sibling to maybe find a better one. - */ + /* Evaluate an idle CPU for a task numa move. */ if (!cur) { + int cpu = env->dst_stats.idle_cpu; + + /* Nothing cached so current CPU went idle since the search. */ + if (cpu < 0) + cpu = env->dst_cpu; + /* - * select_idle_siblings() uses an per-CPU cpumask that - * can be used from IRQ context. + * If the CPU is no longer truly idle and the previous best CPU + * is, keep using it. */ - local_irq_disable(); - env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, - env->dst_cpu); - local_irq_enable(); + if (!idle_cpu(cpu) && env->best_cpu >= 0 && + idle_cpu(env->best_cpu)) { + cpu = env->best_cpu; + } + + env->dst_cpu = cpu; } task_numa_assign(env, cur, imp); + + /* + * If a move to idle is allowed because there is capacity or load + * balance improves then stop the search. While a better swap + * candidate may exist, a search is not free. + */ + if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu)) + stopsearch = true; + + /* + * If a swap candidate must be identified and the current best task + * moves its preferred node then stop the search. + */ + if (!maymove && env->best_task && + env->best_task->numa_preferred_nid == env->src_nid) { + stopsearch = true; + } unlock: rcu_read_unlock(); + + return stopsearch; } static void task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp) { - long src_load, dst_load, load; bool maymove = false; int cpu; - load = task_h_load(env->p); - dst_load = env->dst_stats.load + load; - src_load = env->src_stats.load - load; - /* - * If the improvement from just moving env->p direction is better - * than swapping tasks around, check if a move is possible. + * If dst node has spare capacity, then check if there is an + * imbalance that would be overruled by the load balancer. */ - maymove = !load_too_imbalanced(src_load, dst_load, env); + if (env->dst_stats.node_type == node_has_spare) { + unsigned int imbalance; + int src_running, dst_running; + + /* + * Would movement cause an imbalance? Note that if src has + * more running tasks that the imbalance is ignored as the + * move improves the imbalance from the perspective of the + * CPU load balancer. + * */ + src_running = env->src_stats.nr_running - 1; + dst_running = env->dst_stats.nr_running + 1; + imbalance = max(0, dst_running - src_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running, + env->imb_numa_nr); + + /* Use idle CPU if there is no imbalance */ + if (!imbalance) { + maymove = true; + if (env->dst_stats.idle_cpu >= 0) { + env->dst_cpu = env->dst_stats.idle_cpu; + task_numa_assign(env, NULL, 0); + return; + } + } + } else { + long src_load, dst_load, load; + /* + * If the improvement from just moving env->p direction is better + * than swapping tasks around, check if a move is possible. + */ + load = task_h_load(env->p); + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + maymove = !load_too_imbalanced(src_load, dst_load, env); + } for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ @@ -1734,7 +2142,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, continue; env->dst_cpu = cpu; - task_numa_compare(env, taskimp, groupimp, maymove); + if (task_numa_compare(env, taskimp, groupimp, maymove)) + break; } } @@ -1769,8 +2178,10 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) + if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + env.imb_numa_nr = sd->imb_numa_nr; + } rcu_read_unlock(); /* @@ -1788,10 +2199,10 @@ static int task_numa_migrate(struct task_struct *p) dist = env.dist = node_distance(env.src_nid, env.dst_nid); taskweight = task_weight(p, env.src_nid, dist); groupweight = group_weight(p, env.src_nid, dist); - update_numa_stats(&env.src_stats, env.src_nid); + update_numa_stats(&env, &env.src_stats, env.src_nid, false); taskimp = task_weight(p, env.dst_nid, dist) - taskweight; groupimp = group_weight(p, env.dst_nid, dist) - groupweight; - update_numa_stats(&env.dst_stats, env.dst_nid); + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); /* Try to find a spot on the preferred nid. */ task_numa_find_cpu(&env, taskimp, groupimp); @@ -1805,7 +2216,7 @@ static int task_numa_migrate(struct task_struct *p) */ ng = deref_curr_numa_group(p); if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1824,7 +2235,7 @@ static int task_numa_migrate(struct task_struct *p) env.dist = dist; env.dst_nid = nid; - update_numa_stats(&env.dst_stats, env.dst_nid); + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); task_numa_find_cpu(&env, taskimp, groupimp); } } @@ -1848,15 +2259,17 @@ static int task_numa_migrate(struct task_struct *p) } /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) + if (env.best_cpu == -1) { + trace_sched_stick_numa(p, env.src_cpu, NULL, -1); return -EAGAIN; + } best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) - trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); + trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu); return ret; } @@ -1864,7 +2277,7 @@ static int task_numa_migrate(struct task_struct *p) WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) - trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); + trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu); put_task_struct(env.best_task); return ret; } @@ -1891,7 +2304,7 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find out how many nodes on the workload is actively running on. Do this by + * Find out how many nodes the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. @@ -1901,13 +2314,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group) unsigned long faults, max_faults = 0; int nid, active_nodes = 0; - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults > max_faults) max_faults = faults; } - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults * ACTIVE_NODE_FRACTION > max_faults) active_nodes++; @@ -1945,7 +2358,7 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is - * completely idle or all activity is areas that are not of interest + * completely idle or all activity is in areas that are not of interest * to automatic numa balancing. Related to that, if there were failed * migration then it implies we are migrating too quickly or the local * node is overloaded. In either case, scan slower @@ -2061,7 +2474,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) dist = sched_max_numa_distance; - for_each_online_node(node) { + for_each_node_state(node, N_CPU) { score = group_weight(p, node, dist); if (score > max_score) { max_score = score; @@ -2080,7 +2493,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) * inside the highest scoring group of nodes. The nodemask tricks * keep the complexity of the search down. */ - nodes = node_online_map; + nodes = node_states[N_CPU]; for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { unsigned long max_faults = 0; nodemask_t max_group = NODE_MASK_NONE; @@ -2202,7 +2615,7 @@ static void task_numa_placement(struct task_struct *p) * is at the beginning of the numa_faults array. */ ng->faults[mem_idx] += diff; - ng->faults_cpu[mem_idx] += f_diff; + ng->faults[cpu_idx] += f_diff; ng->total_faults += diff; group_faults += ng->faults[mem_idx]; } @@ -2219,6 +2632,21 @@ static void task_numa_placement(struct task_struct *p) } } + /* Cannot migrate task to CPU-less node */ + if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) { + int near_nid = max_nid; + int distance, near_distance = INT_MAX; + + for_each_node_state(nid, N_CPU) { + distance = node_distance(max_nid, nid); + if (distance < near_distance) { + near_nid = nid; + near_distance = distance; + } + } + max_nid = near_nid; + } + if (ng) { numa_group_count_active_nodes(ng); spin_unlock_irq(group_lock); @@ -2256,7 +2684,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!deref_curr_numa_group(p))) { unsigned int size = sizeof(struct numa_group) + - 4*nr_node_ids*sizeof(unsigned long); + NR_NUMA_HINT_FAULT_STATS * + nr_node_ids * sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) @@ -2267,9 +2696,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; - /* Second half of the array tracks nids where faults happen */ - grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * - nr_node_ids; for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2326,7 +2752,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!join) return; - BUG_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled()); double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { @@ -2353,7 +2779,7 @@ no_join: } /* - * Get rid of NUMA staticstics associated with a task (either current or dead). + * Get rid of NUMA statistics associated with a task (either current or dead). * If @final is set, the task is dead and has reached refcount zero, so we can * safely free all relevant data structures. Otherwise, there might be * concurrent reads from places like load balancing and procfs, and we should @@ -2411,6 +2837,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; + /* + * NUMA faults statistics are unnecessary for the slow memory + * node for memory tiering mode. + */ + if (!node_is_toptier(mem_node) && + (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING || + !cpupid_valid(last_cpupid))) + return; + /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * @@ -2491,6 +2926,7 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; + MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2545,15 +2981,18 @@ static void task_numa_work(struct callback_head *work) return; - if (!down_read_trylock(&mm->mmap_sem)) + if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, start); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) { reset_ptenuma_scan(p); start = 0; - vma = mm->mmap; + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); } - for (; vma; vma = vma->vm_next) { + + for (; vma; vma = mas_find(&mas, ULONG_MAX)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; @@ -2573,7 +3012,7 @@ static void task_numa_work(struct callback_head *work) * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + if (!vma_is_accessible(vma)) continue; do { @@ -2613,7 +3052,7 @@ out: mm->numa_scan_offset = start; else reset_ptenuma_scan(p); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* * Make sure tasks use at least 32x as much time to run other code @@ -2642,9 +3081,12 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) p->node_stamp = 0; p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_migrate_retry = 0; /* Protect against double add, see task_tick_numa and task_numa_work */ p->numa_work.next = &p->numa_work; p->numa_faults = NULL; + p->numa_pages_migrated = 0; + p->total_numa_faults = 0; RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; @@ -2682,7 +3124,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) /* * We don't care about NUMA placement if we don't have memory. */ - if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* @@ -2700,7 +3142,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) - task_work_add(curr, work, true); + task_work_add(curr, work, TWA_RESUME); } } @@ -2770,6 +3212,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #endif cfs_rq->nr_running++; + if (se_is_idle(se)) + cfs_rq->idle_nr_running++; } static void @@ -2783,6 +3227,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #endif cfs_rq->nr_running--; + if (se_is_idle(se)) + cfs_rq->idle_nr_running--; } /* @@ -2835,25 +3281,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_SMP static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_weight += se->runnable_weight; - - cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; - cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; -} - -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_weight -= se->runnable_weight; - - sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); - sub_positive(&cfs_rq->avg.runnable_load_sum, - se_runnable(se) * se->avg.runnable_load_sum); -} - -static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { cfs_rq->avg.load_avg += se->avg.load_avg; @@ -2865,48 +3292,42 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } #else static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight, unsigned long runnable) + unsigned long weight) { if (se->on_rq) { /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); - dequeue_runnable_load_avg(cfs_rq, se); + update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); - se->runnable_weight = runnable; update_load_set(&se->load, weight); #ifdef CONFIG_SMP do { - u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); - se->avg.runnable_load_avg = - div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); } while (0); #endif enqueue_load_avg(cfs_rq, se); - if (se->on_rq) { - account_entity_enqueue(cfs_rq, se); - enqueue_runnable_load_avg(cfs_rq, se); - } + if (se->on_rq) + update_load_add(&cfs_rq->load, se->load.weight); + } void reweight_task(struct task_struct *p, int prio) @@ -2916,10 +3337,12 @@ void reweight_task(struct task_struct *p, int prio) struct load_weight *load = &se->load; unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight, weight); + reweight_entity(cfs_rq, se, weight); load->inv_weight = sched_prio_to_wmult[prio]; } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP /* @@ -2931,7 +3354,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (1) - * \Sum grq->load.weight + * \Sum grq->load.weight * * Now, because computing that sum is prohibitively expensive to compute (been * there, done that) we approximate it with this average stuff. The average @@ -2945,7 +3368,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->avg.load_avg * ge->load.weight = ------------------------------ (3) - * tg->load_avg + * tg->load_avg * * Where: tg->load_avg ~= \Sum grq->avg.load_avg * @@ -2961,7 +3384,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- = tg->weight (4) - * grp->load.weight + * grp->load.weight * * That is, the sum collapses because all other CPUs are idle; the UP scenario. * @@ -2980,7 +3403,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (6) - * tg_load_avg' + * tg_load_avg' * * Where: * @@ -3028,54 +3451,8 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } - -/* - * This calculates the effective runnable weight for a group entity based on - * the group entity weight calculated above. - * - * Because of the above approximation (2), our group entity weight is - * an load_avg based ratio (3). This means that it includes blocked load and - * does not represent the runnable weight. - * - * Approximate the group entity's runnable weight per ratio from the group - * runqueue: - * - * grq->avg.runnable_load_avg - * ge->runnable_weight = ge->load.weight * -------------------------- (7) - * grq->avg.load_avg - * - * However, analogous to above, since the avg numbers are slow, this leads to - * transients in the from-idle case. Instead we use: - * - * ge->runnable_weight = ge->load.weight * - * - * max(grq->avg.runnable_load_avg, grq->runnable_weight) - * ----------------------------------------------------- (8) - * max(grq->avg.load_avg, grq->load.weight) - * - * Where these max() serve both to use the 'instant' values to fix the slow - * from-idle and avoid the /0 on to-idle, similar to (6). - */ -static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) -{ - long runnable, load_avg; - - load_avg = max(cfs_rq->avg.load_avg, - scale_load_down(cfs_rq->load.weight)); - - runnable = max(cfs_rq->avg.runnable_load_avg, - scale_load_down(cfs_rq->runnable_weight)); - - runnable *= shares; - if (load_avg) - runnable /= load_avg; - - return clamp_t(long, runnable, MIN_SHARES, shares); -} #endif /* CONFIG_SMP */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); - /* * Recomputes the group entity based on the current state of its group * runqueue. @@ -3083,7 +3460,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); static void update_cfs_group(struct sched_entity *se) { struct cfs_rq *gcfs_rq = group_cfs_rq(se); - long shares, runnable; + long shares; if (!gcfs_rq) return; @@ -3092,16 +3469,15 @@ static void update_cfs_group(struct sched_entity *se) return; #ifndef CONFIG_SMP - runnable = shares = READ_ONCE(gcfs_rq->tg->shares); + shares = READ_ONCE(gcfs_rq->tg->shares); if (likely(se->load.weight == shares)) return; #else shares = calc_group_shares(gcfs_rq); - runnable = calc_group_runnable(gcfs_rq, shares); #endif - reweight_entity(cfs_rq_of(se), se, shares, runnable); + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -3127,18 +3503,85 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) * As is, the util number is not freq-invariant (we'd have to * implement arch_scale_freq_capacity() for that). * - * See cpu_util(). + * See cpu_util_cfs(). */ cpufreq_update_util(rq, flags); } } #ifdef CONFIG_SMP +static inline bool load_avg_is_decayed(struct sched_avg *sa) +{ + if (sa->load_sum) + return false; + + if (sa->util_sum) + return false; + + if (sa->runnable_sum) + return false; + + /* + * _avg must be null when _sum are null because _avg = _sum / divider + * Make sure that rounding and/or propagation of PELT values never + * break this. + */ + SCHED_WARN_ON(sa->load_avg || + sa->util_avg || + sa->runnable_avg); + + return true; +} + +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return u64_u32_load_copy(cfs_rq->avg.last_update_time, + cfs_rq->last_update_time_copy); +} #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list + * immediately before a parent cfs_rq, and cfs_rqs are removed from the list + * bottom-up, we only have to test whether the cfs_rq before us on the list + * is our child. + * If cfs_rq is not on the list, test whether a child needs its to be added to + * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details). + */ +static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) +{ + struct cfs_rq *prev_cfs_rq; + struct list_head *prev; + + if (cfs_rq->on_list) { + prev = cfs_rq->leaf_cfs_rq_list.prev; + } else { + struct rq *rq = rq_of(cfs_rq); + + prev = rq->tmp_alone_branch; + } + + prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); + + return (prev_cfs_rq->tg->parent == cfs_rq->tg); +} + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (!load_avg_is_decayed(&cfs_rq->avg)) + return false; + + if (child_cfs_rq_on_list(cfs_rq)) + return false; + + return true; +} + /** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed - * @force: update regardless of how small the difference * * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. * However, because tg->load_avg is a global value there are performance @@ -3150,7 +3593,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) * * Updating tg's load_avg is necessary before update_cfs_share(). */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; @@ -3160,7 +3603,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) if (cfs_rq->tg == &root_task_group) return; - if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } @@ -3190,32 +3633,13 @@ void set_task_rq_fair(struct sched_entity *se, if (!(se->avg.last_update_time && prev)) return; -#ifndef CONFIG_64BIT - { - u64 p_last_update_time_copy; - u64 n_last_update_time_copy; - - do { - p_last_update_time_copy = prev->load_last_update_time_copy; - n_last_update_time_copy = next->load_last_update_time_copy; + p_last_update_time = cfs_rq_last_update_time(prev); + n_last_update_time = cfs_rq_last_update_time(next); - smp_rmb(); - - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; - - } while (p_last_update_time != p_last_update_time_copy || - n_last_update_time != n_last_update_time_copy); - } -#else - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; -#endif __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time = n_last_update_time; } - /* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant @@ -3226,11 +3650,11 @@ void set_task_rq_fair(struct sched_entity *se, * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * - * Per the above update_tg_cfs_util() is trivial and simply copies the running - * sum over (but still wrong, because the group entity and group rq do not have - * their PELT windows aligned). + * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial + * and simply copies the running/runnable sum over (but still wrong, because + * the group entity and group rq do not have their PELT windows aligned). * - * However, update_tg_cfs_runnable() is more complex. So we have: + * However, update_tg_cfs_load() is more complex. So we have: * * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) * @@ -3283,60 +3707,102 @@ void set_task_rq_fair(struct sched_entity *se, * XXX: only do this for the part of runnable > running ? * */ - static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. */ + divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + new_sum = se->avg.util_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.util_sum; + se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; + add_positive(&cfs_rq->avg.util_avg, delta_avg); + add_positive(&cfs_rq->avg.util_sum, delta_sum); + + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { + long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 new_sum, divider; + + /* Nothing to update */ + if (!delta_avg) + return; + + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); + + /* Set new sched_entity's runnable */ + se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; + new_sum = se->avg.runnable_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.runnable_sum; + se->avg.runnable_sum = new_sum; + + /* Update parent cfs_rq runnable */ + add_positive(&cfs_rq->avg.runnable_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); +} + +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) +{ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; - unsigned long runnable_load_avg, load_avg; - u64 runnable_load_sum, load_sum = 0; + unsigned long load_avg; + u64 load_sum = 0; s64 delta_sum; + u32 divider; if (!runnable_sum) return; gcfs_rq->prop_runnable_sum = 0; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); + if (runnable_sum >= 0) { /* * Add runnable; clip at LOAD_AVG_MAX. Reflects that until * the CPU is saturated running == runnable. */ runnable_sum += se->avg.load_sum; - runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); + runnable_sum = min_t(long, runnable_sum, divider); } else { /* * Estimate the new unweighted runnable_sum of the gcfs_rq by * assuming all tasks are equally runnable. */ if (scale_load_down(gcfs_rq->load.weight)) { - load_sum = div_s64(gcfs_rq->avg.load_sum, + load_sum = div_u64(gcfs_rq->avg.load_sum, scale_load_down(gcfs_rq->load.weight)); } @@ -3353,30 +3819,22 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); - load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, LOAD_AVG_MAX); + load_sum = se_weight(se) * runnable_sum; + load_avg = div_u64(load_sum, divider); - delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; delta_avg = load_avg - se->avg.load_avg; + if (!delta_avg) + return; + + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; add_positive(&cfs_rq->avg.load_avg, delta_avg); add_positive(&cfs_rq->avg.load_sum, delta_sum); - - runnable_load_sum = (s64)se_runnable(se) * runnable_sum; - runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - - if (se->on_rq) { - delta_sum = runnable_load_sum - - se_weight(se) * se->avg.runnable_load_sum; - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); - add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); - } - - se->avg.runnable_load_sum = runnable_sum; - se->avg.runnable_load_avg = runnable_load_avg; + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3405,6 +3863,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) update_tg_cfs_util(cfs_rq, se, gcfs_rq); update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); + update_tg_cfs_load(cfs_rq, se, gcfs_rq); trace_pelt_cfs_tp(cfs_rq); trace_pelt_se_tp(se); @@ -3444,7 +3903,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} static inline int propagate_entity_load_avg(struct sched_entity *se) { @@ -3455,18 +3914,100 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_NO_HZ_COMMON +static inline void migrate_se_pelt_lag(struct sched_entity *se) +{ + u64 throttled = 0, now, lut; + struct cfs_rq *cfs_rq; + struct rq *rq; + bool is_idle; + + if (load_avg_is_decayed(&se->avg)) + return; + + cfs_rq = cfs_rq_of(se); + rq = rq_of(cfs_rq); + + rcu_read_lock(); + is_idle = is_idle_task(rcu_dereference(rq->curr)); + rcu_read_unlock(); + + /* + * The lag estimation comes with a cost we don't want to pay all the + * time. Hence, limiting to the case where the source CPU is idle and + * we know we are at the greatest risk to have an outdated clock. + */ + if (!is_idle) + return; + + /* + * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where: + * + * last_update_time (the cfs_rq's last_update_time) + * = cfs_rq_clock_pelt()@cfs_rq_idle + * = rq_clock_pelt()@cfs_rq_idle + * - cfs->throttled_clock_pelt_time@cfs_rq_idle + * + * cfs_idle_lag (delta between rq's update and cfs_rq's update) + * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle + * + * rq_idle_lag (delta between now and rq's update) + * = sched_clock_cpu() - rq_clock()@rq_idle + * + * We can then write: + * + * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time + + * sched_clock_cpu() - rq_clock()@rq_idle + * Where: + * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle + * rq_clock()@rq_idle is rq->clock_idle + * cfs->throttled_clock_pelt_time@cfs_rq_idle + * is cfs_rq->throttled_pelt_idle + */ + +#ifdef CONFIG_CFS_BANDWIDTH + throttled = u64_u32_load(cfs_rq->throttled_pelt_idle); + /* The clock has been stopped for throttling */ + if (throttled == U64_MAX) + return; +#endif + now = u64_u32_load(rq->clock_pelt_idle); + /* + * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case + * is observed the old clock_pelt_idle value and the new clock_idle, + * which lead to an underestimation. The opposite would lead to an + * overestimation. + */ + smp_rmb(); + lut = cfs_rq_last_update_time(cfs_rq); + + now -= throttled; + if (now < lut) + /* + * cfs_rq->avg.last_update_time is more recent than our + * estimation, let's use it. + */ + now = lut; + else + now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle); + + __update_load_avg_blocked_se(now, se); +} +#else +static void migrate_se_pelt_lag(struct sched_entity *se) {} +#endif + /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_pelt() * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) - * avg. The immediate corollary is that all (fair) tasks must be attached, see - * post_init_entity_util_avg(). + * avg. The immediate corollary is that all (fair) tasks must be attached. * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed load. + * Return: true if the load decayed or we removed load. * * Since both these conditions indicate a changed cfs_rq->avg.load we should * call update_tg_load_avg() when this function returns true. @@ -3474,41 +4015,64 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; + unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; struct sched_avg *sa = &cfs_rq->avg; int decayed = 0; if (cfs_rq->removed.nr) { unsigned long r; - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); swap(cfs_rq->removed.load_avg, removed_load); - swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); + swap(cfs_rq->removed.runnable_avg, removed_runnable); cfs_rq->removed.nr = 0; raw_spin_unlock(&cfs_rq->removed.lock); r = removed_load; sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * divider); + /* See sa->util_sum below */ + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); r = removed_util; sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * divider); + /* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); - add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); + r = removed_runnable; + sub_positive(&sa->runnable_avg, r); + sub_positive(&sa->runnable_sum, r * divider); + /* See sa->util_sum above */ + sa->runnable_sum = max_t(u32, sa->runnable_sum, + sa->runnable_avg * PELT_MIN_DIVIDER); + + /* + * removed_runnable is the unweighted version of removed_load so we + * can use it to estimate removed_load_sum. + */ + add_tg_cfs_propagate(cfs_rq, + -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT); decayed = 1; } decayed |= __update_load_avg_cfs_rq(now, cfs_rq); - -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->load_last_update_time_copy = sa->last_update_time; -#endif - + u64_u32_store_copy(sa->last_update_time, + cfs_rq->last_update_time_copy, + sa->last_update_time); return decayed; } @@ -3522,7 +4086,11 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = get_pelt_divider(&cfs_rq->avg); /* * When we attach the @se to the @cfs_rq, we must align the decay @@ -3542,17 +4110,19 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ se->avg.util_sum = se->avg.util_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.runnable_load_sum = se->avg.load_sum; + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + cfs_rq->avg.runnable_avg += se->avg.runnable_avg; + cfs_rq->avg.runnable_sum += se->avg.runnable_sum; add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); @@ -3574,6 +4144,15 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -3588,6 +4167,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 #define DO_ATTACH 0x4 +#define DO_DETACH 0x8 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -3615,37 +4195,23 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * IOW we're enqueueing a task on a new CPU. */ attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); + } else if (flags & DO_DETACH) { + /* + * DO_DETACH means we're here from dequeue_entity() + * and we are migrating task out of the CPU. + */ + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq); } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); if (flags & UPDATE_TG) - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); } } -#ifndef CONFIG_64BIT -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - u64 last_update_time_copy; - u64 last_update_time; - - do { - last_update_time_copy = cfs_rq->load_last_update_time_copy; - smp_rmb(); - last_update_time = cfs_rq->avg.last_update_time; - } while (last_update_time != last_update_time_copy); - - return last_update_time; -} -#else -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.last_update_time; -} -#endif - /* * Synchronize entity load avg of dequeued entity without locking * the previous rq. @@ -3670,8 +4236,8 @@ static void remove_entity_load_avg(struct sched_entity *se) /* * tasks cannot exit without having gone through wake_up_new_task() -> - * post_init_entity_util_avg() which will have added things to the - * cfs_rq, so we can remove unconditionally. + * enqueue_task_fair() which will have added things to the cfs_rq, + * so we can remove unconditionally. */ sync_entity_load_avg(se); @@ -3680,13 +4246,13 @@ static void remove_entity_load_avg(struct sched_entity *se) ++cfs_rq->removed.nr; cfs_rq->removed.util_avg += se->avg.util_avg; cfs_rq->removed.load_avg += se->avg.load_avg; - cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ + cfs_rq->removed.runnable_avg += se->avg.runnable_avg; raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) { - return cfs_rq->avg.runnable_load_avg; + return cfs_rq->avg.runnable_avg; } static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) @@ -3694,6 +4260,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -3703,7 +4271,7 @@ static inline unsigned long _task_util_est(struct task_struct *p) { struct util_est ue = READ_ONCE(p->se.avg.util_est); - return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED); + return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)); } static inline unsigned long task_util_est(struct task_struct *p) @@ -3737,36 +4305,51 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, enqueued = cfs_rq->avg.util_est.enqueued; enqueued += _task_util_est(p); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); +} + +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); } +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) + /* * Check if a (signed) value is within a specified (unsigned) margin, * based on the observation that: * * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) * - * NOTE: this only works when value + maring < INT_MAX. + * NOTE: this only works when value + margin < INT_MAX. */ static inline bool within_margin(int value, int margin) { return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); } -static void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +static inline void util_est_update(struct cfs_rq *cfs_rq, + struct task_struct *p, + bool task_sleep) { - long last_ewma_diff; + long last_ewma_diff, last_enqueued_diff; struct util_est ue; - int cpu; if (!sched_feat(UTIL_EST)) return; - /* Update root cfs_rq's estimated utilization */ - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); - /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. @@ -3782,11 +4365,13 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) if (ue.enqueued & UTIL_AVG_UNCHANGED) return; + last_enqueued_diff = ue.enqueued; + /* * Reset EWMA on utilization increases, the moving average is used only * to smooth utilization decreases. */ - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + ue.enqueued = task_util(p); if (sched_feat(UTIL_EST_FASTUP)) { if (ue.ewma < ue.enqueued) { ue.ewma = ue.enqueued; @@ -3795,19 +4380,23 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) } /* - * Skip update of task's estimated utilization when its EWMA is + * Skip update of task's estimated utilization when its members are * already ~1% close to its last activation value. */ last_ewma_diff = ue.enqueued - ue.ewma; - if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) + last_enqueued_diff -= ue.enqueued; + if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) { + if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN)) + goto done; + return; + } /* * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - cpu = cpu_of(rq_of(cfs_rq)); - if (task_util(p) > capacity_orig_of(cpu)) + if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) return; /* @@ -3831,20 +4420,24 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma += last_ewma_diff; ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; done: + ue.enqueued |= UTIL_AVG_UNCHANGED; WRITE_ONCE(p->se.avg.util_est, ue); + + trace_sched_util_est_se_tp(&p->se); } -static inline int task_fits_capacity(struct task_struct *p, long capacity) +static inline int task_fits_capacity(struct task_struct *p, + unsigned long capacity) { return fits_capacity(uclamp_task_util(p), capacity); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return; - if (!p) { + if (!p || p->nr_cpus_allowed == 1) { rq->misfit_task_load = 0; return; } @@ -3854,14 +4447,24 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - rq->misfit_task_load = task_h_load(p); + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. + */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } #else /* CONFIG_SMP */ +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + return true; +} + #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 +#define DO_DETACH 0x0 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { @@ -3875,7 +4478,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq, struct rq_flags *rf) +static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -3884,8 +4487,11 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -3919,7 +4525,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) /* sleeps up to a single latency don't count. */ if (!initial) { - unsigned long thresh = sysctl_sched_latency; + unsigned long thresh; + + if (se_is_idle(se)) + thresh = sysctl_sched_min_granularity; + else + thresh = sysctl_sched_latency; /* * Halve their sleep time's effect, to allow @@ -3937,26 +4548,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -static inline void check_schedstat_required(void) -{ -#ifdef CONFIG_SCHEDSTATS - if (schedstat_enabled()) - return; - - /* Force schedstat enabled if a dependent tracepoint is active */ - if (trace_sched_stat_wait_enabled() || - trace_sched_stat_sleep_enabled() || - trace_sched_stat_iowait_enabled() || - trace_sched_stat_blocked_enabled() || - trace_sched_stat_runtime_enabled()) { - printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " - "stat_blocked and stat_runtime require the " - "kernel parameter schedstats=enable or " - "kernel.sched_schedstats=1\n"); - } -#endif -} - +static inline bool cfs_bandwidth_used(void); /* * MIGRATION @@ -4015,29 +4607,31 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_running of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + se_update_runnable(se); update_cfs_group(se); - enqueue_runnable_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); check_schedstat_required(); - update_stats_enqueue(cfs_rq, se, flags); + update_stats_enqueue_fair(cfs_rq, se, flags); check_spread(cfs_rq, se); if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; if (cfs_rq->nr_running == 1) { - list_add_leaf_cfs_rq(cfs_rq); check_enqueue_throttle(cfs_rq); + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); } } @@ -4091,6 +4685,11 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + int action = UPDATE_TG; + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; + /* * Update run-time statistics of the 'current'. */ @@ -4099,15 +4698,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Subtract its load from the cfs_rq->runnable_avg. + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_running of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(cfs_rq, se, UPDATE_TG); - dequeue_runnable_load_avg(cfs_rq, se); + update_load_avg(cfs_rq, se, action); + se_update_runnable(se); - update_stats_dequeue(cfs_rq, se, flags); + update_stats_dequeue_fair(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -4138,6 +4738,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); + + if (cfs_rq->nr_running == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); } /* @@ -4183,6 +4786,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { + clear_buddies(cfs_rq, se); + /* 'current' is not kept within the tree. */ if (se->on_rq) { /* @@ -4190,7 +4795,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); } @@ -4205,9 +4810,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, - max((u64)schedstat_val(se->statistics.slice_max), - se->sum_exec_runtime - se->prev_sum_exec_runtime)); + struct sched_statistics *stats; + + stats = __schedstats_from_se(se); + __schedstat_set(stats->slice_max, + max((u64)stats->slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime)); } se->prev_sum_exec_runtime = se->sum_exec_runtime; @@ -4242,7 +4850,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) * Avoid running the skip buddy, if running something else can * be done without getting too unfair. */ - if (cfs_rq->skip == se) { + if (cfs_rq->skip && cfs_rq->skip == se) { struct sched_entity *second; if (se == curr) { @@ -4257,19 +4865,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) se = second; } - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) - se = cfs_rq->last; - - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { + /* + * Someone really wants this to run. If it's not unfair, run it. + */ se = cfs_rq->next; - - clear_buddies(cfs_rq, se); + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + se = cfs_rq->last; + } return se; } @@ -4291,7 +4897,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) check_spread(cfs_rq, prev); if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); + update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ @@ -4392,8 +4998,20 @@ static inline u64 sched_cfs_bandwidth_slice(void) */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - if (cfs_b->quota != RUNTIME_INF) - cfs_b->runtime = cfs_b->quota; + s64 runtime; + + if (unlikely(cfs_b->quota == RUNTIME_INF)) + return; + + cfs_b->runtime += cfs_b->quota; + runtime = cfs_b->runtime_snap - cfs_b->runtime; + if (runtime > 0) { + cfs_b->burst_time += runtime; + cfs_b->nr_burst++; + } + + cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst); + cfs_b->runtime_snap = cfs_b->runtime; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4402,16 +5020,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) } /* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, + struct cfs_rq *cfs_rq, u64 target_runtime) { - struct task_group *tg = cfs_rq->tg; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount; + u64 min_amount, amount = 0; + + lockdep_assert_held(&cfs_b->lock); /* note: this is a positive sum as runtime_remaining <= 0 */ - min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + min_amount = target_runtime - cfs_rq->runtime_remaining; - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { @@ -4423,13 +5041,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; return cfs_rq->runtime_remaining > 0; } +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + int ret; + + raw_spin_lock(&cfs_b->lock); + ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); + raw_spin_unlock(&cfs_b->lock); + + return ret; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ @@ -4492,11 +5122,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; - /* Add cfs_rq with already running entity in the list */ - if (cfs_rq->nr_running >= 1) + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } @@ -4510,7 +5140,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); } cfs_rq->throttle_count++; @@ -4518,13 +5148,33 @@ static int tg_throttle_down(struct task_group *tg, void *data) return 0; } -static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; - bool empty; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { + /* + * We have raced with bandwidth becoming available, and if we + * actually throttled the timer might not unthrottle us for an + * entire period. We additionally needed to make sure that any + * subsequent check_cfs_rq_runtime calls agree not to throttle + * us, as we may commit to do cfs put_prev+pick_next, so we ask + * for 1ns of runtime rather than just check cfs_b. + */ + dequeue = 0; + } else { + list_add_tail_rcu(&cfs_rq->throttled_list, + &cfs_b->throttled_cfs_rq); + } + raw_spin_unlock(&cfs_b->lock); + + if (!dequeue) + return false; /* Throttle no longer required. */ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4539,43 +5189,50 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) - break; + goto done; + + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; - if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; - if (qcfs_rq->load.weight) - dequeue = 0; + if (qcfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + break; + } } - if (!se) - sub_nr_running(rq, task_delta); + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; - cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); - raw_spin_lock(&cfs_b->lock); - empty = list_empty(&cfs_b->throttled_cfs_rq); + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); - /* - * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is - * not running add to the tail so that later runqueues don't get starved. - */ - if (cfs_b->distribute_running) - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - else - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + } + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); + +done: /* - * If we're the first throttled task, make sure the bandwidth - * timer is running. + * Note: distribution will already see us throttled via the + * throttled-list. rq->lock protects completion. */ - if (empty) - start_cfs_bandwidth(cfs_b); - - raw_spin_unlock(&cfs_b->lock); + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + return true; } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -4583,7 +5240,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - int enqueue = 1; long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4600,40 +5256,72 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); - if (!cfs_rq->load.weight) - return; + if (!cfs_rq->load.weight) { + if (!cfs_rq->on_list) + return; + /* + * Nothing to run but something to decay (on_list)? + * Complete the branch. + */ + for_each_sched_entity(se) { + if (list_add_leaf_cfs_rq(cfs_rq_of(se))) + break; + } + goto unthrottle_throttle; + } task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (se->on_rq) - enqueue = 0; + break; + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); - cfs_rq = cfs_rq_of(se); - if (enqueue) - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; - if (cfs_rq_throttled(cfs_rq)) - break; + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(qcfs_rq)) + goto unthrottle_throttle; } - assert_list_leaf_cfs_rq(rq); + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (!se) - add_nr_running(rq, task_delta); + update_load_avg(qcfs_rq, se, UPDATE_TG); + se_update_runnable(se); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(qcfs_rq)) + goto unthrottle_throttle; + } + + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, task_delta); + +unthrottle_throttle: + assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) +static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { struct cfs_rq *cfs_rq; - u64 runtime; - u64 starting_runtime = remaining; + u64 runtime, remaining = 1; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -4648,10 +5336,13 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) /* By the above check, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + raw_spin_lock(&cfs_b->lock); runtime = -cfs_rq->runtime_remaining + 1; - if (runtime > remaining) - runtime = remaining; - remaining -= runtime; + if (runtime > cfs_b->runtime) + runtime = cfs_b->runtime; + cfs_b->runtime -= runtime; + remaining = cfs_b->runtime; + raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += runtime; @@ -4666,8 +5357,6 @@ next: break; } rcu_read_unlock(); - - return starting_runtime - remaining; } /* @@ -4678,7 +5367,6 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { - u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -4688,6 +5376,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u throttled = !list_empty(&cfs_b->throttled_cfs_rq); cfs_b->nr_periods += overrun; + /* Refill extra burst quota even if cfs_b->idle */ + __refill_cfs_bandwidth_runtime(cfs_b); + /* * idle depends on !throttled (for the case of a large deficit), and if * we're going inactive then everything else can be deferred @@ -4695,8 +5386,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u if (cfs_b->idle && !throttled) goto out_deactivate; - __refill_cfs_bandwidth_runtime(cfs_b); - if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; @@ -4707,24 +5396,15 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u cfs_b->nr_throttled += overrun; /* - * This check is repeated as we are holding onto the new bandwidth while - * we unthrottle. This can potentially race with an unthrottled group - * trying to acquire new bandwidth from the global pool. This can result - * in us over-using our runtime if it is all used during this loop, but - * only by limited amounts in that extreme case. + * This check is repeated as we release cfs_b->lock while we unthrottle. */ - while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - runtime = cfs_b->runtime; - cfs_b->distribute_running = 1; + while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime); + distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - - lsub_positive(&cfs_b->runtime, runtime); } /* @@ -4758,7 +5438,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; - u64 remaining; + s64 remaining; /* if the call-back is running a quota refresh is already occurring */ if (hrtimer_callback_running(refresh_timer)) @@ -4766,7 +5446,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) /* is a quota refresh about to occur? */ remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); - if (remaining < min_expire) + if (remaining < (s64)min_expire) return 1; return 0; @@ -4837,10 +5517,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->slack_started = false; - if (cfs_b->distribute_running) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - return; - } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); @@ -4850,26 +5526,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - if (runtime) - cfs_b->distribute_running = 1; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime); - - raw_spin_lock_irqsave(&cfs_b->lock, flags); - lsub_positive(&cfs_b->runtime, runtime); - cfs_b->distribute_running = 0; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + distribute_cfs_runtime(cfs_b); } /* * When a group wakes up we want to make sure that its quota is not already * expired/exceeded, otherwise it may be allowed to steal additional ticks of - * runtime as update_curr() throttling can not not trigger until it's on-rq. + * runtime as update_curr() throttling can not trigger until it's on-rq. */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { @@ -4904,7 +5572,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -4923,8 +5591,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_rq_throttled(cfs_rq)) return true; - throttle_cfs_rq(cfs_rq); - return true; + return throttle_cfs_rq(cfs_rq); } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -4954,6 +5621,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); + if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -4966,6 +5635,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (new < max_cfs_quota_period) { cfs_b->period = ns_to_ktime(new); cfs_b->quota *= 2; + cfs_b->burst *= 2; pr_warn_ratelimited( "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", @@ -4983,8 +5653,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) /* reset count so we don't come right back in here */ count = 0; } - - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; @@ -4999,13 +5667,13 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); + cfs_b->burst = 0; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; - cfs_b->distribute_running = 0; cfs_b->slack_started = false; } @@ -5044,12 +5712,12 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * bits doesn't do much. */ -/* cpu online calback */ +/* cpu online callback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { struct task_group *tg; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -5068,7 +5736,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct task_group *tg; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -5157,7 +5825,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) s64 delta = slice - ran; if (delta < 0) { - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); return; } @@ -5174,7 +5842,7 @@ static void hrtick_update(struct rq *rq) { struct task_struct *curr = rq->curr; - if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) @@ -5192,11 +5860,9 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP -static inline unsigned long cpu_util(int cpu); - static inline bool cpu_overutilized(int cpu) { - return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); + return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)); } static inline void update_overutilized_status(struct rq *rq) @@ -5217,6 +5883,17 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } +/* + * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use + * of idle_nr_running, which does not consider idle descendants of normal + * entities. + */ +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq) +{ + return cfs_rq->nr_running && + cfs_rq->nr_running == cfs_rq->idle_nr_running; +} + #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { @@ -5235,6 +5912,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); /* * The code below (indirectly) updates schedutil which looks at @@ -5258,68 +5936,58 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, flags); - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running increment below. - */ - if (cfs_rq_throttled(cfs_rq)) - break; cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto enqueue_throttle; + flags = ENQUEUE_WAKEUP; } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; - - if (cfs_rq_throttled(cfs_rq)) - break; update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); update_cfs_group(se); - } - if (!se) { - add_nr_running(rq, 1); - /* - * Since new tasks are assigned an initial util_avg equal to - * half of the spare capacity of their CPU, tiny tasks have the - * ability to cross the overutilized threshold, which will - * result in the load balancer ruining all the task placement - * done by EAS. As a way to mitigate that effect, do not account - * for the first enqueue operation of new tasks during the - * overutilized flag detection. - * - * A better way of solving this problem would be to wait for - * the PELT signals of tasks to converge before taking them - * into account, but that is not straightforward to implement, - * and the following generally works well enough in practice. - */ - if (flags & ENQUEUE_WAKEUP) - update_overutilized_status(rq); + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto enqueue_throttle; } - if (cfs_bandwidth_used()) { - /* - * When bandwidth control is enabled; the cfs_rq_throttled() - * breaks in the above iteration can result in incomplete - * leaf list maintenance, resulting in triggering the assertion - * below. - */ - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); - if (list_add_leaf_cfs_rq(cfs_rq)) - break; - } - } + /* + * Since new tasks are assigned an initial util_avg equal to + * half of the spare capacity of their CPU, tiny tasks have the + * ability to cross the overutilized threshold, which will + * result in the load balancer ruining all the task placement + * done by EAS. As a way to mitigate that effect, do not account + * for the first enqueue operation of new tasks during the + * overutilized flag detection. + * + * A better way of solving this problem would be to wait for + * the PELT signals of tasks to converge before taking them + * into account, but that is not straightforward to implement, + * and the following generally works well enough in practice. + */ + if (!task_new) + update_overutilized_status(rq); +enqueue_throttle: assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -5340,21 +6008,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + util_est_dequeue(&rq->cfs, p); + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running decrement below. - */ - if (cfs_rq_throttled(cfs_rq)) - break; cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto dequeue_throttle; + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -5372,32 +6041,40 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + update_cfs_group(se); + cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - break; + goto dequeue_throttle; - update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_group(se); } - if (!se) - sub_nr_running(rq, 1); + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, 1); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; - util_est_dequeue(&rq->cfs, p, task_sleep); +dequeue_throttle: + util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); } #ifdef CONFIG_SMP /* Working cpumask for: load_balance, load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); +static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); #ifdef CONFIG_NO_HZ_COMMON @@ -5405,6 +6082,7 @@ static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; int has_blocked; /* Idle CPUS has blocked load */ + int needs_update; /* Newly idle CPUs need their next_balance collated */ unsigned long next_balance; /* in jiffy units */ unsigned long next_blocked; /* Next update of blocked load in jiffies */ } nohz ____cacheline_aligned; @@ -5447,6 +6125,29 @@ static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) return load; } +static unsigned long cpu_runnable(struct rq *rq) +{ + return cfs_rq_runnable_avg(&rq->cfs); +} + +static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + unsigned int runnable; + + /* Task has no contribution or is new */ + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_runnable(rq); + + cfs_rq = &rq->cfs; + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + + /* Discount task's runnable from CPU's runnable */ + lsub_positive(&runnable, p->se.avg.runnable_avg); + + return runnable; +} + static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; @@ -5490,7 +6191,7 @@ static int wake_wide(struct task_struct *p) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int factor = __this_cpu_read(sd_llc_size); if (master < slave) swap(master, slave); @@ -5532,6 +6233,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + return nr_cpumask_bits; } @@ -5589,18 +6293,17 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); - schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); + schedstat_inc(p->stats.nr_wakeups_affine_attempts); if (target == nr_cpumask_bits) return prev_cpu; schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); + schedstat_inc(p->stats.nr_wakeups_affine); return target; } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag); +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5621,11 +6324,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + struct rq *rq = cpu_rq(i); + + if (!sched_core_cookie_match(rq, p)) + continue; + if (sched_idle_cpu(i)) return i; if (available_idle_cpu(i)) { - struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { /* @@ -5683,7 +6390,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu); if (!group) { sd = sd->child; continue; @@ -5711,6 +6418,15 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; } +static inline int __select_idle_cpu(int cpu, struct task_struct *p) +{ + if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && + sched_cpu_cookie_match(cpu_rq(cpu), p)) + return cpu; + + return -1; +} + #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -5724,7 +6440,7 @@ static inline void set_idle_cores(int cpu, int val) WRITE_ONCE(sds->has_idle_cores, val); } -static inline bool test_idle_cores(int cpu, bool def) +static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; @@ -5732,7 +6448,7 @@ static inline bool test_idle_cores(int cpu, bool def) if (sds) return READ_ONCE(sds->has_idle_cores); - return def; + return false; } /* @@ -5748,7 +6464,7 @@ void __update_idle_core(struct rq *rq) int cpu; rcu_read_lock(); - if (test_idle_cores(core, true)) + if (test_idle_cores(core)) goto unlock; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -5769,37 +6485,31 @@ unlock: * there are no idle cores left in the system; tracked through * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. */ -static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu; - - if (!static_branch_likely(&sched_smt_present)) - return -1; - - if (!test_idle_cores(target, false)) - return -1; - - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - - for_each_cpu_wrap(core, cpus, target) { - bool idle = true; + bool idle = true; + int cpu; - for_each_cpu(cpu, cpu_smt_mask(core)) { - __cpumask_clear_cpu(cpu, cpus); - if (!available_idle_cpu(cpu)) - idle = false; + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (!available_idle_cpu(cpu)) { + idle = false; + if (*idle_cpu == -1) { + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { + *idle_cpu = cpu; + break; + } + continue; + } + break; } - - if (idle) - return core; + if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr)) + *idle_cpu = cpu; } - /* - * Failed to find an idle core; stop looking for one. - */ - set_idle_cores(target, 0); + if (idle) + return core; + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); return -1; } @@ -5810,11 +6520,8 @@ static int select_idle_smt(struct task_struct *p, int target) { int cpu; - if (!static_branch_likely(&sched_smt_present)) - return -1; - - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { + if (cpu == target) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -5825,9 +6532,18 @@ static int select_idle_smt(struct task_struct *p, int target) #else /* CONFIG_SCHED_SMT */ -static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static inline void set_idle_cores(int cpu, int val) { - return -1; +} + +static inline bool test_idle_cores(int cpu) +{ + return false; +} + +static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) +{ + return __select_idle_cpu(core, p); } static inline int select_idle_smt(struct task_struct *p, int target) @@ -5842,55 +6558,134 @@ static inline int select_idle_smt(struct task_struct *p, int target) * comparing the average scan cost (tracked in sd->avg_scan_cost) against the * average idle time for this rq (as found in rq->avg_idle). */ -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - struct sched_domain *this_sd; - u64 avg_cost, avg_idle; - u64 time, cost; - s64 delta; + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); + int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct sched_domain_shared *sd_share; + struct rq *this_rq = this_rq(); int this = smp_processor_id(); - int cpu, nr = INT_MAX; + struct sched_domain *this_sd = NULL; + u64 time = 0; - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - /* - * Due to large variance we need a large fuzz factor; hackbench in - * particularly is sensitive here. - */ - avg_idle = this_rq()->avg_idle / 512; - avg_cost = this_sd->avg_scan_cost + 1; + if (sched_feat(SIS_PROP) && !has_idle_core) { + u64 avg_cost, avg_idle, span_avg; + unsigned long now = jiffies; - if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) - return -1; + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + + /* + * If we're busy, the assumption that the last idle period + * predicts the future is flawed; age away the remaining + * predicted idle time. + */ + if (unlikely(this_rq->wake_stamp < now)) { + while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) { + this_rq->wake_stamp++; + this_rq->wake_avg_idle >>= 1; + } + } + + avg_idle = this_rq->wake_avg_idle; + avg_cost = this_sd->avg_scan_cost + 1; - if (sched_feat(SIS_PROP)) { - u64 span_avg = sd->span_weight * avg_idle; + span_avg = sd->span_weight * avg_idle; if (span_avg > 4*avg_cost) nr = div_u64(span_avg, avg_cost); else nr = 4; + + time = cpu_clock(this); + } + + if (sched_feat(SIS_UTIL)) { + sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + if (sd_share) { + /* because !--nr is the condition to stop scan */ + nr = READ_ONCE(sd_share->nr_idle_scan) + 1; + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; + } } - time = cpu_clock(this); + for_each_cpu_wrap(cpu, cpus, target + 1) { + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (!--nr) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + break; + } + } + + if (has_idle_core) + set_idle_cores(target, false); + + if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { + time = cpu_clock(this) - time; + + /* + * Account for the scan cost of wakeups against the average + * idle time. + */ + this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time); + + update_avg(&this_sd->avg_scan_cost, time); + } + + return idle_cpu; +} + +/* + * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which + * the task fits. If no CPU is big enough, but there are idle ones, try to + * maximize capacity. + */ +static int +select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) +{ + unsigned long task_util, best_cap = 0; + int cpu, best_cpu = -1; + struct cpumask *cpus; + + cpus = this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + task_util = uclamp_task_util(p); + for_each_cpu_wrap(cpu, cpus, target) { - if (!--nr) - return -1; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - break; + unsigned long cpu_cap = capacity_of(cpu); + + if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + continue; + if (fits_capacity(task_util, cpu_cap)) + return cpu; + + if (cpu_cap > best_cap) { + best_cap = cpu_cap; + best_cpu = cpu; + } } - time = cpu_clock(this) - time; - cost = this_sd->avg_scan_cost; - delta = (s64)(time - cost) / 8; - this_sd->avg_scan_cost += delta; + return best_cpu; +} + +static inline bool asym_fits_capacity(unsigned long task_util, int cpu) +{ + if (sched_asym_cpucap_active()) + return fits_capacity(task_util, capacity_of(cpu)); - return cpu; + return true; } /* @@ -5898,17 +6693,35 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { + bool has_idle_core = false; struct sched_domain *sd; + unsigned long task_util; int i, recent_used_cpu; - if (available_idle_cpu(target) || sched_idle_cpu(target)) + /* + * On asymmetric system, update task utilization because we will check + * that the task fits with cpu's capacity. + */ + if (sched_asym_cpucap_active()) { + sync_entity_load_avg(&p->se); + task_util = uclamp_task_util(p); + } + + /* + * per-cpu select_rq_mask usage + */ + lockdep_assert_irqs_disabled(); + + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_capacity(task_util, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev))) + (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + asym_fits_capacity(task_util, prev)) return prev; /* @@ -5920,95 +6733,126 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * pattern is IO completions. */ if (is_per_cpu_kthread(current) && + in_task() && prev == smp_processor_id() && - this_rq()->nr_running <= 1) { + this_rq()->nr_running <= 1 && + asym_fits_capacity(task_util, prev)) { return prev; } /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && + asym_fits_capacity(task_util, recent_used_cpu)) { + return recent_used_cpu; + } + + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (sched_asym_cpucap_active()) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); /* - * Replace recent_used_cpu with prev as it is a potential - * candidate for the next wake: + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. */ - p->recent_used_cpu = prev; - return recent_used_cpu; + if (sd) { + i = select_idle_capacity(p, sd, target); + return ((unsigned)i < nr_cpumask_bits) ? i : target; + } } sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; - i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; + if (sched_smt_active()) { + has_idle_core = test_idle_cores(target); - i = select_idle_cpu(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; + if (!has_idle_core && cpus_share_cache(prev, target)) { + i = select_idle_smt(p, prev); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } + } - i = select_idle_smt(p, target); + i = select_idle_cpu(p, sd, has_idle_core, target); if ((unsigned)i < nr_cpumask_bits) return i; return target; } -/** - * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks - * @cpu: the CPU to get the utilization of - * - * The unit of the return value must be the one of capacity so we can compare - * the utilization with the capacity of the CPU that is available for CFS task - * (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * The estimated utilization of a CPU is defined to be the maximum between its - * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks - * currently RUNNABLE on that CPU. - * This allows to properly represent the expected utilization of a CPU which - * has just got a big task running since a long sleep period. At the same time - * however it preserves the benefits of the "blocked utilization" in - * describing the potential for other tasks waking up on the same CPU. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - * - * Return: the (estimated) utilization for the specified CPU - */ -static inline unsigned long cpu_util(int cpu) +/* + * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu + * (@dst_cpu = -1) or migrated to @dst_cpu. + */ +static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) { - struct cfs_rq *cfs_rq; - unsigned int util; + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); + /* + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its + * contribution. If @p migrates from another CPU to @cpu add its + * contribution. In all the other cases @cpu is not impacted by the + * migration so its util_avg is already correct. + */ + if (task_cpu(p) == cpu && dst_cpu != cpu) + lsub_positive(&util, task_util(p)); + else if (task_cpu(p) != cpu && dst_cpu == cpu) + util += task_util(p); + + if (sched_feat(UTIL_EST)) { + unsigned long util_est; - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); - return min_t(unsigned long, util, capacity_orig_of(cpu)); + /* + * During wake-up @p isn't enqueued yet and doesn't contribute + * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p + * has been enqueued. + * + * During exec (@dst_cpu = -1) @p is enqueued and does + * contribute to cpu_rq(cpu)->cfs.util_est.enqueued. + * Remove it to "simulate" cpu_util without @p's contribution. + * + * Despite the task_on_rq_queued(@p) check there is still a + * small window for a possible race when an exec + * select_task_rq_fair() races with LB's detach_task(). + * + * detach_task() + * deactivate_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * -------------------------------- A + * dequeue_task() \ + * dequeue_task_fair() + Race Time + * util_est_dequeue() / + * -------------------------------- B + * + * The additional check "current == p" is required to further + * reduce the race window. + */ + if (dst_cpu == cpu) + util_est += _task_util_est(p); + else if (unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&util_est, _task_util_est(p)); + + util = max(util, util_est); + } + + return min(util, capacity_orig_of(cpu)); } /* @@ -6026,181 +6870,104 @@ static inline unsigned long cpu_util(int cpu) */ static unsigned long cpu_util_without(int cpu, struct task_struct *p) { - struct cfs_rq *cfs_rq; - unsigned int util; - /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util(cpu); - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - /* Discount task's util from CPU's util */ - lsub_positive(&util, task_util(p)); + return cpu_util_cfs(cpu); - /* - * Covered cases: - * - * a) if *p is the only task sleeping on this CPU, then: - * cpu_util (== task_util) > util_est (== 0) - * and thus we return: - * cpu_util_without = (cpu_util - task_util) = 0 - * - * b) if other tasks are SLEEPING on this CPU, which is now exiting - * IDLE, then: - * cpu_util >= task_util - * cpu_util > util_est (== 0) - * and thus we discount *p's blocked utilization to return: - * cpu_util_without = (cpu_util - task_util) >= 0 - * - * c) if other tasks are RUNNABLE on that CPU and - * util_est > cpu_util - * then we use util_est since it returns a more restrictive - * estimation of the spare capacity on that CPU, by just - * considering the expected utilization of tasks already - * runnable on that CPU. - * - * Cases a) and b) are covered by the above code, while case c) is - * covered by the following code when estimated utilization is - * enabled. - */ - if (sched_feat(UTIL_EST)) { - unsigned int estimated = - READ_ONCE(cfs_rq->avg.util_est.enqueued); - - /* - * Despite the following checks we still have a small window - * for a possible race, when an execl's select_task_rq_fair() - * races with LB's detach_task(): - * - * detach_task() - * p->on_rq = TASK_ON_RQ_MIGRATING; - * ---------------------------------- A - * deactivate_task() \ - * dequeue_task() + RaceTime - * util_est_dequeue() / - * ---------------------------------- B - * - * The additional check on "current == p" it's required to - * properly fix the execl regression and it helps in further - * reducing the chances for the above race. - */ - if (unlikely(task_on_rq_queued(p) || current == p)) - lsub_positive(&estimated, _task_util_est(p)); - - util = max(util, estimated); - } - - /* - * Utilization (estimated) can exceed the CPU capacity, thus let's - * clamp to the maximum CPU capacity to ensure consistency with - * the cpu_util call. - */ - return min_t(unsigned long, util, capacity_orig_of(cpu)); + return cpu_util_next(cpu, p, -1); } /* - * Disable WAKE_AFFINE in the case where task @p doesn't fit in the - * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. - * - * In that case WAKE_AFFINE doesn't make sense and we'll let - * BALANCE_WAKE sort things out. + * energy_env - Utilization landscape for energy estimation. + * @task_busy_time: Utilization contribution by the task for which we test the + * placement. Given by eenv_task_busy_time(). + * @pd_busy_time: Utilization of the whole perf domain without the task + * contribution. Given by eenv_pd_busy_time(). + * @cpu_cap: Maximum CPU capacity for the perf domain. + * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap). */ -static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) -{ - long min_cap, max_cap; - - if (!static_branch_unlikely(&sched_asym_cpucapacity)) - return 0; - - min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); - max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; +struct energy_env { + unsigned long task_busy_time; + unsigned long pd_busy_time; + unsigned long cpu_cap; + unsigned long pd_cap; +}; - /* Minimum capacity is close to max, no need to abort wake_affine */ - if (max_cap - min_cap < max_cap >> 3) - return 0; +/* + * Compute the task busy time for compute_energy(). This time cannot be + * injected directly into effective_cpu_util() because of the IRQ scaling. + * The latter only makes sense with the most recent CPUs where the task has + * run. + */ +static inline void eenv_task_busy_time(struct energy_env *eenv, + struct task_struct *p, int prev_cpu) +{ + unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu); + unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu)); - /* Bring task utilization in sync with prev_cpu */ - sync_entity_load_avg(&p->se); + if (unlikely(irq >= max_cap)) + busy_time = max_cap; + else + busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap); - return !task_fits_capacity(p, min_cap); + eenv->task_busy_time = busy_time; } /* - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) - * to @dst_cpu. + * Compute the perf_domain (PD) busy time for compute_energy(). Based on the + * utilization for each @pd_cpus, it however doesn't take into account + * clamping since the ratio (utilization / cpu_capacity) is already enough to + * scale the EM reported power consumption at the (eventually clamped) + * cpu_capacity. + * + * The contribution of the task @p for which we want to estimate the + * energy cost is removed (by cpu_util_next()) and must be calculated + * separately (see eenv_task_busy_time). This ensures: + * + * - A stable PD utilization, no matter which CPU of that PD we want to place + * the task on. + * + * - A fair comparison between CPUs as the task contribution (task_util()) + * will always be the same no matter which CPU utilization we rely on + * (util_avg or util_est). + * + * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't + * exceed @eenv->pd_cap. */ -static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +static inline void eenv_pd_busy_time(struct energy_env *eenv, + struct cpumask *pd_cpus, + struct task_struct *p) { - struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; - unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); - - /* - * If @p migrates from @cpu to another, remove its contribution. Or, - * if @p migrates from another CPU to @cpu, add its contribution. In - * the other cases, @cpu is not impacted by the migration, so the - * util_avg should already be correct. - */ - if (task_cpu(p) == cpu && dst_cpu != cpu) - sub_positive(&util, task_util(p)); - else if (task_cpu(p) != cpu && dst_cpu == cpu) - util += task_util(p); - - if (sched_feat(UTIL_EST)) { - util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + unsigned long busy_time = 0; + int cpu; - /* - * During wake-up, the task isn't enqueued yet and doesn't - * appear in the cfs_rq->avg.util_est.enqueued of any rq, - * so just add it (if needed) to "simulate" what will be - * cpu_util() after the task has been enqueued. - */ - if (dst_cpu == cpu) - util_est += _task_util_est(p); + for_each_cpu(cpu, pd_cpus) { + unsigned long util = cpu_util_next(cpu, p, -1); - util = max(util, util_est); + busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); } - return min(util, capacity_orig_of(cpu)); + eenv->pd_busy_time = min(eenv->pd_cap, busy_time); } /* - * compute_energy(): Estimates the energy that @pd would consume if @p was - * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of @pd's CPUs after the task migration, and uses the Energy Model - * to compute what would be the energy if we decided to actually migrate that - * task. + * Compute the maximum utilization for compute_energy() when the task @p + * is placed on the cpu @dst_cpu. + * + * Returns the maximum utilization among @eenv->cpus. This utilization can't + * exceed @eenv->cpu_cap. */ -static long -compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +static inline unsigned long +eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, + struct task_struct *p, int dst_cpu) { - struct cpumask *pd_mask = perf_domain_span(pd); - unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); - unsigned long max_util = 0, sum_util = 0; + unsigned long max_util = 0; int cpu; - /* - * The capacity state of CPUs of the current rd can be driven by CPUs - * of another rd if they belong to the same pd. So, account for the - * utilization of these CPUs too by masking pd with cpu_online_mask - * instead of the rd span. - * - * If an entire pd is outside of the current rd, it will not appear in - * its pd list and will not be accounted by compute_energy(). - */ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); - struct task_struct *tsk = cpu == dst_cpu ? p : NULL; - - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, - ENERGY_UTIL, NULL); + for_each_cpu(cpu, pd_cpus) { + struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; + unsigned long util = cpu_util_next(cpu, p, dst_cpu); + unsigned long cpu_util; /* * Performance domain frequency: utilization clamping @@ -6209,12 +6976,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, - FREQUENCY_UTIL, tsk); + cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); max_util = max(max_util, cpu_util); } - return em_pd_energy(pd->em_pd, max_util, sum_util); + return min(max_util, eenv->cpu_cap); +} + +/* + * compute_energy(): Use the Energy Model to estimate the energy that @pd would + * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task + * contribution is ignored. + */ +static inline unsigned long +compute_energy(struct energy_env *eenv, struct perf_domain *pd, + struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu) +{ + unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); + unsigned long busy_time = eenv->pd_busy_time; + + if (dst_cpu >= 0) + busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); + + return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); } /* @@ -6258,17 +7042,18 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) */ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; - struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - unsigned long cpu_cap, util, base_energy = 0; - int cpu, best_energy_cpu = prev_cpu; + struct root_domain *rd = this_rq()->rd; + int cpu, best_energy_cpu, target = -1; struct sched_domain *sd; struct perf_domain *pd; + struct energy_env eenv; rcu_read_lock(); pd = rcu_dereference(rd->pd); if (!pd || READ_ONCE(rd->overutilized)) - goto fail; + goto unlock; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -6278,110 +7063,145 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) - goto fail; + goto unlock; + + target = prev_cpu; sync_entity_load_avg(&p->se); if (!task_util_est(p)) goto unlock; + eenv_task_busy_time(&eenv, p, prev_cpu); + for (; pd; pd = pd->next) { - unsigned long cur_delta, spare_cap, max_spare_cap = 0; - unsigned long base_energy_pd; + unsigned long cpu_cap, cpu_thermal_cap, util; + unsigned long cur_delta, max_spare_cap = 0; + bool compute_prev_delta = false; int max_spare_cap_cpu = -1; + unsigned long base_energy; - /* Compute the 'base' energy of the pd, without @p */ - base_energy_pd = compute_energy(p, -1, pd); - base_energy += base_energy_pd; + cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); + + if (cpumask_empty(cpus)) + continue; + + /* Account thermal pressure for the energy estimation */ + cpu = cpumask_first(cpus); + cpu_thermal_cap = arch_scale_cpu_capacity(cpu); + cpu_thermal_cap -= arch_scale_thermal_pressure(cpu); + + eenv.cpu_cap = cpu_thermal_cap; + eenv.pd_cap = 0; + + for_each_cpu(cpu, cpus) { + eenv.pd_cap += cpu_thermal_cap; + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) + continue; - for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); - spare_cap = cpu_cap - util; /* * Skip CPUs that cannot satisfy the capacity request. * IOW, placing the task there would make the CPU * overutilized. Take uclamp into account to see how * much capacity we can get out of the CPU; this is - * aligned with schedutil_cpu_util(). + * aligned with sched_cpu_util(). */ util = uclamp_rq_util_with(cpu_rq(cpu), util, p); if (!fits_capacity(util, cpu_cap)) continue; - /* Always use prev_cpu as a candidate. */ - if (cpu == prev_cpu) { - prev_delta = compute_energy(p, prev_cpu, pd); - prev_delta -= base_energy_pd; - best_delta = min(best_delta, prev_delta); - } + lsub_positive(&cpu_cap, util); - /* - * Find the CPU with the maximum spare capacity in - * the performance domain - */ - if (spare_cap > max_spare_cap) { - max_spare_cap = spare_cap; + if (cpu == prev_cpu) { + /* Always use prev_cpu as a candidate. */ + compute_prev_delta = true; + } else if (cpu_cap > max_spare_cap) { + /* + * Find the CPU with the maximum spare capacity + * in the performance domain. + */ + max_spare_cap = cpu_cap; max_spare_cap_cpu = cpu; } } - /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { - cur_delta = compute_energy(p, max_spare_cap_cpu, pd); - cur_delta -= base_energy_pd; + if (max_spare_cap_cpu < 0 && !compute_prev_delta) + continue; + + eenv_pd_busy_time(&eenv, cpus, p); + /* Compute the 'base' energy of the pd, without @p */ + base_energy = compute_energy(&eenv, pd, cpus, p, -1); + + /* Evaluate the energy impact of using prev_cpu. */ + if (compute_prev_delta) { + prev_delta = compute_energy(&eenv, pd, cpus, p, + prev_cpu); + /* CPU utilization has changed */ + if (prev_delta < base_energy) + goto unlock; + prev_delta -= base_energy; + best_delta = min(best_delta, prev_delta); + } + + /* Evaluate the energy impact of using max_spare_cap_cpu. */ + if (max_spare_cap_cpu >= 0) { + cur_delta = compute_energy(&eenv, pd, cpus, p, + max_spare_cap_cpu); + /* CPU utilization has changed */ + if (cur_delta < base_energy) + goto unlock; + cur_delta -= base_energy; if (cur_delta < best_delta) { best_delta = cur_delta; best_energy_cpu = max_spare_cap_cpu; } } } -unlock: rcu_read_unlock(); - /* - * Pick the best CPU if prev_cpu cannot be used, or if it saves at - * least 6% of the energy used by prev_cpu. - */ - if (prev_delta == ULONG_MAX) - return best_energy_cpu; - - if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) - return best_energy_cpu; + if (best_delta < prev_delta) + target = best_energy_cpu; - return prev_cpu; + return target; -fail: +unlock: rcu_read_unlock(); - return -1; + return target; } /* * select_task_rq_fair: Select target runqueue for the waking task in domains - * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * * Balances load by selecting the idlest CPU in the idlest group, or under * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. * * Returns the target CPU number. - * - * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) { + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; - int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + /* SD_flags and WF_flags share the first nibble */ + int sd_flag = wake_flags & 0xF; - if (sd_flag & SD_BALANCE_WAKE) { + /* + * required for stable ->cpus_allowed + */ + lockdep_assert_held(&p->pi_lock); + if (wake_flags & WF_TTWU) { record_wakee(p); if (sched_energy_enabled()) { @@ -6391,15 +7211,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = prev_cpu; } - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && - cpumask_test_cpu(cpu, p->cpus_ptr); + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); } rcu_read_lock(); for_each_domain(cpu, tmp) { - if (!(tmp->flags & SD_LOAD_BALANCE)) - break; - /* * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. @@ -6413,6 +7229,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f break; } + /* + * Usually only true for WF_EXEC and WF_FORK, as sched_domains + * usually do not have SD_BALANCE_WAKE set. That means wakeup + * will usually go to the fast path. + */ if (tmp->flags & sd_flag) sd = tmp; else if (!want_affine) @@ -6422,21 +7243,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (unlikely(sd)) { /* Slow path */ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); - } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ + } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - - if (want_affine) - current->recent_used_cpu = cpu; } rcu_read_unlock(); return new_cpu; } -static void detach_entity_cfs_rq(struct sched_entity *se); - /* * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -6444,57 +7259,41 @@ static void detach_entity_cfs_rq(struct sched_entity *se); */ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { + struct sched_entity *se = &p->se; + /* * As blocked tasks retain absolute vruntime the migration needs to * deal with this by subtracting the old and adding the new * min_vruntime -- the latter is done by enqueue_entity() when placing * the task on the new runqueue. */ - if (p->state == TASK_WAKING) { - struct sched_entity *se = &p->se; + if (READ_ONCE(p->__state) == TASK_WAKING) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy = cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime = cfs_rq->min_vruntime; - } while (min_vruntime != min_vruntime_copy); -#else - min_vruntime = cfs_rq->min_vruntime; -#endif - se->vruntime -= min_vruntime; + se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); } - if (p->on_rq == TASK_ON_RQ_MIGRATING) { - /* - * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' - * rq->lock and can modify state directly. - */ - lockdep_assert_held(&task_rq(p)->lock); - detach_entity_cfs_rq(&p->se); + if (!task_on_rq_migrating(p)) { + remove_entity_load_avg(se); - } else { /* - * We are supposed to update the task to "current" time, then - * its up to date and ready to go to new CPU/cfs_rq. But we - * have difficulty in getting what current time is, so simply - * throw away the out-of-date time. This will result in the - * wakee task is less decayed, but giving the wakee more load - * sounds not bad. + * Here, the task's PELT values have been updated according to + * the current rq's clock. But if that clock hasn't been + * updated in a while, a substantial idle time will be missed, + * leading to an inflation after wake-up on the new rq. + * + * Estimate the missing time from the cfs_rq last_update_time + * and update sched_avg to improve the PELT continuity after + * migration. */ - remove_entity_load_avg(&p->se); + migrate_se_pelt_lag(se); } /* Tell new CPU we are migrated */ - p->se.avg.last_update_time = 0; + se->avg.last_update_time = 0; /* We have migrated, no longer consider this task hot */ - p->se.exec_start = 0; + se->exec_start = 0; update_scan_period(p, new_cpu); } @@ -6565,24 +7364,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->last = se; } } static void set_next_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->next = se; } } @@ -6603,13 +7400,14 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; + int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) return; /* * This is possible from callers such as attach_tasks(), in which we - * unconditionally check_prempt_curr() after an enqueue (which may have + * unconditionally check_preempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ @@ -6647,8 +7445,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; find_matching_se(&se, &pse); + WARN_ON_ONCE(!pse); + + cse_is_idle = se_is_idle(se); + pse_is_idle = se_is_idle(pse); + + /* + * Preempt an idle group in favor of a non-idle group (and don't preempt + * in the inverse case). + */ + if (cse_is_idle && !pse_is_idle) + goto preempt; + if (cse_is_idle != pse_is_idle) + return; + update_curr(cfs_rq_of(se)); - BUG_ON(!pse); if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is @@ -6679,6 +7490,39 @@ preempt: set_last_buddy(se); } +#ifdef CONFIG_SMP +static struct task_struct *pick_task_fair(struct rq *rq) +{ + struct sched_entity *se; + struct cfs_rq *cfs_rq; + +again: + cfs_rq = &rq->cfs; + if (!cfs_rq->nr_running) + return NULL; + + do { + struct sched_entity *curr = cfs_rq->curr; + + /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ + if (curr) { + if (curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto again; + } + + se = pick_next_entity(cfs_rq, curr); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + return task_of(se); +} +#endif + struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -6790,7 +7634,7 @@ done: __maybe_unused; list_move(&p->se.group_node, &rq->cfs_tasks); #endif - if (hrtick_enabled(rq)) + if (hrtick_enabled_fair(rq)) hrtick_start_fair(rq, p); update_misfit_status(p, rq); @@ -6878,7 +7722,7 @@ static void yield_task_fair(struct rq *rq) set_skip_buddy(se); } -static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -7033,8 +7877,8 @@ enum group_type { */ group_fully_busy, /* - * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity - * and must be migrated to a more powerful CPU. + * One task doesn't fit with CPU's capacity and must be migrated to a + * more powerful CPU. */ group_misfit_task, /* @@ -7066,8 +7910,7 @@ enum migration_type { #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 -#define LBF_NOHZ_STATS 0x10 -#define LBF_NOHZ_AGAIN 0x20 +#define LBF_ACTIVE_LB 0x10 struct lb_env { struct sched_domain *sd; @@ -7103,7 +7946,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); if (p->sched_class != &fair_sched_class) return 0; @@ -7111,6 +7954,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (unlikely(task_has_idle_policy(p))) return 0; + /* SMT siblings share cache */ + if (env->sd->flags & SD_SHARE_CPUCAPACITY) + return 0; + /* * Buddy candidates are cache hot: */ @@ -7121,6 +7968,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (sysctl_sched_migration_cost == -1) return 1; + + /* + * Don't migrate task if the task's cookie does not match + * with the destination CPU's core cookie. + */ + if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p)) + return 1; + if (sysctl_sched_migration_cost == 0) return 0; @@ -7197,7 +8052,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); /* * We do not migrate tasks that are: @@ -7209,10 +8064,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; + /* Disregard pcpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; - schedstat_inc(p->se.statistics.nr_failed_migrations_affine); + schedstat_inc(p->stats.nr_failed_migrations_affine); env->flags |= LBF_SOME_PINNED; @@ -7221,10 +8080,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * - * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have - * already computed one in current iteration. + * Avoid computing new_dst_cpu + * - for NEWLY_IDLE + * - if we have already computed one in current iteration + * - if it's an active balance */ - if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) + if (env->idle == CPU_NEWLY_IDLE || + env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB)) return 0; /* Prevent to re-select dst_cpu via env's CPUs: */ @@ -7239,20 +8101,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; } - /* Record that we found atleast one task that could run on dst_cpu */ + /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_running(env->src_rq, p)) { - schedstat_inc(p->se.statistics.nr_failed_migrations_running); + if (task_on_cpu(env->src_rq, p)) { + schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } /* * Aggressive migration if: - * 1) destination numa is preferred - * 2) task is cache cold, or - * 3) too many balance attempts have failed. + * 1) active balance + * 2) destination numa is preferred + * 3) task is cache cold, or + * 4) too many balance attempts have failed. */ + if (env->flags & LBF_ACTIVE_LB) + return 1; + tsk_cache_hot = migrate_degrades_locality(p, env); if (tsk_cache_hot == -1) tsk_cache_hot = task_hot(p, env); @@ -7261,12 +8127,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->se.statistics.nr_forced_migrations); + schedstat_inc(p->stats.nr_forced_migrations); } return 1; } - schedstat_inc(p->se.statistics.nr_failed_migrations_hot); + schedstat_inc(p->stats.nr_failed_migrations_hot); return 0; } @@ -7275,7 +8141,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) */ static void detach_task(struct task_struct *p, struct lb_env *env) { - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); @@ -7291,7 +8157,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); list_for_each_entry_reverse(p, &env->src_rq->cfs_tasks, se.group_node) { @@ -7312,8 +8178,6 @@ static struct task_struct *detach_one_task(struct lb_env *env) return NULL; } -static const unsigned int sched_nr_migrate_break = 32; - /* * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". @@ -7327,7 +8191,16 @@ static int detach_tasks(struct lb_env *env) struct task_struct *p; int detached = 0; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); + + /* + * Source run queue has been emptied by another CPU, clear + * LBF_ALL_PINNED flag as we will not test any task. + */ + if (env->src_rq->nr_running <= 1) { + env->flags &= ~LBF_ALL_PINNED; + return 0; + } if (env->imbalance <= 0) return 0; @@ -7340,26 +8213,37 @@ static int detach_tasks(struct lb_env *env) if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) break; - p = list_last_entry(tasks, struct task_struct, se.group_node); - env->loop++; - /* We've more or less seen every task there is, call it quits */ - if (env->loop > env->loop_max) + /* + * We've more or less seen every task there is, call it quits + * unless we haven't found any movable task yet. + */ + if (env->loop > env->loop_max && + !(env->flags & LBF_ALL_PINNED)) break; /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sched_nr_migrate_break; + env->loop_break += SCHED_NR_MIGRATE_BREAK; env->flags |= LBF_NEED_BREAK; break; } + p = list_last_entry(tasks, struct task_struct, se.group_node); + if (!can_migrate_task(p, env)) goto next; switch (env->migration_type) { case migrate_load: - load = task_h_load(p); + /* + * Depending of the number of CPUs and tasks and the + * cgroup hierarchy, task_h_load() can return a null + * value. Make sure that env->imbalance decreases + * otherwise detach_tasks() will stop only after + * detaching up to loop_max tasks. + */ + load = max_t(unsigned long, task_h_load(p), 1); if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) @@ -7371,8 +8255,7 @@ static int detach_tasks(struct lb_env *env) * scheduler fails to find a good waiting task to * migrate. */ - if (load/2 > env->imbalance && - env->sd->nr_balance_failed <= env->sd->cache_nice_tries) + if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= load; @@ -7442,9 +8325,9 @@ next: */ static void attach_task(struct rq *rq, struct task_struct *p) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); - BUG_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); check_preempt_curr(rq, p, 0); } @@ -7506,6 +8389,9 @@ static inline bool others_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; + if (thermal_load_avg(rq)) + return true; + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ if (READ_ONCE(rq->avg_irq.util_avg)) return true; @@ -7514,16 +8400,20 @@ static inline bool others_have_blocked(struct rq *rq) return false; } -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +static inline void update_blocked_load_tick(struct rq *rq) { - rq->last_blocked_load_update_tick = jiffies; + WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); +} +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +{ if (!has_blocked) rq->has_blocked_load = 0; } #else static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } +static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif @@ -7531,6 +8421,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) { const struct sched_class *curr_class; u64 now = rq_clock_pelt(rq); + unsigned long thermal_pressure; bool decayed; /* @@ -7539,8 +8430,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done) */ curr_class = rq->curr->sched_class; + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | update_irq_load_avg(rq, 0); if (others_have_blocked(rq)) @@ -7551,23 +8445,6 @@ static bool __update_blocked_others(struct rq *rq, bool *done) #ifdef CONFIG_FAIR_GROUP_SCHED -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load.weight) - return false; - - if (cfs_rq->avg.load_sum) - return false; - - if (cfs_rq->avg.util_sum) - return false; - - if (cfs_rq->avg.runnable_load_sum) - return false; - - return true; -} - static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq, *pos; @@ -7582,7 +8459,10 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) struct sched_entity *se; if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); + + if (cfs_rq->nr_running == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); if (cfs_rq == &rq->cfs) decayed = true; @@ -7591,7 +8471,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) - update_load_avg(cfs_rq_of(se), se, 0); + update_load_avg(cfs_rq_of(se), se, UPDATE_TG); /* * There can be a lot of idle CPU cgroups. Don't let fully @@ -7680,6 +8560,7 @@ static void update_blocked_averages(int cpu) struct rq_flags rf; rq_lock_irqsave(rq, &rf); + update_blocked_load_tick(rq); update_rq_clock(rq); decayed |= __update_blocked_others(rq, &done); @@ -7700,7 +8581,8 @@ struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long group_capacity; - unsigned long group_util; /* Total utilization of the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ unsigned int sum_nr_running; /* Nr of tasks running in the group */ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; @@ -7751,7 +8633,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } -static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long max = arch_scale_cpu_capacity(cpu); @@ -7763,8 +8645,15 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) if (unlikely(irq >= max)) return 1; + /* + * avg_rt.util_avg and avg_dl.util_avg track binary signals + * (running and not running) with weights 0 and 1024 respectively. + * avg_thermal.load_avg tracks thermal pressure and the weighted + * average uses the actual delta max capacity(load). + */ used = READ_ONCE(rq->avg_rt.util_avg); used += READ_ONCE(rq->avg_dl.util_avg); + used += thermal_load_avg(rq); if (unlikely(used >= max)) return 1; @@ -7776,7 +8665,7 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); @@ -7785,6 +8674,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity = 1; cpu_rq(cpu)->cpu_capacity = capacity; + trace_sched_cpu_capacity_tp(cpu_rq(cpu)); + sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; sdg->sgc->max_capacity = capacity; @@ -7906,7 +8797,7 @@ static inline int sg_imbalanced(struct sched_group *group) /* * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. - * We consider that a group has spare capacity if the * number of task is + * We consider that a group has spare capacity if the number of task is * smaller than the number of CPUs or if the utilization is lower than the * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into @@ -7921,6 +8812,10 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) if (sgs->sum_nr_running < sgs->group_weight) return true; + if ((sgs->group_capacity * imbalance_pct) < + (sgs->group_runnable * 100)) + return false; + if ((sgs->group_capacity * 100) > (sgs->group_util * imbalance_pct)) return true; @@ -7946,27 +8841,11 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) (sgs->group_util * imbalance_pct)) return true; - return false; -} - -/* - * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller - * per-CPU capacity than sched_group ref. - */ -static inline bool -group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) -{ - return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity); -} + if ((sgs->group_capacity * imbalance_pct) < + (sgs->group_runnable * 100)) + return true; -/* - * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller - * per-CPU capacity_orig than sched_group ref. - */ -static inline bool -group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) -{ - return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity); + return false; } static inline enum @@ -7992,36 +8871,124 @@ group_type group_classify(unsigned int imbalance_pct, return group_has_spare; } -static bool update_nohz_stats(struct rq *rq, bool force) +/** + * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks + * @dst_cpu: Destination CPU of the load balancing + * @sds: Load-balancing data with statistics of the local group + * @sgs: Load-balancing statistics of the candidate busiest group + * @sg: The candidate busiest group + * + * Check the state of the SMT siblings of both @sds::local and @sg and decide + * if @dst_cpu can pull tasks. + * + * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of + * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks + * only if @dst_cpu has higher priority. + * + * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more + * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. + * Bigger imbalances in the number of busy CPUs will be dealt with in + * update_sd_pick_busiest(). + * + * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings + * of @dst_cpu are idle and @sg has lower priority. + * + * Return: true if @dst_cpu can pull tasks, false otherwise. + */ +static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + struct sg_lb_stats *sgs, + struct sched_group *sg) { -#ifdef CONFIG_NO_HZ_COMMON - unsigned int cpu = rq->cpu; +#ifdef CONFIG_SCHED_SMT + bool local_is_smt, sg_is_smt; + int sg_busy_cpus; - if (!rq->has_blocked_load) - return false; + local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; + sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; - if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) - return false; + sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; - if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) - return true; + if (!local_is_smt) { + /* + * If we are here, @dst_cpu is idle and does not have SMT + * siblings. Pull tasks if candidate group has two or more + * busy CPUs. + */ + if (sg_busy_cpus >= 2) /* implies sg_is_smt */ + return true; - update_blocked_averages(cpu); + /* + * @dst_cpu does not have SMT siblings. @sg may have SMT + * siblings and only one is busy. In such case, @dst_cpu + * can help if it has higher priority and is idle (i.e., + * it has no running tasks). + */ + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + } - return rq->has_blocked_load; + /* @dst_cpu has SMT siblings. */ + + if (sg_is_smt) { + int local_busy_cpus = sds->local->group_weight - + sds->local_stat.idle_cpus; + int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; + + if (busy_cpus_delta == 1) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; + } + + /* + * @sg does not have SMT siblings. Ensure that @sds::local does not end + * up with more than one busy SMT sibling and only pull tasks if there + * are not busy CPUs (i.e., no CPU has running tasks). + */ + if (!sds->local_stat.sum_nr_running) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; #else + /* Always return false so that callers deal with non-SMT cases. */ return false; #endif } +static inline bool +sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + /* Only do SMT checks if either local or candidate have SMT siblings */ + if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || + (group->flags & SD_SHARE_CPUCAPACITY)) + return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); + + return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); +} + +static inline bool +sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) +{ + /* + * When there is more than 1 task, the group_overloaded case already + * takes care of cpu with reduced capacity + */ + if (rq->cfs.h_nr_running != 1) + return false; + + return check_cpu_capacity(rq, sd); +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. + * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. * @sg_status: Holds flag indicating the status of the sched_group */ static inline void update_sg_lb_stats(struct lb_env *env, + struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, int *sg_status) @@ -8030,16 +8997,15 @@ static inline void update_sg_lb_stats(struct lb_env *env, memset(sgs, 0, sizeof(*sgs)); - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + local_group = group == sds->local; for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); + unsigned long load = cpu_load(rq); - if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) - env->flags |= LBF_NOHZ_AGAIN; - - sgs->group_load += cpu_load(rq); - sgs->group_util += cpu_util(i); + sgs->group_load += load; + sgs->group_util += cpu_util_cfs(i); + sgs->group_runnable += cpu_runnable(rq); sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; @@ -8067,26 +9033,31 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (local_group) continue; - /* Check for a misfit task on the cpu */ - if (env->sd->flags & SD_ASYM_CPUCAPACITY && - sgs->group_misfit_task_load < rq->misfit_task_load) { - sgs->group_misfit_task_load = rq->misfit_task_load; - *sg_status |= SG_OVERLOAD; + if (env->sd->flags & SD_ASYM_CPUCAPACITY) { + /* Check for a misfit task on the cpu */ + if (sgs->group_misfit_task_load < rq->misfit_task_load) { + sgs->group_misfit_task_load = rq->misfit_task_load; + *sg_status |= SG_OVERLOAD; + } + } else if ((env->idle != CPU_NOT_IDLE) && + sched_reduced_capacity(rq, env->sd)) { + /* Check for a task running on a CPU with reduced capacity */ + if (sgs->group_misfit_task_load < load) + sgs->group_misfit_task_load = load; } } - /* Check if dst CPU is idle and preferred to this group */ - if (env->sd->flags & SD_ASYM_PACKING && - env->idle != CPU_NOT_IDLE && - sgs->sum_h_nr_running && - sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { - sgs->group_asym_packing = 1; - } - sgs->group_capacity = group->sgc->capacity; sgs->group_weight = group->group_weight; + /* Check if dst CPU is idle and preferred to this group */ + if (!local_group && env->sd->flags & SD_ASYM_PACKING && + env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && + sched_asym(env, sds, sgs, group)) { + sgs->group_asym_packing = 1; + } + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); /* Computing avg_load makes sense only when group is overloaded */ @@ -8125,8 +9096,9 @@ static bool update_sd_pick_busiest(struct lb_env *env, * CPUs in the group should either be possible to resolve * internally or be covered by avg_load imbalance (eventually). */ - if (sgs->group_type == group_misfit_task && - (!group_smaller_max_cpu_capacity(sg, sds->local) || + if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && + (sgs->group_type == group_misfit_task) && + (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || sds->local_stat.group_type != group_has_spare)) return false; @@ -8210,7 +9182,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, */ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && (sgs->group_type <= group_fully_busy) && - (group_smaller_min_cpu_capacity(sds->local, sg))) + (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu)))) return false; return true; @@ -8286,7 +9258,7 @@ static int idle_cpu_without(int cpu, struct task_struct *p) */ #ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) + if (rq->ttwu_pending) return 0; #endif @@ -8315,6 +9287,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_load += cpu_load_without(rq, p); sgs->group_util += cpu_util_without(i, p); + sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; @@ -8345,7 +9318,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, * Computing avg_load makes sense only when group is fully busy or * overloaded */ - if (sgs->group_type < group_fully_busy) + if (sgs->group_type == group_fully_busy || + sgs->group_type == group_overloaded) sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / sgs->group_capacity; } @@ -8387,8 +9361,14 @@ static bool update_pick_idlest(struct sched_group *idlest, case group_has_spare: /* Select group with most idle CPUs */ - if (idlest_sgs->idle_cpus >= sgs->idle_cpus) + if (idlest_sgs->idle_cpus > sgs->idle_cpus) return false; + + /* Select group with lowest group_util */ + if (idlest_sgs->idle_cpus == sgs->idle_cpus && + idlest_sgs->group_util <= sgs->group_util) + return false; + break; } @@ -8402,8 +9382,7 @@ static bool update_pick_idlest(struct sched_group *idlest, * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; @@ -8414,9 +9393,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, .group_type = group_overloaded, }; - imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - do { int local_group; @@ -8425,6 +9401,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, p->cpus_ptr)) continue; + /* Skip over this group if no cookie matched */ + if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group)) + continue; + local_group = cpumask_test_cpu(this_cpu, sched_group_span(group)); @@ -8470,6 +9450,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, switch (local_sgs.group_type) { case group_overloaded: case group_fully_busy: + + /* Calculate allowed imbalance based on load */ + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + /* * When comparing groups across NUMA domains, it's possible for * the local domain to be very lightly loaded relative to the @@ -8506,7 +9491,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, break; case group_has_spare: +#ifdef CONFIG_NUMA if (sd->flags & SD_NUMA) { + int imb_numa_nr = sd->imb_numa_nr; #ifdef CONFIG_NUMA_BALANCING int idlest_cpu; /* @@ -8519,16 +9506,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, idlest_cpu = cpumask_first(sched_group_span(idlest)); if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) return idlest; -#endif +#endif /* CONFIG_NUMA_BALANCING */ /* - * Otherwise, keep the task on this node to stay close - * its wakeup source and improve locality. If there is - * a real need of migration, periodic load balance will + * Otherwise, keep the task close to the wakeup source + * and improve locality if the number of running tasks + * would remain below threshold where an imbalance is + * allowed while accounting for the possibility the + * task is pinned to a subset of CPUs. If there is a + * real need of migration, periodic load balance will * take care of it. */ - if (local_sgs.idle_cpus) + if (p->nr_cpus_allowed != NR_CPUS) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); + + cpumask_and(cpus, sched_group_span(local), p->cpus_ptr); + imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr); + } + + imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus); + if (!adjust_numa_imbalance(imbalance, + local_sgs.sum_nr_running + 1, + imb_numa_nr)) { return NULL; + } } +#endif /* CONFIG_NUMA */ /* * Select group with highest number of idle CPUs. We could also @@ -8544,6 +9546,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, return idlest; } +static void update_idle_cpu_scan(struct lb_env *env, + unsigned long sum_util) +{ + struct sched_domain_shared *sd_share; + int llc_weight, pct; + u64 x, y, tmp; + /* + * Update the number of CPUs to scan in LLC domain, which could + * be used as a hint in select_idle_cpu(). The update of sd_share + * could be expensive because it is within a shared cache line. + * So the write of this hint only occurs during periodic load + * balancing, rather than CPU_NEWLY_IDLE, because the latter + * can fire way more frequently than the former. + */ + if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) + return; + + llc_weight = per_cpu(sd_llc_size, env->dst_cpu); + if (env->sd->span_weight != llc_weight) + return; + + sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); + if (!sd_share) + return; + + /* + * The number of CPUs to search drops as sum_util increases, when + * sum_util hits 85% or above, the scan stops. + * The reason to choose 85% as the threshold is because this is the + * imbalance_pct(117) when a LLC sched group is overloaded. + * + * let y = SCHED_CAPACITY_SCALE - p * x^2 [1] + * and y'= y / SCHED_CAPACITY_SCALE + * + * x is the ratio of sum_util compared to the CPU capacity: + * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE) + * y' is the ratio of CPUs to be scanned in the LLC domain, + * and the number of CPUs to scan is calculated by: + * + * nr_scan = llc_weight * y' [2] + * + * When x hits the threshold of overloaded, AKA, when + * x = 100 / pct, y drops to 0. According to [1], + * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000 + * + * Scale x by SCHED_CAPACITY_SCALE: + * x' = sum_util / llc_weight; [3] + * + * and finally [1] becomes: + * y = SCHED_CAPACITY_SCALE - + * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4] + * + */ + /* equation [3] */ + x = sum_util; + do_div(x, llc_weight); + + /* equation [4] */ + pct = env->sd->imbalance_pct; + tmp = x * x * pct * pct; + do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); + tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); + y = SCHED_CAPACITY_SCALE - tmp; + + /* equation [2] */ + y *= llc_weight; + do_div(y, SCHED_CAPACITY_SCALE); + if ((int)y != sd_share->nr_idle_scan) + WRITE_ONCE(sd_share->nr_idle_scan, (int)y); +} + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -8556,13 +9629,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; + unsigned long sum_util = 0; int sg_status = 0; -#ifdef CONFIG_NO_HZ_COMMON - if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) - env->flags |= LBF_NOHZ_STATS; -#endif - do { struct sg_lb_stats *sgs = &tmp_sgs; int local_group; @@ -8577,7 +9646,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, sgs, &sg_status); + update_sg_lb_stats(env, sds, sg, sgs, &sg_status); if (local_group) goto next_group; @@ -8593,20 +9662,13 @@ next_group: sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; + sum_util += sgs->group_util; sg = sg->next; } while (sg != env->sd->groups); /* Tag domain that child domain prefers tasks go to siblings first */ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; -#ifdef CONFIG_NO_HZ_COMMON - if ((env->flags & LBF_NOHZ_AGAIN) && - cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { - - WRITE_ONCE(nohz.next_blocked, - jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD)); - } -#endif if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); @@ -8626,6 +9688,8 @@ next_group: WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); } + + update_idle_cpu_scan(env, sum_util); } /** @@ -8642,9 +9706,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s busiest = &sds->busiest_stat; if (busiest->group_type == group_misfit_task) { - /* Set imbalance to allow misfit tasks to be balanced. */ - env->migration_type = migrate_misfit; - env->imbalance = 1; + if (env->sd->flags & SD_ASYM_CPUCAPACITY) { + /* Set imbalance to allow misfit tasks to be balanced. */ + env->migration_type = migrate_misfit; + env->imbalance = 1; + } else { + /* + * Set load imbalance to allow moving task from cpu + * with reduced capacity. + */ + env->migration_type = migrate_load; + env->imbalance = busiest->group_misfit_task_load; + } return; } @@ -8675,7 +9748,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * emptying busiest. */ if (local->group_type == group_has_spare) { - if (busiest->group_type > group_fully_busy) { + if ((busiest->group_type > group_fully_busy) && + !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { /* * If busiest is overloaded, try to fill spare * capacity. This might end up creating spare capacity @@ -8711,7 +9785,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ env->migration_type = migrate_task; lsub_positive(&nr_diff, local->sum_nr_running); - env->imbalance = nr_diff >> 1; + env->imbalance = nr_diff; } else { /* @@ -8719,29 +9793,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * idle cpus. */ env->migration_type = migrate_task; - env->imbalance = max_t(long, 0, (local->idle_cpus - - busiest->idle_cpus) >> 1); + env->imbalance = max_t(long, 0, + (local->idle_cpus - busiest->idle_cpus)); } +#ifdef CONFIG_NUMA /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { - unsigned int imbalance_min; - - /* - * Compute an allowed imbalance based on a simple - * pair of communicating tasks that should remain - * local and ignore them. - * - * NOTE: Generally this would have been based on - * the domain size and this was evaluated. However, - * the benefit is similar across a range of workloads - * and machines but scaling by the domain size adds - * the risk that lower domains have to be rebalanced. - */ - imbalance_min = 2; - if (busiest->sum_nr_running <= imbalance_min) - env->imbalance = 0; + env->imbalance = adjust_numa_imbalance(env->imbalance, + local->sum_nr_running + 1, + env->sd->imb_numa_nr); } +#endif + + /* Number of tasks to move to restore balance */ + env->imbalance >>= 1; return; } @@ -8759,6 +9825,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / local->group_capacity; + /* + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. + */ + if (local->avg_load >= busiest->avg_load) { + env->imbalance = 0; + return; + } + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / sds->total_capacity; } @@ -8786,7 +9861,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced - * misfit_task force N/A N/A N/A force force + * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load @@ -8803,12 +9878,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. + * @env: The load balancing environment. * * Also calculates the amount of runnable load which should be moved * to restore balance. * - * @env: The load balancing environment. - * * Return: - The busiest group if imbalance exists. */ static struct sched_group *find_busiest_group(struct lb_env *env) @@ -8978,8 +10052,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - capacity = capacity_of(i); nr_running = rq->cfs.h_nr_running; + if (!nr_running) + continue; + + capacity = capacity_of(i); /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -8988,7 +10065,13 @@ static struct rq *find_busiest_queue(struct lb_env *env, * average load. */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && - capacity_of(env->dst_cpu) < capacity && + !capacity_greater(capacity_of(env->dst_cpu), capacity) && + nr_running == 1) + continue; + + /* Make sure we only pull tasks from a CPU of lower priority */ + if ((env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(i, env->dst_cpu) && nr_running == 1) continue; @@ -9025,7 +10108,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, break; case migrate_util: - util = cpu_util(cpu_of(rq)); + util = cpu_util_cfs(i); + + /* + * Don't try to pull utilization from a CPU with one + * running task. Whatever its utilization, we will fail + * detach the task. + */ + if (nr_running <= 1) + continue; if (busiest_util < util) { busiest_util = util; @@ -9077,13 +10168,32 @@ asym_active_balance(struct lb_env *env) } static inline bool -voluntary_active_balance(struct lb_env *env) +imbalanced_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + /* + * The imbalanced case includes the case of pinned tasks preventing a fair + * distribution of the load on the system but also the even distribution of the + * threads on a system with spare capacity + */ + if ((env->migration_type == migrate_task) && + (sd->nr_balance_failed > sd->cache_nice_tries+2)) + return 1; + + return 0; +} + +static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; if (asym_active_balance(env)) return 1; + if (imbalanced_active_balance(env)) + return 1; + /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. * It's worth migrating the task if the src_cpu's capacity is reduced @@ -9103,22 +10213,12 @@ voluntary_active_balance(struct lb_env *env) return 0; } -static int need_active_balance(struct lb_env *env) -{ - struct sched_domain *sd = env->sd; - - if (voluntary_active_balance(env)) - return 1; - - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); -} - static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - int cpu, balance_cpu = -1; + int cpu; /* * Ensure the balancing environment is consistent; can happen @@ -9130,27 +10230,27 @@ static int should_we_balance(struct lb_env *env) /* * In the newly idle case, we will allow all the CPUs * to do the newly idle load balance. + * + * However, we bail out if we already have tasks or a wakeup pending, + * to optimize wakeup latency. */ - if (env->idle == CPU_NEWLY_IDLE) + if (env->idle == CPU_NEWLY_IDLE) { + if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending) + return 0; return 1; + } /* Try to find first idle CPU */ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { if (!idle_cpu(cpu)) continue; - balance_cpu = cpu; - break; + /* Are we the first idle CPU? */ + return cpu == env->dst_cpu; } - if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); - - /* - * First idle CPU or the first CPU(busiest) in this sched group - * is eligible for doing load balancing at this and above domains. - */ - return balance_cpu == env->dst_cpu; + /* Are we the first CPU of this group ? */ + return group_balance_cpu(sg) == env->dst_cpu; } /* @@ -9167,14 +10267,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); - struct lb_env env = { .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, .dst_grpmask = sched_group_span(sd->groups), .idle = idle, - .loop_break = sched_nr_migrate_break, + .loop_break = SCHED_NR_MIGRATE_BREAK, .cpus = cpus, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), @@ -9202,7 +10301,7 @@ redo: goto out_balanced; } - BUG_ON(busiest == env.dst_rq); + WARN_ON_ONCE(busiest == env.dst_rq); schedstat_add(sd->lb_imbalance[idle], env.imbalance); @@ -9210,6 +10309,8 @@ redo: env.src_rq = busiest; ld_moved = 0; + /* Clear this flag as soon as we find a pullable task */ + env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -9217,7 +10318,6 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - env.flags |= LBF_ALL_PINNED; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -9249,7 +10349,9 @@ more_balance: if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; - goto more_balance; + /* Stop if we tried all running tasks */ + if (env.loop < busiest->nr_running) + goto more_balance; } /* @@ -9266,7 +10368,7 @@ more_balance: * load to given_cpu. In rare situations, this may cause * conflicts (balance_cpu and given_cpu/ilb_cpu deciding * _independently_ and at _same_ time to move some load to - * given_cpu) causing exceess load to be moved to given_cpu. + * given_cpu) causing excess load to be moved to given_cpu. * This however should not happen so much in practice and * moreover subsequent load balance cycles should correct the * excess load moved. @@ -9280,7 +10382,7 @@ more_balance: env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_DST_PINNED; env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; /* * Go back to "more_balance" rather than "redo" since we @@ -9312,7 +10414,7 @@ more_balance: */ if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; goto redo; } goto out_all_pinned; @@ -9333,7 +10435,7 @@ more_balance: if (need_active_balance(&env)) { unsigned long flags; - raw_spin_lock_irqsave(&busiest->lock, flags); + raw_spin_rq_lock_irqsave(busiest, flags); /* * Don't kick the active_load_balance_cpu_stop, @@ -9341,12 +10443,13 @@ more_balance: * moved to this_cpu: */ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { - raw_spin_unlock_irqrestore(&busiest->lock, - flags); - env.flags |= LBF_ALL_PINNED; + raw_spin_rq_unlock_irqrestore(busiest, flags); goto out_one_pinned; } + /* Record that we found at least one task that could run on this_cpu */ + env.flags &= ~LBF_ALL_PINNED; + /* * ->active_balance synchronizes accesses to * ->active_balance_work. Once set, it's cleared @@ -9357,32 +10460,21 @@ more_balance: busiest->push_cpu = this_cpu; active_balance = 1; } - raw_spin_unlock_irqrestore(&busiest->lock, flags); + raw_spin_rq_unlock_irqrestore(busiest, flags); if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); } - - /* We've kicked active balancing, force task migration. */ - sd->nr_balance_failed = sd->cache_nice_tries+1; } - } else + } else { sd->nr_balance_failed = 0; + } - if (likely(!active_balance) || voluntary_active_balance(&env)) { + if (likely(!active_balance) || need_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * detach_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; } goto out; @@ -9416,7 +10508,7 @@ out_one_pinned: /* * newidle_balance() disregards balance intervals, so we could * repeatedly reach this code, which would lead to balance_interval - * skyrocketting in a short amount of time. Skip the balance_interval + * skyrocketing in a short amount of time. Skip the balance_interval * increase logic to avoid that. */ if (env.idle == CPU_NEWLY_IDLE) @@ -9441,6 +10533,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); + + /* + * Reduce likelihood of busy balancing at higher domains racing with + * balancing at lower domains by preventing their balancing periods + * from being multiples of each other. + */ + if (cpu_busy) + interval -= 1; + interval = clamp(interval, 1UL, max_load_balance_interval); return interval; @@ -9498,14 +10599,13 @@ static int active_load_balance_cpu_stop(void *data) * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-CPU setup. */ - BUG_ON(busiest_rq == target_rq); + WARN_ON_ONCE(busiest_rq == target_rq); /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; } if (likely(sd)) { @@ -9516,13 +10616,7 @@ static int active_load_balance_cpu_stop(void *data) .src_cpu = busiest_rq->cpu, .src_rq = busiest_rq, .idle = CPU_IDLE, - /* - * can_migrate_task() doesn't need to compute new_dst_cpu - * for active balancing. Since we have CPU_IDLE, but no - * @dst_grpmask we need to make that test go away with lying - * about DST_PINNED. - */ - .flags = LBF_DST_PINNED, + .flags = LBF_ACTIVE_LB, }; schedstat_inc(sd->alb_count); @@ -9561,6 +10655,30 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } +static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +{ + if (cost > sd->max_newidle_lb_cost) { + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + sd->max_newidle_lb_cost = cost; + sd->last_decay_max_lb_cost = jiffies; + } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + /* + * Decay the newidle max times by ~1% per second to ensure that + * it is not outdated and the current max cost is actually + * shorter. + */ + sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; + sd->last_decay_max_lb_cost = jiffies; + + return true; + } + + return false; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -9584,19 +10702,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) for_each_domain(cpu, sd) { /* * Decay the newidle max times here because this is a regular - * visit to all the domains. Decay ~1% per second. + * visit to all the domains. */ - if (time_after(jiffies, sd->next_decay_max_lb_cost)) { - sd->max_newidle_lb_cost = - (sd->max_newidle_lb_cost * 253) / 256; - sd->next_decay_max_lb_cost = jiffies + HZ; - need_decay = 1; - } + need_decay = update_newidle_cost(sd, 0); max_cost += sd->max_newidle_lb_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more @@ -9652,22 +10762,9 @@ out: * When the cpu is attached to null domain for ex, it will not be * updated. */ - if (likely(update_next_balance)) { + if (likely(update_next_balance)) rq->next_balance = next_balance; -#ifdef CONFIG_NO_HZ_COMMON - /* - * If this CPU has been elected to perform the nohz idle - * balance. Other idle CPUs have already rebalanced with - * nohz_idle_balance() and nohz.next_balance has been - * updated accordingly. This CPU is now running the idle load - * balance for itself and we need to update the - * nohz.next_balance accordingly. - */ - if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) - nohz.next_balance = rq->next_balance; -#endif - } } static inline int on_null_domain(struct rq *rq) @@ -9681,16 +10778,22 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set * anywhere yet. */ static inline int find_new_ilb(void) { int ilb; + const struct cpumask *hk_mask; + + hk_mask = housekeeping_cpumask(HK_TYPE_MISC); + + for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { + + if (ilb == smp_processor_id()) + continue; - for_each_cpu_and(ilb, nohz.idle_cpus_mask, - housekeeping_cpumask(HK_FLAG_MISC)) { if (idle_cpu(ilb)) return ilb; } @@ -9700,30 +10803,38 @@ static inline int find_new_ilb(void) /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). + * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { int ilb_cpu; - nohz.next_balance++; + /* + * Increase nohz.next_balance only when if full ilb is triggered but + * not if we only update stats. + */ + if (flags & NOHZ_BALANCE_KICK) + nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); if (ilb_cpu >= nr_cpu_ids) return; + /* + * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets + * the first flag owns it; cleared by nohz_csd_func(). + */ flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); if (flags & NOHZ_KICK_MASK) return; /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target CPU which + * This way we generate an IPI on the target CPU which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI. */ - smp_send_reschedule(ilb_cpu); + smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); } /* @@ -9762,7 +10873,7 @@ static void nohz_balancer_kick(struct rq *rq) goto out; if (rq->nr_running >= 2) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto out; } @@ -9776,7 +10887,7 @@ static void nohz_balancer_kick(struct rq *rq) * on. */ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } @@ -9790,7 +10901,7 @@ static void nohz_balancer_kick(struct rq *rq) */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym_prefer(i, cpu)) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } @@ -9803,7 +10914,7 @@ static void nohz_balancer_kick(struct rq *rq) * to run the misfit task on. */ if (check_misfit_status(rq, sd)) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -9830,13 +10941,16 @@ static void nohz_balancer_kick(struct rq *rq) */ nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } unlock: rcu_read_unlock(); out: + if (READ_ONCE(nohz.needs_update)) + flags |= NOHZ_NEXT_KICK; + if (flags) kick_ilb(flags); } @@ -9902,7 +11016,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) + if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) return; /* @@ -9933,12 +11047,13 @@ void nohz_balance_enter_idle(int cpu) /* * Ensures that if nohz_idle_balance() fails to observe our * @idle_cpus_mask store, it must observe the @has_blocked - * store. + * and @needs_update stores. */ smp_mb__after_atomic(); set_cpu_sd_state_idle(cpu); + WRITE_ONCE(nohz.needs_update, 1); out: /* * Each time a cpu enter idle, we assume that it has blocked load and @@ -9947,15 +11062,30 @@ out: WRITE_ONCE(nohz.has_blocked, 1); } +static bool update_nohz_stats(struct rq *rq) +{ + unsigned int cpu = rq->cpu; + + if (!rq->has_blocked_load) + return false; + + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return false; + + if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) + return true; + + update_blocked_averages(cpu); + + return rq->has_blocked_load; +} + /* * Internal function that runs load balance for all idle cpus. The load balance * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. - * The function returns false if the loop has stopped before running - * through all idle CPUs. */ -static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, - enum cpu_idle_type idle) +static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) { /* Earliest time when we have to do rebalance again */ unsigned long now = jiffies; @@ -9964,7 +11094,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, int update_next_balance = 0; int this_cpu = this_rq->cpu; int balance_cpu; - int ret = false; struct rq *rq; SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); @@ -9972,12 +11101,17 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, /* * We assume there will be no idle load after this update and clear * the has_blocked flag. If a cpu enters idle in the mean time, it will - * set the has_blocked flag and trig another update of idle load. + * set the has_blocked flag and trigger another update of idle load. * Because a cpu that becomes idle, is added to idle_cpus_mask before * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. + * + * Same applies to idle_cpus_mask vs needs_update. */ - WRITE_ONCE(nohz.has_blocked, 0); + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.has_blocked, 0); + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 0); /* * Ensures that if we miss the CPU, we must see the has_blocked @@ -9985,8 +11119,12 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, */ smp_mb(); - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { - if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) + /* + * Start with the next CPU after this_cpu so we will end with this_cpu and let a + * chance for other idle cpu to pull load. + */ + for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) { + if (!idle_cpu(balance_cpu)) continue; /* @@ -9995,13 +11133,17 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, * balancing owner will pick it up. */ if (need_resched()) { - has_blocked_load = true; + if (flags & NOHZ_STATS_KICK) + has_blocked_load = true; + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 1); goto abort; } rq = cpu_rq(balance_cpu); - has_blocked_load |= update_nohz_stats(rq, true); + if (flags & NOHZ_STATS_KICK) + has_blocked_load |= update_nohz_stats(rq); /* * If time for next balance is due, @@ -10024,26 +11166,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, } } - /* Newly idle CPU doesn't need an update */ - if (idle != CPU_NEWLY_IDLE) { - update_blocked_averages(this_cpu); - has_blocked_load |= this_rq->has_blocked_load; - } - - if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(this_rq, CPU_IDLE); - - WRITE_ONCE(nohz.next_blocked, - now + msecs_to_jiffies(LOAD_AVG_PERIOD)); - - /* The full idle balance loop has been done */ - ret = true; - -abort: - /* There is still blocked load, enable periodic update */ - if (has_blocked_load) - WRITE_ONCE(nohz.has_blocked, 1); - /* * next_balance will be updated only when there is a need. * When the CPU is attached to null domain for ex, it will not be @@ -10052,7 +11174,14 @@ abort: if (likely(update_next_balance)) nohz.next_balance = next_balance; - return ret; + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.next_blocked, + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); + +abort: + /* There is still blocked load, enable periodic update */ + if (has_blocked_load) + WRITE_ONCE(nohz.has_blocked, 1); } /* @@ -10061,27 +11190,39 @@ abort: */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - int this_cpu = this_rq->cpu; - unsigned int flags; + unsigned int flags = this_rq->nohz_idle_balance; - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) + if (!flags) return false; - if (idle != CPU_IDLE) { - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - return false; - } + this_rq->nohz_idle_balance = 0; - /* could be _relaxed() */ - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - if (!(flags & NOHZ_KICK_MASK)) + if (idle != CPU_IDLE) return false; - _nohz_idle_balance(this_rq, flags, idle); + _nohz_idle_balance(this_rq, flags); return true; } +/* + * Check if we need to run the ILB for updating blocked load before entering + * idle state. + */ +void nohz_run_idle_balance(int cpu) +{ + unsigned int flags; + + flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu)); + + /* + * Update the blocked load only if no SCHED_SOFTIRQ is about to happen + * (ie NOHZ_STATS_KICK set) and will do the same. + */ + if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); +} + static void nohz_newidle_balance(struct rq *this_rq) { int this_cpu = this_rq->cpu; @@ -10090,7 +11231,7 @@ static void nohz_newidle_balance(struct rq *this_rq) * This CPU doesn't want to be disturbed by scheduler * housekeeping */ - if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) + if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) return; /* Will wake up very soon. No time for doing anything else*/ @@ -10102,16 +11243,11 @@ static void nohz_newidle_balance(struct rq *this_rq) time_before(jiffies, READ_ONCE(nohz.next_blocked))) return; - raw_spin_unlock(&this_rq->lock); /* - * This CPU is going to be idle and blocked load of idle CPUs - * need to be updated. Run the ilb locally as it is a good - * candidate for ilb instead of waking up another idle CPU. - * Kick an normal ilb if we failed to do the update. + * Set the need to trigger ILB in order to update blocked load + * before entering idle state. */ - if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) - kick_ilb(NOHZ_STATS_KICK); - raw_spin_lock(&this_rq->lock); + atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); } #else /* !CONFIG_NO_HZ_COMMON */ @@ -10126,7 +11262,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * idle_balance is called by schedule() if this_cpu is about to become + * newidle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: @@ -10134,15 +11270,23 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; + u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0; - u64 curr_cost = 0; update_misfit_status(NULL, this_rq); + + /* + * There is a task waiting to run. No need to search for one. + * Return 0; the task will be enqueued when switching to idle. + */ + if (this_rq->ttwu_pending) + return 0; + /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -10163,67 +11307,64 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) */ rq_unpin_lock(this_rq, rf); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !READ_ONCE(this_rq->rd->overload)) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + + if (!READ_ONCE(this_rq->rd->overload) || + (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) update_next_balance(sd, &next_balance); rcu_read_unlock(); - nohz_newidle_balance(this_rq); - goto out; } + rcu_read_unlock(); - raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); + t0 = sched_clock_cpu(this_cpu); update_blocked_averages(this_cpu); + rcu_read_lock(); for_each_domain(this_cpu, sd) { int continue_balancing = 1; - u64 t0, domain_cost; + u64 domain_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; + update_next_balance(sd, &next_balance); - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, &next_balance); + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) break; - } if (sd->flags & SD_BALANCE_NEWIDLE) { - t0 = sched_clock_cpu(this_cpu); pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); - domain_cost = sched_clock_cpu(this_cpu) - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; + t1 = sched_clock_cpu(this_cpu); + domain_cost = t1 - t0; + update_newidle_cost(sd, domain_cost); curr_cost += domain_cost; + t0 = t1; } - update_next_balance(sd, &next_balance); - /* * Stop searching for tasks to pull if there are * now runnable tasks on this rq. */ - if (pulled_task || this_rq->nr_running > 0) + if (pulled_task || this_rq->nr_running > 0 || + this_rq->ttwu_pending) break; } rcu_read_unlock(); - raw_spin_lock(&this_rq->lock); + raw_spin_rq_lock(this_rq); if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; -out: /* * While browsing the domains, we released the rq lock, a task could * have been enqueued in the meantime. Since we're not going idle, @@ -10232,16 +11373,19 @@ out: if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; - /* Move the next balance forward */ - if (time_after(this_rq->next_balance, next_balance)) - this_rq->next_balance = next_balance; - /* Is there a task of a high priority class? */ if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) + this_rq->next_balance = next_balance; + if (pulled_task) this_rq->idle_stamp = 0; + else + nohz_newidle_balance(this_rq); rq_repin_lock(this_rq, rf); @@ -10279,8 +11423,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* + * Don't need to rebalance while attached to NULL domain or + * runqueue CPU is not active + */ + if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq)))) return; if (time_after_eq(jiffies, rq->next_balance)) @@ -10306,6 +11453,119 @@ static void rq_offline_fair(struct rq *rq) #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_CORE +static inline bool +__entity_slice_used(struct sched_entity *se, int min_nr_tasks) +{ + u64 slice = sched_slice(cfs_rq_of(se), se); + u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; + + return (rtime * min_nr_tasks > slice); +} + +#define MIN_NR_TASKS_DURING_FORCEIDLE 2 +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) +{ + if (!sched_core_enabled(rq)) + return; + + /* + * If runqueue has only one task which used up its slice and + * if the sibling is forced idle, then trigger schedule to + * give forced idle task a chance. + * + * sched_slice() considers only this active rq and it gets the + * whole slice. But during force idle, we have siblings acting + * like a single runqueue and hence we need to consider runnable + * tasks on this CPU and the forced idle CPU. Ideally, we should + * go through the forced idle rq, but that would be a perf hit. + * We can assume that the forced idle CPU has at least + * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check + * if we need to give up the CPU. + */ + if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && + __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) + resched_curr(rq); +} + +/* + * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. + */ +static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (forceidle) { + if (cfs_rq->forceidle_seq == fi_seq) + break; + cfs_rq->forceidle_seq = fi_seq; + } + + cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; + } +} + +void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) +{ + struct sched_entity *se = &p->se; + + if (p->sched_class != &fair_sched_class) + return; + + se_fi_update(se, rq->core->core_forceidle_seq, in_fi); +} + +bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +{ + struct rq *rq = task_rq(a); + struct sched_entity *sea = &a->se; + struct sched_entity *seb = &b->se; + struct cfs_rq *cfs_rqa; + struct cfs_rq *cfs_rqb; + s64 delta; + + SCHED_WARN_ON(task_rq(b)->core != rq->core); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Find an se in the hierarchy for tasks a and b, such that the se's + * are immediate siblings. + */ + while (sea->cfs_rq->tg != seb->cfs_rq->tg) { + int sea_depth = sea->depth; + int seb_depth = seb->depth; + + if (sea_depth >= seb_depth) + sea = parent_entity(sea); + if (sea_depth <= seb_depth) + seb = parent_entity(seb); + } + + se_fi_update(sea, rq->core->core_forceidle_seq, in_fi); + se_fi_update(seb, rq->core->core_forceidle_seq, in_fi); + + cfs_rqa = sea->cfs_rq; + cfs_rqb = seb->cfs_rq; +#else + cfs_rqa = &task_rq(a)->cfs; + cfs_rqb = &task_rq(b)->cfs; +#endif + + /* + * Find delta after normalizing se's vruntime with its cfs_rq's + * min_vruntime_fi, which would have been updated in prior calls + * to se_fi_update(). + */ + delta = (s64)(sea->vruntime - seb->vruntime) + + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + + return delta > 0; +} +#else +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} +#endif + /* * scheduler tick hitting a task of our scheduling class. * @@ -10329,6 +11589,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) update_misfit_status(curr, rq); update_overutilized_status(task_rq(curr)); + + task_tick_core(rq, curr); } /* @@ -10385,7 +11647,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (rq->curr == p) { + if (task_current(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else @@ -10414,7 +11676,7 @@ static inline bool vruntime_normalized(struct task_struct *p) * waiting for actually being woken up by sched_ttwu_pending(). */ if (!se->sum_exec_runtime || - (p->state == TASK_WAKING && p->sched_remote_wakeup)) + (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup)) return true; return false; @@ -10427,7 +11689,13 @@ static inline bool vruntime_normalized(struct task_struct *p) */ static void propagate_entity_cfs_rq(struct sched_entity *se) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + return; + + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ se = se->parent; @@ -10435,10 +11703,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + update_load_avg(cfs_rq, se, UPDATE_TG); + if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(cfs_rq, se, UPDATE_TG); + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); } } #else @@ -10449,10 +11720,21 @@ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); +#ifdef CONFIG_SMP + /* + * In case the task sched_avg hasn't been attached: + * - A forked task which hasn't been woken up by wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() but is + * waiting for actually being woken up by sched_ttwu_pending(). + */ + if (!se->avg.last_update_time) + return; +#endif + /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } @@ -10460,18 +11742,10 @@ static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } @@ -10518,7 +11792,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); else check_preempt_curr(rq, p, 0); @@ -10556,55 +11830,36 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifndef CONFIG_64BIT - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif + u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_set_group_fair(struct task_struct *p) +static void task_change_group_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - - set_task_rq(p, task_cpu(p)); - se->depth = se->parent ? se->parent->depth + 1 : 0; -} + /* + * We couldn't detach or attach a forked task which + * hasn't been woken up by wake_up_new_task(). + */ + if (READ_ONCE(p->__state) == TASK_NEW) + return; -static void task_move_group_fair(struct task_struct *p) -{ detach_task_cfs_rq(p); - set_task_rq(p, task_cpu(p)); #ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; #endif + set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } -static void task_change_group_fair(struct task_struct *p, int type) -{ - switch (type) { - case TASK_SET_GROUP: - task_set_group_fair(p); - break; - - case TASK_MOVE_GROUP: - task_move_group_fair(p); - break; - } -} - void free_fair_sched_group(struct task_group *tg) { int i; - destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); - for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -10639,7 +11894,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!cfs_rq) goto err; - se = kzalloc_node(sizeof(struct sched_entity), + se = kzalloc_node(sizeof(struct sched_entity_stats), GFP_KERNEL, cpu_to_node(i)); if (!se) goto err_free_rq; @@ -10681,6 +11936,8 @@ void unregister_fair_sched_group(struct task_group *tg) struct rq *rq; int cpu; + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + for_each_possible_cpu(cpu) { if (tg->se[cpu]) remove_entity_load_avg(tg->se[cpu]); @@ -10694,9 +11951,9 @@ void unregister_fair_sched_group(struct task_group *tg) rq = cpu_rq(cpu); - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); } } @@ -10733,10 +11990,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, static DEFINE_MUTEX(shares_mutex); -int sched_group_set_shares(struct task_group *tg, unsigned long shares) +static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + lockdep_assert_held(&shares_mutex); + /* * We can't change the weight of the root cgroup. */ @@ -10745,9 +12004,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - mutex_lock(&shares_mutex); if (tg->shares == shares) - goto done; + return 0; tg->shares = shares; for_each_possible_cpu(i) { @@ -10765,10 +12023,96 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) rq_unlock_irqrestore(rq, &rf); } -done: + return 0; +} + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int ret; + + mutex_lock(&shares_mutex); + if (tg_is_idle(tg)) + ret = -EINVAL; + else + ret = __sched_group_set_shares(tg, shares); + mutex_unlock(&shares_mutex); + + return ret; +} + +int sched_group_set_idle(struct task_group *tg, long idle) +{ + int i; + + if (tg == &root_task_group) + return -EINVAL; + + if (idle < 0 || idle > 1) + return -EINVAL; + + mutex_lock(&shares_mutex); + + if (tg->idle == idle) { + mutex_unlock(&shares_mutex); + return 0; + } + + tg->idle = idle; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se = tg->se[i]; + struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i]; + bool was_idle = cfs_rq_is_idle(grp_cfs_rq); + long idle_task_delta; + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + + grp_cfs_rq->idle = idle; + if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) + goto next_cpu; + + if (se->on_rq) { + parent_cfs_rq = cfs_rq_of(se); + if (cfs_rq_is_idle(grp_cfs_rq)) + parent_cfs_rq->idle_nr_running++; + else + parent_cfs_rq->idle_nr_running--; + } + + idle_task_delta = grp_cfs_rq->h_nr_running - + grp_cfs_rq->idle_h_nr_running; + if (!cfs_rq_is_idle(grp_cfs_rq)) + idle_task_delta *= -1; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!se->on_rq) + break; + + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* Already accounted at parent level and above. */ + if (cfs_rq_is_idle(cfs_rq)) + break; + } + +next_cpu: + rq_unlock_irqrestore(rq, &rf); + } + + /* Idle groups have minimum weight. */ + if (tg_is_idle(tg)) + __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO)); + else + __sched_group_set_shares(tg, NICE_0_LOAD); + mutex_unlock(&shares_mutex); return 0; } + #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } @@ -10803,8 +12147,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class = { - .next = &idle_sched_class, +DEFINE_SCHED_CLASS(fair) = { + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, @@ -10818,6 +12162,7 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .balance = balance_fair, + .pick_task = pick_task_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -10887,6 +12232,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { #ifdef CONFIG_SMP + int i; + + for_each_possible_cpu(i) { + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + } + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ_COMMON @@ -10897,83 +12249,3 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } - -/* - * Helper functions to facilitate extracting info from tracepoints. - */ - -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) -{ -#ifdef CONFIG_SMP - return cfs_rq ? &cfs_rq->avg : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); - -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) -{ - if (!cfs_rq) { - if (str) - strlcpy(str, "(null)", len); - else - return NULL; - } - - cfs_rq_tg_path(cfs_rq, str, len); - return str; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); - -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) -{ - return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); - -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_rt : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); - -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_dl : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); - -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) - return rq ? &rq->avg_irq : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); - -int sched_trace_rq_cpu(struct rq *rq) -{ - return rq ? cpu_of(rq) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); - -const struct cpumask *sched_trace_rd_span(struct root_domain *rd) -{ -#ifdef CONFIG_SMP - return rd ? rd->span : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rd_span); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 7481cd96f391..ee7f23c76bd3 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -27,7 +27,7 @@ SCHED_FEAT(NEXT_BUDDY, false) SCHED_FEAT(LAST_BUDDY, true) /* - * Consider buddies to be cache hot, decreases the likelyness of a + * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality. */ SCHED_FEAT(CACHE_HOT_BUDDY, true) @@ -38,6 +38,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) +SCHED_FEAT(HRTICK_DL, false) SCHED_FEAT(DOUBLE_TICK, false) /* @@ -45,17 +46,22 @@ SCHED_FEAT(DOUBLE_TICK, false) */ SCHED_FEAT(NONTASK_CAPACITY, true) +#ifdef CONFIG_PREEMPT_RT +SCHED_FEAT(TTWU_QUEUE, false) +#else + /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, true) +#endif /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_AVG_CPU, false) -SCHED_FEAT(SIS_PROP, true) +SCHED_FEAT(SIS_PROP, false) +SCHED_FEAT(SIS_UTIL, true) /* * Issue a WARN when we do multiple update_rq_clock() calls @@ -77,7 +83,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(RT_RUNTIME_SHARE, false) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) @@ -90,3 +96,8 @@ SCHED_FEAT(WA_BIAS, true) */ SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(UTIL_EST_FASTUP, true) + +SCHED_FEAT(LATENCY_WARN, false) + +SCHED_FEAT(ALT_PERIOD, true) +SCHED_FEAT(BASE_SLICE, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index b743bf38f08f..f26ab2675f7d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -6,9 +6,6 @@ * (NOTE: these are not related to SCHED_IDLE batch scheduled * tasks which are handled in sched/fair.c ) */ -#include "sched.h" - -#include <trace/events/power.h> /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; @@ -54,17 +51,18 @@ __setup("hlt", cpu_idle_nopoll_setup); static noinline int __cpuidle cpu_idle_poll(void) { - rcu_idle_enter(); - trace_cpu_idle_rcuidle(0, smp_processor_id()); - local_irq_enable(); + trace_cpu_idle(0, smp_processor_id()); stop_critical_timings(); + ct_idle_enter(); + local_irq_enable(); while (!tif_need_resched() && - (cpu_idle_force_poll || tick_check_broadcast_expired())) + (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); + + ct_idle_exit(); start_critical_timings(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); - rcu_idle_exit(); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); return 1; } @@ -77,7 +75,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; - local_irq_enable(); + raw_local_irq_enable(); } /** @@ -90,12 +88,52 @@ void __cpuidle default_idle_call(void) if (current_clr_polling_and_test()) { local_irq_enable(); } else { + + trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); + + /* + * arch_cpu_idle() is supposed to enable IRQs, however + * we can't do that because of RCU and tracing. + * + * Trace IRQs enable here, then switch off RCU, and have + * arch_cpu_idle() use raw_local_irq_enable(). Note that + * ct_idle_enter() relies on lockdep IRQ state, so switch that + * last -- this is very similar to the entry code. + */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + ct_idle_enter(); + lockdep_hardirqs_on(_THIS_IP_); + arch_cpu_idle(); + + /* + * OK, so IRQs are enabled here, but RCU needs them disabled to + * turn itself back on.. funny thing is that disabling IRQs + * will cause tracing, which needs RCU. Jump through hoops to + * make it 'work'. + */ + raw_local_irq_disable(); + lockdep_hardirqs_off(_THIS_IP_); + ct_idle_exit(); + lockdep_hardirqs_on(_THIS_IP_); + raw_local_irq_enable(); + start_critical_timings(); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } } +static int call_cpuidle_s2idle(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + if (current_clr_polling_and_test()) + return -EBUSY; + + return cpuidle_enter_s2idle(drv, dev); +} + static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, int next_state) { @@ -122,7 +160,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * * NOTE: no locks or semaphores should be used here * - * On archs that support TIF_POLLING_NRFLAG, is called with polling + * On architectures that support TIF_POLLING_NRFLAG, is called with polling * set, and it returns with polling set. If it ever stops polling, it * must clear the polling bit. */ @@ -149,7 +187,6 @@ static void cpuidle_idle_call(void) if (cpuidle_not_available(drv, dev)) { tick_nohz_idle_stop_tick(); - rcu_idle_enter(); default_idle_call(); goto exit_idle; @@ -159,7 +196,7 @@ static void cpuidle_idle_call(void) * Suspend-to-idle ("s2idle") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only * activity happens here and in interrupts (if any). In that case bypass - * the cpuidle governor and go stratight for the deepest idle state + * the cpuidle governor and go straight for the deepest idle state * available. Possibly also suspend the local tick and the entire * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. @@ -169,15 +206,10 @@ static void cpuidle_idle_call(void) u64 max_latency_ns; if (idle_should_enter_s2idle()) { - rcu_idle_enter(); - entered_state = cpuidle_enter_s2idle(drv, dev); - if (entered_state > 0) { - local_irq_enable(); + entered_state = call_cpuidle_s2idle(drv, dev); + if (entered_state > 0) goto exit_idle; - } - - rcu_idle_exit(); max_latency_ns = U64_MAX; } else { @@ -185,7 +217,6 @@ static void cpuidle_idle_call(void) } tick_nohz_idle_stop_tick(); - rcu_idle_enter(); next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); @@ -202,8 +233,6 @@ static void cpuidle_idle_call(void) else tick_nohz_idle_retain_tick(); - rcu_idle_enter(); - entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome @@ -219,8 +248,6 @@ exit_idle: */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); - - rcu_idle_exit(); } /* @@ -231,6 +258,12 @@ exit_idle: static void do_idle(void) { int cpu = smp_processor_id(); + + /* + * Check if we need to update blocked load + */ + nohz_run_idle_balance(cpu); + /* * If the arch has a polling bit, we maintain an invariant: * @@ -255,6 +288,7 @@ static void do_idle(void) } arch_cpu_idle_enter(); + rcu_nocb_flush_deferred_wakeup(); /* * In poll mode we reenable interrupts and spin. Also if we @@ -289,7 +323,11 @@ static void do_idle(void) */ smp_mb__after_atomic(); - sched_ttwu_pending(); + /* + * RCU relies on this call to be done outside of an RCU read-side + * critical section. + */ + flush_smp_call_function_queue(); schedule_idle(); if (unlikely(klp_patch_pending(current))) @@ -330,6 +368,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); WARN_ON_ONCE(!duration_ns); + WARN_ON_ONCE(current->mm); rcu_sleep_check(); preempt_disable(); @@ -337,10 +376,10 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) cpuidle_use_deepest_state(latency_ns); it.done = 0; - hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); it.timer.function = idle_inject_timer_fn; hrtimer_start(&it.timer, ns_to_ktime(duration_ns), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); while (!READ_ONCE(it.done)) do_idle(); @@ -367,7 +406,7 @@ void cpu_startup_entry(enum cpuhp_state state) #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } @@ -397,6 +436,13 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir schedstat_inc(rq->sched_goidle); } +#ifdef CONFIG_SMP +static struct task_struct *pick_task_idle(struct rq *rq) +{ + return rq->idle; +} +#endif + struct task_struct *pick_next_task_idle(struct rq *rq) { struct task_struct *next = rq->idle; @@ -413,10 +459,10 @@ struct task_struct *pick_next_task_idle(struct rq *rq) static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) { - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); - raw_spin_lock_irq(&rq->lock); + raw_spin_rq_lock_irq(rq); } /* @@ -442,11 +488,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) BUG(); } -static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_idle(struct rq *rq) { } @@ -454,8 +495,8 @@ static void update_curr_idle(struct rq *rq) /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class = { - /* .next is NULL */ +DEFINE_SCHED_CLASS(idle) = { + /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ @@ -469,14 +510,13 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .balance = balance_idle, + .pick_task = pick_task_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif .task_tick = task_tick_idle, - .get_rr_interval = get_rr_interval_idle, - .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, .update_curr = update_curr_idle, diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 008d6ac2342b..373d42c707bc 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -7,140 +7,182 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ -#include "sched.h" + +enum hk_flags { + HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), + HK_FLAG_RCU = BIT(HK_TYPE_RCU), + HK_FLAG_MISC = BIT(HK_TYPE_MISC), + HK_FLAG_SCHED = BIT(HK_TYPE_SCHED), + HK_FLAG_TICK = BIT(HK_TYPE_TICK), + HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), + HK_FLAG_WQ = BIT(HK_TYPE_WQ), + HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), + HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD), +}; DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); EXPORT_SYMBOL_GPL(housekeeping_overridden); -static cpumask_var_t housekeeping_mask; -static unsigned int housekeeping_flags; -bool housekeeping_enabled(enum hk_flags flags) +struct housekeeping { + cpumask_var_t cpumasks[HK_TYPE_MAX]; + unsigned long flags; +}; + +static struct housekeeping housekeeping; + +bool housekeeping_enabled(enum hk_type type) { - return !!(housekeeping_flags & flags); + return !!(housekeeping.flags & BIT(type)); } EXPORT_SYMBOL_GPL(housekeeping_enabled); -int housekeeping_any_cpu(enum hk_flags flags) +int housekeeping_any_cpu(enum hk_type type) { int cpu; if (static_branch_unlikely(&housekeeping_overridden)) { - if (housekeeping_flags & flags) { - cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id()); + if (housekeeping.flags & BIT(type)) { + cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id()); if (cpu < nr_cpu_ids) return cpu; - return cpumask_any_and(housekeeping_mask, cpu_online_mask); + return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); } } return smp_processor_id(); } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); -const struct cpumask *housekeeping_cpumask(enum hk_flags flags) +const struct cpumask *housekeeping_cpumask(enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - return housekeeping_mask; + if (housekeeping.flags & BIT(type)) + return housekeeping.cpumasks[type]; return cpu_possible_mask; } EXPORT_SYMBOL_GPL(housekeeping_cpumask); -void housekeeping_affine(struct task_struct *t, enum hk_flags flags) +void housekeeping_affine(struct task_struct *t, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - set_cpus_allowed_ptr(t, housekeeping_mask); + if (housekeeping.flags & BIT(type)) + set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]); } EXPORT_SYMBOL_GPL(housekeeping_affine); -bool housekeeping_test_cpu(int cpu, enum hk_flags flags) +bool housekeeping_test_cpu(int cpu, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - return cpumask_test_cpu(cpu, housekeeping_mask); + if (housekeeping.flags & BIT(type)) + return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]); return true; } EXPORT_SYMBOL_GPL(housekeeping_test_cpu); void __init housekeeping_init(void) { - if (!housekeeping_flags) + enum hk_type type; + + if (!housekeeping.flags) return; static_branch_enable(&housekeeping_overridden); - if (housekeeping_flags & HK_FLAG_TICK) + if (housekeeping.flags & HK_FLAG_TICK) sched_tick_offload_init(); - /* We need at least one CPU to handle housekeeping work */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); + for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) { + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type])); + } +} + +static void __init housekeeping_setup_type(enum hk_type type, + cpumask_var_t housekeeping_staging) +{ + + alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]); + cpumask_copy(housekeeping.cpumasks[type], + housekeeping_staging); } -static int __init housekeeping_setup(char *str, enum hk_flags flags) +static int __init housekeeping_setup(char *str, unsigned long flags) { - cpumask_var_t non_housekeeping_mask; - cpumask_var_t tmp; - int err; + cpumask_var_t non_housekeeping_mask, housekeeping_staging; + int err = 0; + + if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) { + if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { + pr_warn("Housekeeping: nohz unsupported." + " Build with CONFIG_NO_HZ_FULL\n"); + return 0; + } + } alloc_bootmem_cpumask_var(&non_housekeeping_mask); - err = cpulist_parse(str, non_housekeeping_mask); - if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) { + if (cpulist_parse(str, non_housekeeping_mask) < 0) { pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; + goto free_non_housekeeping_mask; } - alloc_bootmem_cpumask_var(&tmp); - if (!housekeeping_flags) { - alloc_bootmem_cpumask_var(&housekeeping_mask); - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, non_housekeeping_mask); + alloc_bootmem_cpumask_var(&housekeeping_staging); + cpumask_andnot(housekeeping_staging, + cpu_possible_mask, non_housekeeping_mask); - cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); - if (cpumask_empty(tmp)) { + if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) { + __cpumask_set_cpu(smp_processor_id(), housekeeping_staging); + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); + if (!housekeeping.flags) { pr_warn("Housekeeping: must include one present CPU, " "using boot CPU:%d\n", smp_processor_id()); - __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); - __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); - } - } else { - cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); - if (cpumask_empty(tmp)) - __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); - cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); - if (!cpumask_equal(tmp, housekeeping_mask)) { - pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); - free_bootmem_cpumask_var(tmp); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; } } - free_bootmem_cpumask_var(tmp); - if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { - if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { - tick_nohz_full_setup(non_housekeeping_mask); - } else { - pr_warn("Housekeeping: nohz unsupported." - " Build with CONFIG_NO_HZ_FULL\n"); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; + if (!housekeeping.flags) { + /* First setup call ("nohz_full=" or "isolcpus=") */ + enum hk_type type; + + for_each_set_bit(type, &flags, HK_TYPE_MAX) + housekeeping_setup_type(type, housekeeping_staging); + } else { + /* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */ + enum hk_type type; + unsigned long iter_flags = flags & housekeeping.flags; + + for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) { + if (!cpumask_equal(housekeeping_staging, + housekeeping.cpumasks[type])) { + pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); + goto free_housekeeping_staging; + } } + + iter_flags = flags & ~housekeeping.flags; + + for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) + housekeeping_setup_type(type, housekeeping_staging); } - housekeeping_flags |= flags; + if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) + tick_nohz_full_setup(non_housekeeping_mask); + + housekeeping.flags |= flags; + err = 1; +free_housekeeping_staging: + free_bootmem_cpumask_var(housekeeping_staging); +free_non_housekeeping_mask: free_bootmem_cpumask_var(non_housekeeping_mask); - return 1; + return err; } static int __init housekeeping_nohz_full_setup(char *str) { - unsigned int flags; + unsigned long flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | + HK_FLAG_MISC | HK_FLAG_KTHREAD; return housekeeping_setup(str, flags); } @@ -148,7 +190,10 @@ __setup("nohz_full=", housekeeping_nohz_full_setup); static int __init housekeeping_isolcpus_setup(char *str) { - unsigned int flags = 0; + unsigned long flags = 0; + bool illegal = false; + char *par; + int len; while (isalpha(*str)) { if (!strncmp(str, "nohz,", 5)) { @@ -169,8 +214,22 @@ static int __init housekeeping_isolcpus_setup(char *str) continue; } - pr_warn("isolcpus: Error, unknown flag\n"); - return 0; + /* + * Skip unknown sub-parameter and validate that it is not + * containing an invalid character. + */ + for (par = str, len = 0; *str && *str != ','; str++, len++) { + if (!isalpha(*str) && *str != '_') + illegal = true; + } + + if (illegal) { + pr_warn("isolcpus: Invalid flag %.*s\n", len, par); + return 0; + } + + pr_info("isolcpus: Skipped unknown flag %.*s\n", len, par); + str++; } /* Default behaviour for isolcpus without flags */ diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index de22da666ac7..52c8f8226b0d 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,7 +6,6 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ -#include "sched.h" /* * Global load-average calculations @@ -81,7 +80,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) long nr_active, delta = 0; nr_active = this_rq->nr_running - adjust; - nr_active += (long)this_rq->nr_uninterruptible; + nr_active += (int)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; @@ -189,7 +188,7 @@ calc_load_n(unsigned long load, unsigned long exp, * w:0 1 1 0 0 1 1 0 0 * * This ensures we'll fold the old NO_HZ contribution in this window while - * accumlating the new one. + * accumulating the new one. * * - When we wake up from NO_HZ during the window, we push up our * contribution, since we effectively move our sample point to a known @@ -347,7 +346,7 @@ static inline void calc_global_nohz(void) { } * * Called from the global timer code. */ -void calc_global_load(unsigned long ticks) +void calc_global_load(void) { unsigned long sample_window; long active, delta; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 168479a7d61b..0c5be7ebb1dc 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -4,7 +4,134 @@ * * membarrier system call */ -#include "sched.h" + +/* + * For documentation purposes, here are some membarrier ordering + * scenarios to keep in mind: + * + * A) Userspace thread execution after IPI vs membarrier's memory + * barrier before sending the IPI + * + * Userspace variables: + * + * int x = 0, y = 0; + * + * The memory barrier at the start of membarrier() on CPU0 is necessary in + * order to enforce the guarantee that any writes occurring on CPU0 before + * the membarrier() is executed will be visible to any code executing on + * CPU1 after the IPI-induced memory barrier: + * + * CPU0 CPU1 + * + * x = 1 + * membarrier(): + * a: smp_mb() + * b: send IPI IPI-induced mb + * c: smp_mb() + * r2 = y + * y = 1 + * barrier() + * r1 = x + * + * BUG_ON(r1 == 0 && r2 == 0) + * + * The write to y and load from x by CPU1 are unordered by the hardware, + * so it's possible to have "r1 = x" reordered before "y = 1" at any + * point after (b). If the memory barrier at (a) is omitted, then "x = 1" + * can be reordered after (a) (although not after (c)), so we get r1 == 0 + * and r2 == 0. This violates the guarantee that membarrier() is + * supposed by provide. + * + * The timing of the memory barrier at (a) has to ensure that it executes + * before the IPI-induced memory barrier on CPU1. + * + * B) Userspace thread execution before IPI vs membarrier's memory + * barrier after completing the IPI + * + * Userspace variables: + * + * int x = 0, y = 0; + * + * The memory barrier at the end of membarrier() on CPU0 is necessary in + * order to enforce the guarantee that any writes occurring on CPU1 before + * the membarrier() is executed will be visible to any code executing on + * CPU0 after the membarrier(): + * + * CPU0 CPU1 + * + * x = 1 + * barrier() + * y = 1 + * r2 = y + * membarrier(): + * a: smp_mb() + * b: send IPI IPI-induced mb + * c: smp_mb() + * r1 = x + * BUG_ON(r1 == 0 && r2 == 1) + * + * The writes to x and y are unordered by the hardware, so it's possible to + * have "r2 = 1" even though the write to x doesn't execute until (b). If + * the memory barrier at (c) is omitted then "r1 = x" can be reordered + * before (b) (although not before (a)), so we get "r1 = 0". This violates + * the guarantee that membarrier() is supposed to provide. + * + * The timing of the memory barrier at (c) has to ensure that it executes + * after the IPI-induced memory barrier on CPU1. + * + * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * d: switch to kthread (includes mb) + * b: read rq->curr->mm == NULL + * e: switch to user (includes mb) + * c: smp_mb() + * + * Using the scenario from (A), we can show that (a) needs to be paired + * with (e). Using the scenario from (B), we can show that (c) needs to + * be paired with (d). + * + * D) exit_mm vs membarrier + * + * Two thread groups are created, A and B. Thread group B is created by + * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD. + * Let's assume we have a single thread within each thread group (Thread A + * and Thread B). Thread A runs on CPU0, Thread B runs on CPU1. + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * exit_mm(): + * d: smp_mb() + * e: current->mm = NULL + * b: read rq->curr->mm == NULL + * c: smp_mb() + * + * Using scenario (B), we can show that (c) needs to be paired with (d). + * + * E) kthread_{use,unuse}_mm vs membarrier + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * kthread_unuse_mm() + * d: smp_mb() + * e: current->mm = NULL + * b: read rq->curr->mm == NULL + * kthread_use_mm() + * f: current->mm = mm + * g: smp_mb() + * c: smp_mb() + * + * Using the scenario from (A), we can show that (a) needs to be paired + * with (g). Using the scenario from (B), we can show that (c) needs to + * be paired with (d). + */ /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, @@ -18,18 +145,57 @@ #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 #endif +#ifdef CONFIG_RSEQ +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ) +#else +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#endif + #define MEMBARRIER_CMD_BITMASK \ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ - | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) + | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK) static void ipi_mb(void *info) { smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_core(void *info) +{ + /* + * The smp_mb() in membarrier after all the IPIs is supposed to + * ensure that memory on remote CPUs that occur before the IPI + * become visible to membarrier()'s caller -- see scenario B in + * the big comment at the top of this file. + * + * A sync_core() would provide this guarantee, but + * sync_core_before_usermode() might end up being deferred until + * after membarrier()'s smp_mb(). + */ + smp_mb(); /* IPIs should be serializing but paranoid. */ + + sync_core_before_usermode(); +} + +static void ipi_rseq(void *info) +{ + /* + * Ensure that all stores done by the calling thread are visible + * to the current task before the current task resumes. We could + * probably optimize this away on most architectures, but by the + * time we've already sent an IPI, the cost of the extra smp_mb() + * is negligible. + */ + smp_mb(); + rseq_preempt(current); +} + static void ipi_sync_rq_state(void *info) { struct mm_struct *mm = (struct mm_struct *) info; @@ -63,6 +229,18 @@ void membarrier_exec_mmap(struct mm_struct *mm) this_cpu_write(runqueues.membarrier_state, 0); } +void membarrier_update_current_mm(struct mm_struct *next_mm) +{ + struct rq *rq = this_rq(); + int membarrier_state = 0; + + if (next_mm) + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} + static int membarrier_global_expedited(void) { int cpu; @@ -101,12 +279,11 @@ static int membarrier_global_expedited(void) continue; /* - * Skip the CPU if it runs a kernel thread. The scheduler - * leaves the prior task mm in place as an optimization when - * scheduling a kthread. + * Skip the CPU if it runs a kernel thread which is not using + * a task mm. */ p = rcu_dereference(cpu_rq(cpu)->curr); - if (p->flags & PF_KTHREAD) + if (!p->mm) continue; __cpumask_set_cpu(cpu, tmpmask); @@ -129,25 +306,35 @@ static int membarrier_global_expedited(void) return 0; } -static int membarrier_private_expedited(int flags) +static int membarrier_private_expedited(int flags, int cpu_id) { - int cpu; cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; + smp_call_func_t ipi_func = ipi_mb; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + ipi_func = ipi_sync_core; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + if (!(atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY)) + return -EPERM; + ipi_func = ipi_rseq; } else { + WARN_ON_ONCE(flags); if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; } - if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) + if (flags != MEMBARRIER_FLAG_SYNC_CORE && + (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)) return 0; /* @@ -156,35 +343,73 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; cpus_read_lock(); - rcu_read_lock(); - for_each_online_cpu(cpu) { + + if (cpu_id >= 0) { struct task_struct *p; + if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) + goto out; + rcu_read_lock(); + p = rcu_dereference(cpu_rq(cpu_id)->curr); + if (!p || p->mm != mm) { + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); + } else { + int cpu; + + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); + } + + if (cpu_id >= 0) { /* - * Skipping the current CPU is OK even through we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration. + * smp_call_function_single() will call ipi_func() if cpu_id + * is the calling CPU. */ - if (cpu == raw_smp_processor_id()) - continue; - p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == mm) - __cpumask_set_cpu(cpu, tmpmask); + smp_call_function_single(cpu_id, ipi_func, NULL, 1); + } else { + /* + * For regular membarrier, we can save a few cycles by + * skipping the current cpu -- we're about to do smp_mb() + * below, and if we migrate to a different cpu, this cpu + * and the new cpu will execute a full barrier in the + * scheduler. + * + * For SYNC_CORE, we do need a barrier on the current cpu -- + * otherwise, if we are migrated and replaced by a different + * task in the same mm just before, during, or after + * membarrier, we will end up with some thread in the mm + * running without a core sync. + * + * For RSEQ, don't rseq_preempt() the caller. User code + * is not supposed to issue syscalls at all from inside an + * rseq critical section. + */ + if (flags != MEMBARRIER_FLAG_SYNC_CORE) { + preempt_disable(); + smp_call_function_many(tmpmask, ipi_func, NULL, true); + preempt_enable(); + } else { + on_each_cpu_mask(tmpmask, ipi_func, NULL, true); + } } - rcu_read_unlock(); - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - - free_cpumask_var(tmpmask); +out: + if (cpu_id < 0) + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -229,7 +454,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) /* * For each cpu runqueue, if the task's mm match @mm, ensure that all - * @mm's membarrier state set bits are also set in in the runqueue's + * @mm's membarrier state set bits are also set in the runqueue's * membarrier state. This ensures that a runqueue scheduling * between threads which are users of @mm has its membarrier state * updated. @@ -246,9 +471,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) } rcu_read_unlock(); - preempt_disable(); - smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); - preempt_enable(); + on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true); free_cpumask_var(tmpmask); cpus_read_unlock(); @@ -283,11 +506,18 @@ static int membarrier_register_private_expedited(int flags) set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, ret; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY; + } else { + WARN_ON_ONCE(flags); } /* @@ -299,6 +529,8 @@ static int membarrier_register_private_expedited(int flags) return 0; if (flags & MEMBARRIER_FLAG_SYNC_CORE) set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + if (flags & MEMBARRIER_FLAG_RSEQ) + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ; atomic_or(set_state, &mm->membarrier_state); ret = sync_runqueues_membarrier_state(mm); if (ret) @@ -310,8 +542,15 @@ static int membarrier_register_private_expedited(int flags) /** * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0 for all commands other than + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter + * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id + * contains the CPU on which to interrupt (= restart) + * the RSEQ critical section. + * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which + * RSEQ CS should be interrupted (@cmd must be + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ). * * If this system call is not implemented, -ENOSYS is returned. If the * command specified does not exist, not available on the running @@ -337,10 +576,21 @@ static int membarrier_register_private_expedited(int flags) * smp_mb() X O O * sys_membarrier() O O O */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id) { - if (unlikely(flags)) - return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU)) + return -EINVAL; + break; + default: + if (unlikely(flags)) + return -EINVAL; + } + + if (!(flags & MEMBARRIER_CMD_FLAG_CPU)) + cpu_id = -1; + switch (cmd) { case MEMBARRIER_CMD_QUERY: { @@ -362,13 +612,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - return membarrier_private_expedited(0); + return membarrier_private_expedited(0, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: return membarrier_register_private_expedited(0); case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: - return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: + return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ); default: return -EINVAL; } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index bd006b79b360..0f310768260c 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -24,12 +24,6 @@ * Author: Vincent Guittot <vincent.guittot@linaro.org> */ -#include <linux/sched.h> -#include "sched.h" -#include "pelt.h" - -#include <trace/events/sched.h> - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) @@ -83,8 +77,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 @@ -121,8 +113,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa, */ if (periods) { sa->load_sum = decay_load(sa->load_sum, periods); - sa->runnable_load_sum = - decay_load(sa->runnable_load_sum, periods); + sa->runnable_sum = + decay_load(sa->runnable_sum, periods); sa->util_sum = decay_load((u64)(sa->util_sum), periods); /* @@ -137,7 +129,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa, * runnable = running = 0; * * clause from ___update_load_sum(); this results in - * the below usage of @contrib to dissapear entirely, + * the below usage of @contrib to disappear entirely, * so no point in calculating it. */ contrib = __accumulate_pelt_segments(periods, @@ -149,7 +141,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa, if (load) sa->load_sum += load * contrib; if (runnable) - sa->runnable_load_sum += runnable * contrib; + sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT; if (running) sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; @@ -237,16 +229,40 @@ ___update_load_sum(u64 now, struct sched_avg *sa, return 1; } +/* + * When syncing *_avg with *_sum, we must take into account the current + * position in the PELT segment otherwise the remaining part of the segment + * will be considered as idle time whereas it's not yet elapsed and this will + * generate unwanted oscillation in the range [1002..1024[. + * + * The max value of *_sum varies with the position in the time segment and is + * equals to : + * + * LOAD_AVG_MAX*y + sa->period_contrib + * + * which can be simplified into: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024 + * + * The same care must be taken when a sched entity is added, updated or + * removed from a cfs_rq and we need to update sched_avg. Scheduler entities + * and the cfs rq, to which they are attached, have the same position in the + * time segment because they use the same clock. This means that we can use + * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity + * if it's more convenient. + */ static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) +___update_load_avg(struct sched_avg *sa, unsigned long load) { - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(sa); /* * Step 2: update *_avg. */ sa->load_avg = div_u64(load * sa->load_sum, divider); - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); + sa->runnable_avg = div_u64(sa->runnable_sum, divider); WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } @@ -254,33 +270,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * sched_entity: * * task: - * se_runnable() == se_weight() + * se_weight() = se->load.weight + * se_runnable() = !!on_rq * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg - * - * load_sum := runnable_sum - * load_avg = se_weight(se) * runnable_avg + * se_runnable() = grq->h_nr_running * - * runnable_load_sum := runnable_sum - * runnable_load_avg = se_runnable(se) * runnable_avg + * runnable_sum = se_runnable() * runnable = grq->runnable_sum + * runnable_avg = runnable_sum * - * XXX collapse load_sum and runnable_load_sum + * load_sum := runnable + * load_avg = se_weight(se) * load_sum * * cfq_rq: * + * runnable_sum = \Sum se->avg.runnable_sum + * runnable_avg = \Sum se->avg.runnable_avg + * * load_sum = \Sum se_weight(se) * se->avg.load_sum * load_avg = \Sum se->avg.load_avg - * - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum - * runnable_load_avg = \Sum se->avg.runable_load_avg */ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { if (___update_load_sum(now, &se->avg, 0, 0, 0)) { - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + ___update_load_avg(&se->avg, se_weight(se)); trace_pelt_se_tp(se); return 1; } @@ -290,10 +305,10 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, + if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se), cfs_rq->curr == se)) { - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + ___update_load_avg(&se->avg, se_weight(se)); cfs_se_util_change(&se->avg); trace_pelt_se_tp(se); return 1; @@ -306,10 +321,10 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - scale_load_down(cfs_rq->runnable_weight), + cfs_rq->h_nr_running, cfs_rq->curr != NULL)) { - ___update_load_avg(&cfs_rq->avg, 1, 1); + ___update_load_avg(&cfs_rq->avg, 1); trace_pelt_cfs_tp(cfs_rq); return 1; } @@ -322,9 +337,9 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum * - * load_avg and runnable_load_avg are not supported and meaningless. + * load_avg and runnable_avg are not supported and meaningless. * */ @@ -335,7 +350,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) running, running)) { - ___update_load_avg(&rq->avg_rt, 1, 1); + ___update_load_avg(&rq->avg_rt, 1); trace_pelt_rt_tp(rq); return 1; } @@ -348,7 +363,9 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum + * + * load_avg and runnable_avg are not supported and meaningless. * */ @@ -359,7 +376,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) running, running)) { - ___update_load_avg(&rq->avg_dl, 1, 1); + ___update_load_avg(&rq->avg_dl, 1); trace_pelt_dl_tp(rq); return 1; } @@ -367,13 +384,46 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +/* + * thermal: + * + * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked + * + * util_avg and runnable_load_avg are not supported and meaningless. + * + * Unlike rt/dl utilization tracking that track time spent by a cpu + * running a rt/dl task through util_avg, the average thermal pressure is + * tracked through load_avg. This is because thermal pressure signal is + * time weighted "delta" capacity unlike util_avg which is binary. + * "delta capacity" = actual capacity - + * capped capacity a cpu due to a thermal event. + */ + +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + if (___update_load_sum(now, &rq->avg_thermal, + capacity, + capacity, + capacity)) { + ___update_load_avg(&rq->avg_thermal, 1); + trace_pelt_thermal_tp(rq); + return 1; + } + + return 0; +} +#endif + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* * irq: * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum + * + * load_avg and runnable_avg are not supported and meaningless. * */ @@ -410,7 +460,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) 1); if (ret) { - ___update_load_avg(&rq->avg_irq, 1, 1); + ___update_load_avg(&rq->avg_irq, 1); trace_pelt_irq_tp(rq); } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index afff644da065..3a0e0dc28721 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -7,6 +7,26 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return READ_ONCE(rq->avg_thermal.load_avg); +} +#else +static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return 0; +} +#endif + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); #else @@ -17,14 +37,12 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif -/* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. - * This flag is used to synchronize util_avg updates with util_est updates. - * We map this information into the LSB bit of the utilization saved at - * dequeue time (i.e. util_est.dequeued). - */ -#define UTIL_AVG_UNCHANGED 0x1 +#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024) + +static inline u32 get_pelt_divider(struct sched_avg *avg) +{ + return PELT_MIN_DIVIDER + avg->period_contrib; +} static inline void cfs_se_util_change(struct sched_avg *avg) { @@ -33,7 +51,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg) if (!sched_feat(UTIL_EST)) return; - /* Avoid store if the flag has been already set */ + /* Avoid store if the flag has been already reset */ enqueued = avg->util_est.enqueued; if (!(enqueued & UTIL_AVG_UNCHANGED)) return; @@ -43,6 +61,25 @@ static inline void cfs_se_util_change(struct sched_avg *avg) WRITE_ONCE(avg->util_est.enqueued, enqueued); } +static inline u64 rq_clock_pelt(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock_pelt - rq->lost_idle_time; +} + +/* The rq is idle, we can sync to clock_task */ +static inline void _update_idle_rq_clock_pelt(struct rq *rq) +{ + rq->clock_pelt = rq_clock_task(rq); + + u64_u32_store(rq->clock_idle, rq_clock(rq)); + /* Paired with smp_rmb in migrate_se_pelt_lag() */ + smp_wmb(); + u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq)); +} + /* * The clock_pelt scales the time to reflect the effective amount of * computation done during the running delta time but then sync back to @@ -58,8 +95,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg) static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) { if (unlikely(is_idle_task(rq->curr))) { - /* The rq is idle, we can sync to clock_task */ - rq->clock_pelt = rq_clock_task(rq); + _update_idle_rq_clock_pelt(rq); return; } @@ -105,33 +141,40 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq) * Reflecting stolen time makes sense only if the idle * phase would be present at max capacity. As soon as the * utilization of a rq has reached the maximum value, it is - * considered as an always runnig rq without idle time to + * considered as an always running rq without idle time to * steal. This potential idle time is considered as lost in * this case. We keep track of this lost idle time compare to * rq's clock_task. */ if (util_sum >= divider) rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; + + _update_idle_rq_clock_pelt(rq); } -static inline u64 rq_clock_pelt(struct rq *rq) +#ifdef CONFIG_CFS_BANDWIDTH +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { - lockdep_assert_held(&rq->lock); - assert_clock_updated(rq); + u64 throttled; - return rq->clock_pelt - rq->lost_idle_time; + if (unlikely(cfs_rq->throttle_count)) + throttled = U64_MAX; + else + throttled = cfs_rq->throttled_clock_pelt_time; + + u64_u32_store(cfs_rq->throttled_pelt_idle, throttled); } -#ifdef CONFIG_CFS_BANDWIDTH /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; - return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } #else +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { return rq_clock_pelt(rq_of(cfs_rq)); @@ -159,6 +202,17 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) } static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return 0; +} + +static inline int update_irq_load_avg(struct rq *rq, u64 running) { return 0; @@ -175,6 +229,7 @@ update_rq_clock_pelt(struct rq *rq, s64 delta) { } static inline void update_idle_rq_clock_pelt(struct rq *rq) { } +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } #endif diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 028520702717..ee2ecc081422 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Pressure stall information for CPU, memory and IO * @@ -34,10 +35,19 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * (Naturally, the FULL state doesn't exist for the CPU resource.) - * * SOME = nr_delayed_tasks != 0 - * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0 + * + * What it means for a task to be productive is defined differently + * for each resource. For IO, productive means a running task. For + * memory, productive means a running task that isn't a reclaimer. For + * CPU, productive means an oncpu task. + * + * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level. At the cgroup level, + * FULL means all non-idle tasks in the cgroup are delayed on the CPU + * resource which is being used by others outside of the cgroup or + * throttled by the cgroup cpu.max configuration. * * The percentage of wallclock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, @@ -59,7 +69,7 @@ * states, we would have to conclude a CPU SOME pressure number of * 100%, since *somebody* is waiting on a runqueue at all * times. However, that is clearly not the amount of contention the - * workload is experiencing: only one out of 256 possible exceution + * workload is experiencing: only one out of 256 possible execution * threads will be contended at any given time, or about 0.4%. * * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any @@ -73,18 +83,18 @@ * we have to base our calculation on the number of non-idle tasks in * conjunction with the number of available CPUs, which is the number * of potential execution threads. SOME becomes then the proportion of - * delayed tasks to possibe threads, and FULL is the share of possible + * delayed tasks to possible threads, and FULL is the share of possible * threads that are unproductive due to delays: * * threads = min(nr_nonidle_tasks, nr_cpus) * SOME = min(nr_delayed_tasks / threads, 1) - * FULL = (threads - min(nr_running_tasks, threads)) / threads + * FULL = (threads - min(nr_productive_tasks, threads)) / threads * * For the 257 number crunchers on 256 CPUs, this yields: * * threads = min(257, 256) * SOME = min(1 / 256, 1) = 0.4% - * FULL = (256 - min(257, 256)) / 256 = 0% + * FULL = (256 - min(256, 256)) / 256 = 0% * * For the 1 out of 4 memory-delayed tasks, this yields: * @@ -109,7 +119,7 @@ * For each runqueue, we track: * * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) - * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu]) * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) * * and then periodically aggregate: @@ -127,24 +137,10 @@ * sampling of the aggregate task states would be. */ -#include "../workqueue_internal.h" -#include <linux/sched/loadavg.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> -#include <linux/seqlock.h> -#include <linux/uaccess.h> -#include <linux/cgroup.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/ctype.h> -#include <linux/file.h> -#include <linux/poll.h> -#include <linux/psi.h> -#include "sched.h" - static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); +DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED static bool psi_enable; @@ -179,10 +175,13 @@ struct psi_group psi_system = { static void psi_avgs_work(struct work_struct *work); +static void poll_timer_fn(struct timer_list *t); + static void group_init(struct psi_group *group) { int cpu; + group->enabled = true; for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); group->avg_last_update = sched_clock(); @@ -190,42 +189,46 @@ static void group_init(struct psi_group *group) INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ - atomic_set(&group->poll_scheduled, 0); mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); - memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); - group->poll_states = 0; group->poll_min_period = U32_MAX; - memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; - group->polling_until = 0; - rcu_assign_pointer(group->poll_kworker, NULL); + init_waitqueue_head(&group->poll_wait); + timer_setup(&group->poll_timer, poll_timer_fn, 0); + rcu_assign_pointer(group->poll_task, NULL); } void __init psi_init(void) { if (!psi_enable) { static_branch_enable(&psi_disabled); + static_branch_disable(&psi_cgroups_enabled); return; } + if (!cgroup_psi_enabled()) + static_branch_disable(&psi_cgroups_enabled); + psi_period = jiffies_to_nsecs(PSI_FREQ); group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state) +static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu) { switch (state) { case PSI_IO_SOME: - return tasks[NR_IOWAIT]; + return unlikely(tasks[NR_IOWAIT]); case PSI_IO_FULL: - return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; + return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]); case PSI_MEM_SOME: - return tasks[NR_MEMSTALL]; + return unlikely(tasks[NR_MEMSTALL]); case PSI_MEM_FULL: - return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; + return unlikely(tasks[NR_MEMSTALL] && + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); case PSI_CPU_SOME: - return tasks[NR_RUNNING] > 1; + return unlikely(tasks[NR_RUNNING] > oncpu); + case PSI_CPU_FULL: + return unlikely(tasks[NR_RUNNING] && !oncpu); case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -442,7 +445,7 @@ static void psi_avgs_work(struct work_struct *work) mutex_unlock(&group->avgs_lock); } -/* Trigger tracking window manupulations */ +/* Trigger tracking window manipulations */ static void window_reset(struct psi_window *win, u64 now, u64 value, u64 prev_growth) { @@ -503,7 +506,7 @@ static void init_triggers(struct psi_group *group, u64 now) static u64 update_triggers(struct psi_group *group, u64 now) { struct psi_trigger *t; - bool new_stall = false; + bool update_total = false; u64 *total = group->total[PSI_POLL]; /* @@ -512,24 +515,35 @@ static u64 update_triggers(struct psi_group *group, u64 now) */ list_for_each_entry(t, &group->triggers, node) { u64 growth; + bool new_stall; - /* Check for stall activity */ - if (group->polling_total[t->state] == total[t->state]) - continue; + new_stall = group->polling_total[t->state] != total[t->state]; + /* Check for stall activity or a previous threshold breach */ + if (!new_stall && !t->pending_event) + continue; /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. + * Check for new stall activity, as well as deferred + * events that occurred in the last window after the + * trigger had already fired (we want to ratelimit + * events without dropping any). */ - new_stall = true; + if (new_stall) { + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + update_total = true; - /* Calculate growth since last update */ - growth = window_update(&t->win, now, total[t->state]); - if (growth < t->threshold) - continue; + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + t->pending_event = true; + } /* Limit event signaling to once per window */ if (now < t->last_event_time + t->win.size) continue; @@ -538,56 +552,49 @@ static u64 update_triggers(struct psi_group *group, u64 now) if (cmpxchg(&t->event, 0, 1) == 0) wake_up_interruptible(&t->event_wait); t->last_event_time = now; + /* Reset threshold breach flag once event got generated */ + t->pending_event = false; } - if (new_stall) + if (update_total) memcpy(group->polling_total, total, sizeof(group->polling_total)); return now + group->poll_min_period; } -/* - * Schedule polling if it's not already scheduled. It's safe to call even from - * hotpath because even though kthread_queue_delayed_work takes worker->lock - * spinlock that spinlock is never contended due to poll_scheduled atomic - * preventing such competition. - */ +/* Schedule polling if it's not already scheduled. */ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) { - struct kthread_worker *kworker; + struct task_struct *task; - /* Do not reschedule if already scheduled */ - if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + /* + * Do not reschedule if already scheduled. + * Possible race with a timer scheduled after this check but before + * mod_timer below can be tolerated because group->polling_next_update + * will keep updates on schedule. + */ + if (timer_pending(&group->poll_timer)) return; rcu_read_lock(); - kworker = rcu_dereference(group->poll_kworker); + task = rcu_dereference(group->poll_task); /* * kworker might be NULL in case psi_trigger_destroy races with * psi_task_change (hotpath) which can't use locks */ - if (likely(kworker)) - kthread_queue_delayed_work(kworker, &group->poll_work, delay); - else - atomic_set(&group->poll_scheduled, 0); + if (likely(task)) + mod_timer(&group->poll_timer, jiffies + delay); rcu_read_unlock(); } -static void psi_poll_work(struct kthread_work *work) +static void psi_poll_work(struct psi_group *group) { - struct kthread_delayed_work *dwork; - struct psi_group *group; u32 changed_states; u64 now; - dwork = container_of(work, struct kthread_delayed_work, work); - group = container_of(dwork, struct psi_group, poll_work); - - atomic_set(&group->poll_scheduled, 0); - mutex_lock(&group->trigger_lock); now = sched_clock(); @@ -623,13 +630,36 @@ out: mutex_unlock(&group->trigger_lock); } -static void record_times(struct psi_group_cpu *groupc, int cpu, - bool memstall_tick) +static int psi_poll_worker(void *data) +{ + struct psi_group *group = (struct psi_group *)data; + + sched_set_fifo_low(current); + + while (true) { + wait_event_interruptible(group->poll_wait, + atomic_cmpxchg(&group->poll_wakeup, 1, 0) || + kthread_should_stop()); + if (kthread_should_stop()) + break; + + psi_poll_work(group); + } + return 0; +} + +static void poll_timer_fn(struct timer_list *t) +{ + struct psi_group *group = from_timer(group, t, poll_timer); + + atomic_set(&group->poll_wakeup, 1); + wake_up_interruptible(&group->poll_wait); +} + +static void record_times(struct psi_group_cpu *groupc, u64 now) { u32 delta; - u64 now; - now = cpu_clock(cpu); delta = now - groupc->state_start; groupc->state_start = now; @@ -643,165 +673,276 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_MEM_SOME] += delta; if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; - else if (memstall_tick) { - u32 sample; - /* - * Since we care about lost potential, a - * memstall is FULL when there are no other - * working tasks, but also when the CPU is - * actively reclaiming and nothing productive - * could run even if it were runnable. - * - * When the timer tick sees a reclaiming CPU, - * regardless of runnable tasks, sample a FULL - * tick (or less if it hasn't been a full tick - * since the last state change). - */ - sample = min(delta, (u32)jiffies_to_nsecs(1)); - groupc->times[PSI_MEM_FULL] += sample; - } } - if (groupc->state_mask & (1 << PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) { groupc->times[PSI_CPU_SOME] += delta; + if (groupc->state_mask & (1 << PSI_CPU_FULL)) + groupc->times[PSI_CPU_FULL] += delta; + } if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } -static u32 psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set, u64 now, + bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; enum psi_states s; - u32 state_mask = 0; + u32 state_mask; groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we assess the aggregate resource states this CPU's - * tasks have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - * - * Then we update the task counts according to the state + * First we update the task counts according to the state * change requested through the @clear and @set bits. + * + * Then if the cgroup PSI stats accounting enabled, we + * assess the aggregate resource states this CPU's tasks + * have been in since the last change, and account any + * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu, false); + /* + * Start with TSK_ONCPU, which doesn't have a corresponding + * task count - it's just a boolean flag directly encoded in + * the state mask. Clear, set, or carry the current state if + * no changes are requested. + */ + if (unlikely(clear & TSK_ONCPU)) { + state_mask = 0; + clear &= ~TSK_ONCPU; + } else if (unlikely(set & TSK_ONCPU)) { + state_mask = PSI_ONCPU; + set &= ~TSK_ONCPU; + } else { + state_mask = groupc->state_mask & PSI_ONCPU; + } + /* + * The rest of the state mask is calculated based on the task + * counts. Update those first, then construct the mask. + */ for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; - if (groupc->tasks[t] == 0 && !psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + if (groupc->tasks[t]) { + groupc->tasks[t]--; + } else if (!psi_bug) { + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } - groupc->tasks[t]--; } for (t = 0; set; set &= ~(1 << t), t++) if (set & (1 << t)) groupc->tasks[t]++; - /* Calculate state mask representing active states */ + if (!group->enabled) { + /* + * On the first group change after disabling PSI, conclude + * the current state and flush its time. This is unlikely + * to matter to the user, but aggregation (get_recent_times) + * may have already incorporated the live state into times_prev; + * avoid a delta sample underflow when PSI is later re-enabled. + */ + if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) + record_times(groupc, now); + + groupc->state_mask = state_mask; + + write_seqcount_end(&groupc->seq); + return; + } + for (s = 0; s < NR_PSI_STATES; s++) { - if (test_state(groupc->tasks, s)) + if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU)) state_mask |= (1 << s); } + + /* + * Since we care about lost potential, a memstall is FULL + * when there are no other working tasks, but also when + * the CPU is actively reclaiming and nothing productive + * could run even if it were runnable. So when the current + * task in a cgroup is in_memstall, the corresponding groupc + * on that cpu is in PSI_MEM_FULL state. + */ + if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall)) + state_mask |= (1 << PSI_MEM_FULL); + + record_times(groupc, now); + groupc->state_mask = state_mask; write_seqcount_end(&groupc->seq); - return state_mask; + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +static inline struct psi_group *task_psi_group(struct task_struct *task) { #ifdef CONFIG_CGROUPS - struct cgroup *cgroup = NULL; - - if (!*iter) - cgroup = task->cgroups->dfl_cgrp; - else if (*iter == &psi_system) - return NULL; - else - cgroup = cgroup_parent(*iter); - - if (cgroup && cgroup_parent(cgroup)) { - *iter = cgroup; - return cgroup_psi(cgroup); - } -#else - if (*iter) - return NULL; + if (static_branch_likely(&psi_cgroups_enabled)) + return cgroup_psi(task_dfl_cgroup(task)); #endif - *iter = &psi_system; return &psi_system; } -void psi_task_change(struct task_struct *task, int clear, int set) +static void psi_flags_change(struct task_struct *task, int clear, int set) { - int cpu = task_cpu(task); - struct psi_group *group; - bool wake_clock = true; - void *iter = NULL; - - if (!task->pid) - return; - if (((task->psi_flags & set) || (task->psi_flags & clear) != clear) && !psi_bug) { printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", - task->pid, task->comm, cpu, + task->pid, task->comm, task_cpu(task), task->psi_flags, clear, set); psi_bug = 1; } task->psi_flags &= ~clear; task->psi_flags |= set; +} - /* - * Periodic aggregation shuts off if there is a period of no - * task changes, so we wake it back up if necessary. However, - * don't do this if the task change is the aggregation worker - * itself going to sleep, or we'll ping-pong forever. - */ - if (unlikely((clear & TSK_RUNNING) && - (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_avgs_work)) - wake_clock = false; +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + struct psi_group *group; + u64 now; - while ((group = iterate_groups(task, &iter))) { - u32 state_mask = psi_group_change(group, cpu, clear, set); + if (!task->pid) + return; - if (state_mask & group->poll_states) - psi_schedule_poll_work(group, 1); + psi_flags_change(task, clear, set); + + now = cpu_clock(cpu); + + group = task_psi_group(task); + do { + psi_group_change(group, cpu, clear, set, now, true); + } while ((group = group->parent)); +} + +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep) +{ + struct psi_group *group, *common = NULL; + int cpu = task_cpu(prev); + u64 now = cpu_clock(cpu); + + if (next->pid) { + psi_flags_change(next, 0, TSK_ONCPU); + /* + * Set TSK_ONCPU on @next's cgroups. If @next shares any + * ancestors with @prev, those will already have @prev's + * TSK_ONCPU bit set, and we can stop the iteration there. + */ + group = task_psi_group(next); + do { + if (per_cpu_ptr(group->pcpu, cpu)->state_mask & + PSI_ONCPU) { + common = group; + break; + } + + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + } while ((group = group->parent)); + } + + if (prev->pid) { + int clear = TSK_ONCPU, set = 0; + bool wake_clock = true; + + /* + * When we're going to sleep, psi_dequeue() lets us + * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and + * TSK_IOWAIT here, where we can combine it with + * TSK_ONCPU and save walking common ancestors twice. + */ + if (sleep) { + clear |= TSK_RUNNING; + if (prev->in_memstall) + clear |= TSK_MEMSTALL_RUNNING; + if (prev->in_iowait) + set |= TSK_IOWAIT; + + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((prev->flags & PF_WQ_WORKER) && + wq_worker_last_func(prev) == psi_avgs_work)) + wake_clock = false; + } - if (wake_clock && !delayed_work_pending(&group->avgs_work)) - schedule_delayed_work(&group->avgs_work, PSI_FREQ); + psi_flags_change(prev, clear, set); + + group = task_psi_group(prev); + do { + if (group == common) + break; + psi_group_change(group, cpu, clear, set, now, wake_clock); + } while ((group = group->parent)); + + /* + * TSK_ONCPU is handled up to the common ancestor. If there are + * any other differences between the two tasks (e.g. prev goes + * to sleep, or only one task is memstall), finish propagating + * those differences all the way up to the root. + */ + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { + clear &= ~TSK_ONCPU; + for (; group; group = group->parent) + psi_group_change(group, cpu, clear, set, now, wake_clock); + } } } -void psi_memstall_tick(struct task_struct *task, int cpu) +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct task_struct *task, u32 delta) { + int cpu = task_cpu(task); struct psi_group *group; - void *iter = NULL; + struct psi_group_cpu *groupc; + u64 now; - while ((group = iterate_groups(task, &iter))) { - struct psi_group_cpu *groupc; + if (!task->pid) + return; + + now = cpu_clock(cpu); + + group = task_psi_group(task); + do { + if (!group->enabled) + continue; groupc = per_cpu_ptr(group->pcpu, cpu); + write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu, true); + + record_times(groupc, now); + groupc->times[PSI_IRQ_FULL] += delta; + write_seqcount_end(&groupc->seq); - } + + if (group->poll_states & (1 << PSI_IRQ_FULL)) + psi_schedule_poll_work(group, 1); + } while ((group = group->parent)); } +#endif /** * psi_memstall_enter - mark the beginning of a memory stall section @@ -818,21 +959,22 @@ void psi_memstall_enter(unsigned long *flags) if (static_branch_likely(&psi_disabled)) return; - *flags = current->flags & PF_MEMSTALL; + *flags = current->in_memstall; if (*flags) return; /* - * PF_MEMSTALL setting & accounting needs to be atomic wrt + * in_memstall setting & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we can * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags |= PF_MEMSTALL; - psi_task_change(current, 0, TSK_MEMSTALL); + current->in_memstall = 1; + psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_enter); /** * psi_memstall_leave - mark the end of an memory stall section @@ -851,40 +993,49 @@ void psi_memstall_leave(unsigned long *flags) if (*flags) return; /* - * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * in_memstall clearing & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we could * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags &= ~PF_MEMSTALL; - psi_task_change(current, TSK_MEMSTALL, 0); + current->in_memstall = 0; + psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0); rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return 0; - cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); - if (!cgroup->psi.pcpu) + cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); + if (!cgroup->psi) + return -ENOMEM; + + cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi->pcpu) { + kfree(cgroup->psi); return -ENOMEM; - group_init(&cgroup->psi); + } + group_init(cgroup->psi); + cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup)); return 0; } void psi_cgroup_free(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return; - cancel_delayed_work_sync(&cgroup->psi.avgs_work); - free_percpu(cgroup->psi.pcpu); + cancel_delayed_work_sync(&cgroup->psi->avgs_work); + free_percpu(cgroup->psi->pcpu); /* All triggers must be removed by now */ - WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); + WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n"); + kfree(cgroup->psi); } /** @@ -901,11 +1052,11 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - unsigned int task_flags = 0; + unsigned int task_flags; struct rq_flags rf; struct rq *rq; - if (static_branch_likely(&psi_disabled)) { + if (!static_branch_likely(&psi_cgroups_enabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -916,13 +1067,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) rq = task_rq_lock(task, &rf); - if (task_on_rq_queued(task)) - task_flags = TSK_RUNNING; - else if (task->in_iowait) - task_flags = TSK_IOWAIT; - - if (task->flags & PF_MEMSTALL) - task_flags |= TSK_MEMSTALL; + /* + * We may race with schedule() dropping the rq lock between + * deactivating prev and switching to next. Because the psi + * updates from the deactivation are deferred to the switch + * callback to save cgroup tree updates, the task's scheduling + * state here is not coherent with its psi state: + * + * schedule() cgroup_move_task() + * rq_lock() + * deactivate_task() + * p->on_rq = 0 + * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates + * pick_next_task() + * rq_unlock() + * rq_lock() + * psi_task_change() // old cgroup + * task->cgroups = to + * psi_task_change() // new cgroup + * rq_unlock() + * rq_lock() + * psi_sched_switch() // does deferred updates in new cgroup + * + * Don't rely on the scheduling state. Use psi_flags instead. + */ + task_flags = task->psi_flags; if (task_flags) psi_task_change(task, task_flags, 0); @@ -935,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) task_rq_unlock(rq, task, &rf); } + +void psi_cgroup_restart(struct psi_group *group) +{ + int cpu; + + /* + * After we disable psi_group->enabled, we don't actually + * stop percpu tasks accounting in each psi_group_cpu, + * instead only stop test_state() loop, record_times() + * and averaging worker, see psi_group_change() for details. + * + * When disable cgroup PSI, this function has nothing to sync + * since cgroup pressure files are hidden and percpu psi_group_cpu + * would see !psi_group->enabled and only do task accounting. + * + * When re-enable cgroup PSI, this function use psi_group_change() + * to get correct state mask from test_state() loop on tasks[], + * and restart groupc->state_start from now, use .clear = .set = 0 + * here since no task status really changed. + */ + if (!group->enabled) + return; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + u64 now; + + rq_lock_irq(rq, &rf); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + rq_unlock_irq(rq, &rf); + } +} #endif /* CONFIG_CGROUPS */ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { + bool only_full = false; int full; u64 now; @@ -953,18 +1157,25 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); - for (full = 0; full < 2 - (res == PSI_CPU); full++) { - unsigned long avg[3]; - u64 total; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + only_full = res == PSI_IRQ; +#endif + + for (full = 0; full < 2 - only_full; full++) { + unsigned long avg[3] = { 0, }; + u64 total = 0; int w; - for (w = 0; w < 3; w++) - avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[PSI_AVGS][res * 2 + full], - NSEC_PER_USEC); + /* CPU FULL is undefined at the system level */ + if (!(group == &psi_system && res == PSI_CPU && full)) { + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); + } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", - full ? "full" : "some", + full || only_full ? "full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), @@ -974,38 +1185,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -static int psi_io_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_IO); -} - -static int psi_memory_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_MEM); -} - -static int psi_cpu_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_CPU); -} - -static int psi_io_open(struct inode *inode, struct file *file) -{ - return single_open(file, psi_io_show, NULL); -} - -static int psi_memory_open(struct inode *inode, struct file *file) -{ - return single_open(file, psi_memory_show, NULL); -} - -static int psi_cpu_open(struct inode *inode, struct file *file) -{ - return single_open(file, psi_cpu_show, NULL); -} - struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res) + char *buf, enum psi_res res) { struct psi_trigger *t; enum psi_states state; @@ -1022,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, else return ERR_PTR(-EINVAL); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + return ERR_PTR(-EINVAL); +#endif + if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); @@ -1041,31 +1227,28 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->state = state; t->threshold = threshold_us * NSEC_PER_USEC; t->win.size = window_us * NSEC_PER_USEC; - window_reset(&t->win, 0, 0, 0); + window_reset(&t->win, sched_clock(), + group->total[PSI_POLL][t->state], 0); t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); + t->pending_event = false; mutex_lock(&group->trigger_lock); - if (!rcu_access_pointer(group->poll_kworker)) { - struct sched_param param = { - .sched_priority = 1, - }; - struct kthread_worker *kworker; + if (!rcu_access_pointer(group->poll_task)) { + struct task_struct *task; - kworker = kthread_create_worker(0, "psimon"); - if (IS_ERR(kworker)) { + task = kthread_create(psi_poll_worker, group, "psimon"); + if (IS_ERR(task)) { kfree(t); mutex_unlock(&group->trigger_lock); - return ERR_CAST(kworker); + return ERR_CAST(task); } - sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); - kthread_init_delayed_work(&group->poll_work, - psi_poll_work); - rcu_assign_pointer(group->poll_kworker, kworker); + atomic_set(&group->poll_wakeup, 0); + wake_up_process(task); + rcu_assign_pointer(group->poll_task, task); } list_add(&t->node, &group->triggers); @@ -1079,15 +1262,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; - struct kthread_worker *kworker_to_destroy = NULL; + struct psi_group *group; + struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* * Wakeup waiters to stop polling. Can happen if cgroup is deleted * from under a polling process. @@ -1109,55 +1296,39 @@ static void psi_trigger_destroy(struct kref *ref) period = min(period, div_u64(tmp->win.size, UPDATES_PER_WINDOW)); group->poll_min_period = period; - /* Destroy poll_kworker when the last trigger is destroyed */ + /* Destroy poll_task when the last trigger is destroyed */ if (group->poll_states == 0) { group->polling_until = 0; - kworker_to_destroy = rcu_dereference_protected( - group->poll_kworker, + task_to_destroy = rcu_dereference_protected( + group->poll_task, lockdep_is_held(&group->trigger_lock)); - rcu_assign_pointer(group->poll_kworker, NULL); + rcu_assign_pointer(group->poll_task, NULL); + del_timer(&group->poll_timer); } } mutex_unlock(&group->trigger_lock); /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_kworker RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_kworker + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* - * Destroy the kworker after releasing trigger_lock to prevent a + * Stop kthread 'psimon' after releasing trigger_lock to prevent a * deadlock while waiting for psi_poll_work to acquire trigger_lock */ - if (kworker_to_destroy) { + if (task_to_destroy) { /* * After the RCU grace period has expired, the worker - * can no longer be found through group->poll_kworker. - * But it might have been already scheduled before - * that - deschedule it cleanly before destroying it. + * can no longer be found through group->poll_task. */ - kthread_cancel_delayed_work_sync(&group->poll_work); - atomic_set(&group->poll_scheduled, 0); - - kthread_destroy_worker(kworker_to_destroy); + kthread_stop(task_to_destroy); } kfree(t); } -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1167,27 +1338,57 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } +#ifdef CONFIG_PROC_FS +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) +{ + if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return single_open(file, psi_show, NULL); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_io_show); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_memory_show); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_cpu_show); +} + static ssize_t psi_write(struct file *file, const char __user *user_buf, size_t nbytes, enum psi_res res) { @@ -1208,14 +1409,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1250,7 +1461,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } @@ -1281,14 +1492,46 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, }; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int psi_irq_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IRQ); +} + +static int psi_irq_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_irq_show); +} + +static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IRQ); +} + +static const struct proc_ops psi_irq_proc_ops = { + .proc_open = psi_irq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_irq_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, +}; +#endif + static int __init psi_proc_init(void) { if (psi_enable) { proc_mkdir("pressure", NULL); - proc_create("pressure/io", 0, NULL, &psi_io_proc_ops); - proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops); - proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops); + proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); + proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); + proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif } return 0; } module_init(psi_proc_init); + +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4043abe45459..ed2a47e4ddae 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -3,17 +3,66 @@ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ -#include "sched.h" - -#include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; -int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +/* More than 4 hours if BW_SHIFT equals 20. */ +static const u64 max_rt_runtime = MAX_BW; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); struct rt_bandwidth def_rt_bandwidth; +/* + * period over which we measure -rt task CPU usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + +#ifdef CONFIG_SYSCTL +static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +static struct ctl_table sched_rt_sysctls[] = { + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sysctl_sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, + {} +}; + +static int __init sched_rt_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_rt_sysctls); + return 0; +} +late_initcall(sched_rt_sysctl_init); +#endif + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = @@ -50,11 +99,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.function = sched_rt_period_timer; } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) { - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; @@ -73,6 +119,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + do_start_rt_bandwidth(rt_b); +} + void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -87,8 +141,8 @@ void init_rt_rq(struct rt_rq *rt_rq) __set_bit(MAX_RT_PRIO, array->bitmap); #if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->highest_prio.next = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + rt_rq->highest_prio.next = MAX_RT_PRIO-1; rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); @@ -135,13 +189,17 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) return rt_rq->rq; } -void free_rt_sched_group(struct task_group *tg) +void unregister_rt_sched_group(struct task_group *tg) { - int i; - if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); +} + +void free_rt_sched_group(struct task_group *tg) +{ + int i; + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -159,7 +217,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->rt_nr_boosted = 0; rt_rq->rq = rq; rt_rq->tg = tg; @@ -248,6 +306,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return &rq->rt; } +void unregister_rt_sched_group(struct task_group *tg) { } + void free_rt_sched_group(struct task_group *tg) { } int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) @@ -258,12 +318,10 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP -static void pull_rt_task(struct rq *this_rq); - static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + return rq->online && rq->rt.highest_prio.curr > prev->prio; } static inline int rt_overloaded(struct rq *rq) @@ -352,8 +410,8 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } -static DEFINE_PER_CPU(struct callback_head, rt_push_head); -static DEFINE_PER_CPU(struct callback_head, rt_pull_head); +static DEFINE_PER_CPU(struct balance_callback, rt_push_head); +static DEFINE_PER_CPU(struct balance_callback, rt_pull_head); static void push_rt_tasks(struct rq *); static void pull_rt_task(struct rq *); @@ -391,8 +449,9 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) p = plist_first_entry(&rq->rt.pushable_tasks, struct task_struct, pushable_tasks); rq->rt.highest_prio.next = p->prio; - } else - rq->rt.highest_prio.next = MAX_RT_PRIO; + } else { + rq->rt.highest_prio.next = MAX_RT_PRIO-1; + } } #else @@ -415,22 +474,13 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { } -static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) -{ - return false; -} - -static inline void pull_rt_task(struct rq *this_rq) -{ -} - static inline void rt_queue_push_tasks(struct rq *rq) { } #endif /* CONFIG_SMP */ static void enqueue_top_rt_rq(struct rt_rq *rt_rq); -static void dequeue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { @@ -459,7 +509,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) unsigned int cpu_cap; /* Only heterogeneous systems can benefit from this check */ - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return true; min_cap = uclamp_eff_value(p, UCLAMP_MIN); @@ -551,7 +601,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; if (!rt_se) { - dequeue_top_rt_rq(rt_rq); + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); } @@ -637,7 +687,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { - dequeue_top_rt_rq(rt_rq); + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -697,7 +747,7 @@ static void do_balance_runtime(struct rt_rq *rt_rq) /* * Either all rqs have inf runtime and there's nothing to steal * or __disable_runtime() below sets a specific rq to inf to - * indicate its been disabled and disalow stealing. + * indicate its been disabled and disallow stealing. */ if (iter->rt_runtime == RUNTIME_INF) goto next; @@ -793,7 +843,7 @@ static void __disable_runtime(struct rq *rq) * We cannot be left wanting - that would mean some runtime * leaked out of the system. */ - BUG_ON(want); + WARN_ON_ONCE(want); balanced: /* * Disable all the borrow logic by pretending we have inf @@ -871,6 +921,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + struct rq_flags rf; int skip; /* @@ -885,7 +936,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (skip) continue; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); if (rt_rq->rt_time) { @@ -923,7 +974,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (enqueue) sched_rt_rq_enqueue(rt_rq); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) @@ -1006,33 +1057,35 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->stats.exec_max, + max(curr->stats.exec_max, delta_exec)); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); + trace_sched_stat_runtime(curr, delta_exec, 0); - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); if (!rt_bandwidth_enabled()) return; for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + int exceeded; if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) + exceeded = sched_rt_runtime_exceeded(rt_rq); + if (exceeded) resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (exceeded) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } } static void -dequeue_top_rt_rq(struct rt_rq *rt_rq) +dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count) { struct rq *rq = rq_of_rt_rq(rt_rq); @@ -1043,7 +1096,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) BUG_ON(!rq->nr_running); - sub_nr_running(rq, rt_rq->rt_nr_running); + sub_nr_running(rq, count); rt_rq->rt_queued = 0; } @@ -1145,8 +1198,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) sched_find_first_bit(array->bitmap); } - } else - rt_rq->highest_prio.curr = MAX_RT_PRIO; + } else { + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + } dec_rt_prio_smp(rt_rq, prio, prev_prio); } @@ -1267,6 +1321,112 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr rt_se->on_list = 0; } +static inline struct sched_statistics * +__schedstats_from_rt_se(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_RT_GROUP_SCHED + /* schedstats is not supported for rt group. */ + if (!rt_entity_is_task(rt_se)) + return NULL; +#endif + + return &rt_task_of(rt_se)->stats; +} + +static inline void +update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int flags) +{ + if (!schedstat_enabled()) + return; + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper_rt(rt_rq, rt_se); +} + +static inline void +update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int flags) +{ + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + if ((flags & DEQUEUE_SLEEP) && p) { + unsigned int state; + + state = READ_ONCE(p->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(p->stats.sleep_start, + rq_clock(rq_of_rt_rq(rt_rq))); + + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(p->stats.block_start, + rq_clock(rq_of_rt_rq(rt_rq))); + } +} + static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); @@ -1322,24 +1482,29 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; + unsigned int rt_nr_running; for_each_sched_rt_entity(rt_se) { rt_se->back = back; back = rt_se; } - dequeue_top_rt_rq(rt_rq_of_se(back)); + rt_nr_running = rt_rq_of_se(back)->rt_nr_running; for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) __dequeue_rt_entity(rt_se, flags); } + + dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running); } static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); + update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) __enqueue_rt_entity(rt_se, flags); @@ -1350,6 +1515,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); + update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { @@ -1372,6 +1539,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; + check_schedstat_required(); + update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se); + enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1426,14 +1596,14 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int flags) { struct task_struct *curr; struct rq *rq; bool test; /* For anything but wake ups, just return the task_cpu */ - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (!(flags & (WF_TTWU | WF_FORK))) goto out; rq = cpu_rq(cpu); @@ -1475,6 +1645,13 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) int target = find_lowest_rq(p); /* + * Bail out if we were forcing a migration to find a better + * fitting CPU but our search failed. + */ + if (!test && target != -1 && !rt_task_fits_capacity(p, target)) + goto out_unlock; + + /* * Don't bother moving it if the destination CPU is * not running a lower priority task. */ @@ -1482,6 +1659,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) p->prio < cpu_rq(target)->rt.highest_prio.curr) cpu = target; } + +out_unlock: rcu_read_unlock(); out: @@ -1495,7 +1674,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; /* @@ -1503,7 +1682,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * see if it is pushed or pulled somewhere else. */ if (p->nr_cpus_allowed != 1 && - cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) + cpupri_find(&rq->rd->cpupri, p, NULL)) return; /* @@ -1563,7 +1742,12 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) { + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; + p->se.exec_start = rq_clock_task(rq); + if (on_rt_rq(&p->rt)) + update_stats_wait_end_rt(rt_rq, rt_se); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); @@ -1582,8 +1766,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f rt_queue_push_tasks(rq); } -static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, - struct rt_rq *rt_rq) +static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) { struct rt_prio_array *array = &rt_rq->active; struct sched_rt_entity *next = NULL; @@ -1605,7 +1788,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; do { - rt_se = pick_next_rt_entity(rq, rt_rq); + rt_se = pick_next_rt_entity(rt_rq); BUG_ON(!rt_se); rt_rq = group_rt_rq(rt_se); } while (rt_rq); @@ -1613,7 +1796,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq) { struct task_struct *p; @@ -1621,12 +1804,28 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) return NULL; p = _pick_next_task_rt(rq); - set_next_task_rt(rq, p, true); + + return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct task_struct *p = pick_task_rt(rq); + + if (p) + set_next_task_rt(rq, p, true); + return p; } static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; + + if (on_rt_rq(&p->rt)) + update_stats_wait_start_rt(rt_rq, rt_se); + update_curr_rt(rq); update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); @@ -1646,9 +1845,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { - if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr) && - rt_task_fits_capacity(p, cpu)) + if (!task_on_cpu(rq, p) && + cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; @@ -1682,6 +1880,7 @@ static int find_lowest_rq(struct task_struct *task) struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + int ret; /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) @@ -1690,8 +1889,22 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, - rt_task_fits_capacity)) + /* + * If we're on asym system ensure we consider the different capacities + * of the CPUs when searching for the lowest_mask. + */ + if (sched_asym_cpucap_active()) { + + ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, + task, lowest_mask, + rt_task_fits_capacity); + } else { + + ret = cpupri_find(&task_rq(task)->rd->cpupri, + task, lowest_mask); + } + + if (!ret) return -1; /* No targets found */ /* @@ -1727,8 +1940,8 @@ static int find_lowest_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(lowest_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(lowest_mask, + sched_domain_span(sd)); if (best_cpu < nr_cpu_ids) { rcu_read_unlock(); return best_cpu; @@ -1745,7 +1958,7 @@ static int find_lowest_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(lowest_mask); + cpu = cpumask_any_distribute(lowest_mask); if (cpu < nr_cpu_ids) return cpu; @@ -1786,8 +1999,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. */ if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || - task_running(rq, task) || + !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || + task_on_cpu(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { @@ -1834,7 +2047,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) * running task can migrate over to a CPU that is running a task * of lesser priority. */ -static int push_rt_task(struct rq *rq) +static int push_rt_task(struct rq *rq, bool pull) { struct task_struct *next_task; struct rq *lowest_rq; @@ -1848,9 +2061,6 @@ static int push_rt_task(struct rq *rq) return 0; retry: - if (WARN_ON(next_task == rq->curr)) - return 0; - /* * It's possible that the next_task slipped in of * higher priority than current. If that's the case @@ -1861,6 +2071,49 @@ retry: return 0; } + if (is_migration_disabled(next_task)) { + struct task_struct *push_task = NULL; + int cpu; + + if (!pull || rq->push_busy) + return 0; + + /* + * Invoking find_lowest_rq() on anything but an RT task doesn't + * make sense. Per the above priority check, curr has to + * be of higher priority than next_task, so no need to + * reschedule when bailing out. + * + * Note that the stoppers are masqueraded as SCHED_FIFO + * (cf. sched_set_stop_task()), so we can't rely on rt_task(). + */ + if (rq->curr->sched_class != &rt_sched_class) + return 0; + + cpu = find_lowest_rq(rq->curr); + if (cpu == -1 || cpu == rq->cpu) + return 0; + + /* + * Given we found a CPU with lower priority than @next_task, + * therefore it should be running. However we cannot migrate it + * to this other CPU, instead attempt to push the current + * running task on this CPU away. + */ + push_task = get_push_task(rq); + if (push_task) { + raw_spin_rq_unlock(rq); + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + push_task, &rq->push_work); + raw_spin_rq_lock(rq); + } + + return 0; + } + + if (WARN_ON(next_task == rq->curr)) + return 0; + /* We might release rq lock */ get_task_struct(next_task); @@ -1902,12 +2155,10 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); activate_task(lowest_rq, next_task, 0); - ret = 1; - resched_curr(lowest_rq); + ret = 1; double_unlock_balance(rq, lowest_rq); - out: put_task_struct(next_task); @@ -1917,7 +2168,7 @@ out: static void push_rt_tasks(struct rq *rq) { /* push_rt_task will return true if it moved an RT */ - while (push_rt_task(rq)) + while (push_rt_task(rq, false)) ; } @@ -1945,7 +2196,7 @@ static void push_rt_tasks(struct rq *rq) * * Each root domain has its own irq work function that can iterate over * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT - * tassk must be checked if there's one or many CPUs that are lowering + * task must be checked if there's one or many CPUs that are lowering * their priority, there's a single irq work iterator that will try to * push off RT tasks that are waiting to run. * @@ -2069,9 +2320,10 @@ void rto_push_irq_work_func(struct irq_work *work) * When it gets updated, a check is made if a push is possible. */ if (has_pushable_tasks(rq)) { - raw_spin_lock(&rq->lock); - push_rt_tasks(rq); - raw_spin_unlock(&rq->lock); + raw_spin_rq_lock(rq); + while (push_rt_task(rq, true)) + ; + raw_spin_rq_unlock(rq); } raw_spin_lock(&rd->rto_lock); @@ -2095,7 +2347,7 @@ static void pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; bool resched = false; - struct task_struct *p; + struct task_struct *p, *push_task; struct rq *src_rq; int rt_overload_count = rt_overloaded(this_rq); @@ -2142,6 +2394,7 @@ static void pull_rt_task(struct rq *this_rq) * double_lock_balance, and another CPU could * alter this_rq */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2161,7 +2414,7 @@ static void pull_rt_task(struct rq *this_rq) /* * There's a chance that p is higher in priority * than what's currently running on its CPU. - * This is just that p is wakeing up and hasn't + * This is just that p is waking up and hasn't * had a chance to schedule. We only pull * p if it is lower in priority than the * current task on the run queue @@ -2169,11 +2422,14 @@ static void pull_rt_task(struct rq *this_rq) if (p->prio < src_rq->curr->prio) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + resched = true; + } /* * We continue with the search, just in * case there's an even higher prio task @@ -2183,6 +2439,13 @@ static void pull_rt_task(struct rq *this_rq) } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + raw_spin_rq_unlock(this_rq); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + raw_spin_rq_lock(this_rq); + } } if (resched) @@ -2195,14 +2458,14 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - bool need_to_push = !task_running(rq, p) && + bool need_to_push = !task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio); - if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) + if (need_to_push) push_rt_tasks(rq); } @@ -2266,18 +2529,22 @@ void __init init_sched_rt_class(void) static void switched_to_rt(struct rq *rq, struct task_struct *p) { /* - * If we are already running, then there's nothing - * that needs to be done. But if we are not running - * we may need to preempt the current running task. - * If that current running task is also an RT task + * If we are running, update the avg_rt tracking, as the running time + * will now on be accounted into the latter. + */ + if (task_current(rq, p)) { + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + return; + } + + /* + * If we are not running we may need to preempt the current + * running task. If that current running task is also an RT task * then see if we can move to another run queue. */ - if (task_on_rq_queued(p) && rq->curr != p) { + if (task_on_rq_queued(p)) { #ifdef CONFIG_SMP - bool need_to_push = rq->rt.overloaded || - !rt_task_fits_capacity(p, cpu_of(rq)); - - if (p->nr_cpus_allowed > 1 && need_to_push) + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) @@ -2295,7 +2562,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->curr == p) { + if (task_current(rq, p)) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we @@ -2407,8 +2674,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -const struct sched_class rt_sched_class = { - .next = &fair_sched_class, +DEFINE_SCHED_CLASS(rt) = { + .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, @@ -2421,12 +2688,14 @@ const struct sched_class rt_sched_class = { #ifdef CONFIG_SMP .balance = balance_rt, + .pick_task = pick_task_rt, .select_task_rq = select_task_rq_rt, .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, + .find_lock_rq = find_lock_lowest_rq, #endif .task_tick = task_tick_rt, @@ -2449,10 +2718,11 @@ const struct sched_class rt_sched_class = { */ static DEFINE_MUTEX(rt_constraints_mutex); -/* Must be called with tasklist_lock held */ static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_struct *g, *p; + struct task_struct *task; + struct css_task_iter it; + int ret = 0; /* * Autogroups do not have RT tasks; see autogroup_create(). @@ -2460,12 +2730,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg) if (task_group_is_autogroup(tg)) return 0; - for_each_process_thread(g, p) { - if (rt_task(p) && task_group(p) == tg) - return 1; - } + css_task_iter_start(&tg->css, 0, &it); + while (!ret && (task = css_task_iter_next(&it))) + ret |= rt_task(task); + css_task_iter_end(&it); - return 0; + return ret; } struct rt_schedulable_data { @@ -2496,9 +2766,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) return -EINVAL; /* - * Ensure we don't starve existing RT tasks. + * Ensure we don't starve existing RT tasks if runtime turns zero. */ - if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + if (rt_bandwidth_enabled() && !runtime && + tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) return -EBUSY; total = to_ratio(period, runtime); @@ -2563,8 +2834,13 @@ static int tg_set_rt_bandwidth(struct task_group *tg, if (rt_period == 0) return -EINVAL; + /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); err = __rt_schedulable(tg, rt_period, rt_runtime); if (err) goto unlock; @@ -2582,7 +2858,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg, } raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: - read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return err; @@ -2636,18 +2911,18 @@ long sched_group_rt_period(struct task_group *tg) return rt_period_us; } +#ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { int ret = 0; mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); ret = __rt_schedulable(NULL, 0, 0); - read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; } +#endif /* CONFIG_SYSCTL */ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { @@ -2659,6 +2934,8 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) } #else /* !CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { unsigned long flags; @@ -2676,15 +2953,19 @@ static int sched_rt_global_constraints(void) return 0; } +#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) { if (sysctl_sched_rt_period <= 0) return -EINVAL; if ((sysctl_sched_rt_runtime != RUNTIME_INF) && - (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) + ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || + ((u64)sysctl_sched_rt_runtime * + NSEC_PER_USEC > max_rt_runtime))) return -EINVAL; return 0; @@ -2692,13 +2973,16 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { + unsigned long flags; + + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); def_rt_bandwidth.rt_runtime = global_rt_runtime(); def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_period, old_runtime; static DEFINE_MUTEX(mutex); @@ -2736,9 +3020,8 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; static DEFINE_MUTEX(mutex); @@ -2758,6 +3041,7 @@ int sched_rr_handler(struct ctl_table *table, int write, return ret; } +#endif /* CONFIG_SYSCTL */ #ifdef CONFIG_SCHED_DEBUG void print_rt_stats(struct seq_file *m, int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9ea647835fd6..a4a20046e586 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,83 +2,99 @@ /* * Scheduler internal types and methods: */ -#include <linux/sched.h> +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H +#include <linux/sched/affinity.h> #include <linux/sched/autogroup.h> -#include <linux/sched/clock.h> -#include <linux/sched/coredump.h> #include <linux/sched/cpufreq.h> -#include <linux/sched/cputime.h> #include <linux/sched/deadline.h> -#include <linux/sched/debug.h> -#include <linux/sched/hotplug.h> -#include <linux/sched/idle.h> -#include <linux/sched/init.h> -#include <linux/sched/isolation.h> -#include <linux/sched/jobctl.h> +#include <linux/sched.h> #include <linux/sched/loadavg.h> #include <linux/sched/mm.h> -#include <linux/sched/nohz.h> -#include <linux/sched/numa_balancing.h> -#include <linux/sched/prio.h> -#include <linux/sched/rt.h> +#include <linux/sched/rseq_api.h> #include <linux/sched/signal.h> #include <linux/sched/smt.h> #include <linux/sched/stat.h> #include <linux/sched/sysctl.h> +#include <linux/sched/task_flags.h> #include <linux/sched/task.h> -#include <linux/sched/task_stack.h> #include <linux/sched/topology.h> -#include <linux/sched/user.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/xacct.h> - -#include <uapi/linux/sched/types.h> -#include <linux/binfmts.h> -#include <linux/blkdev.h> -#include <linux/compat.h> +#include <linux/atomic.h> +#include <linux/bitmap.h> +#include <linux/bug.h> +#include <linux/capability.h> +#include <linux/cgroup_api.h> +#include <linux/cgroup.h> #include <linux/context_tracking.h> #include <linux/cpufreq.h> -#include <linux/cpuidle.h> -#include <linux/cpuset.h> +#include <linux/cpumask_api.h> #include <linux/ctype.h> -#include <linux/debugfs.h> -#include <linux/delayacct.h> -#include <linux/energy_model.h> -#include <linux/init_task.h> -#include <linux/kprobes.h> +#include <linux/file.h> +#include <linux/fs_api.h> +#include <linux/hrtimer_api.h> +#include <linux/interrupt.h> +#include <linux/irq_work.h> +#include <linux/jiffies.h> +#include <linux/kref_api.h> #include <linux/kthread.h> -#include <linux/membarrier.h> -#include <linux/migrate.h> -#include <linux/mmu_context.h> -#include <linux/nmi.h> +#include <linux/ktime_api.h> +#include <linux/lockdep_api.h> +#include <linux/lockdep.h> +#include <linux/minmax.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/mutex_api.h> +#include <linux/plist.h> +#include <linux/poll.h> #include <linux/proc_fs.h> -#include <linux/prefetch.h> #include <linux/profile.h> #include <linux/psi.h> -#include <linux/rcupdate_wait.h> -#include <linux/security.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/seqlock.h> +#include <linux/softirq.h> +#include <linux/spinlock_api.h> +#include <linux/static_key.h> #include <linux/stop_machine.h> -#include <linux/suspend.h> -#include <linux/swait.h> +#include <linux/syscalls_api.h> #include <linux/syscalls.h> -#include <linux/task_work.h> -#include <linux/tsacct_kern.h> +#include <linux/tick.h> +#include <linux/topology.h> +#include <linux/types.h> +#include <linux/u64_stats_sync_api.h> +#include <linux/uaccess.h> +#include <linux/wait_api.h> +#include <linux/wait_bit.h> +#include <linux/workqueue_api.h> + +#include <trace/events/power.h> +#include <trace/events/sched.h> + +#include "../workqueue_internal.h" + +#ifdef CONFIG_CGROUP_SCHED +#include <linux/cgroup.h> +#include <linux/psi.h> +#endif -#include <asm/tlb.h> +#ifdef CONFIG_SCHED_DEBUG +# include <linux/static_key.h> +#endif #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> +# include <asm/paravirt_api_clock.h> #endif #include "cpupri.h" #include "cpudeadline.h" #ifdef CONFIG_SCHED_DEBUG -# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else -# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif struct rq; @@ -93,9 +109,17 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; +extern unsigned int sysctl_sched_child_runs_first; + extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); +extern void call_trace_sched_update_nr_running(struct rq *rq, int count); + +extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_runtime; +extern int sched_rr_timeslice; + /* * Helpers for converting nanosecond timing to jiffy resolution */ @@ -118,7 +142,13 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust); #ifdef CONFIG_64BIT # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) -# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ +}) #else # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) (w) @@ -131,7 +161,7 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust); * scale_load() and scale_load_down(w) to convert between them. The * following must be true: * - * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD * */ #define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) @@ -189,6 +219,19 @@ static inline int task_has_dl_policy(struct task_struct *p) #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +static inline void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff / 8; +} + +/* + * Shifting a value by an exponent greater *or equal* to the size of said value + * is UB; cap at size-1. + */ +#define shr_bound(val, shift) \ + (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) + /* * !! For sched_setattr_nocheck() (kernel) only !! * @@ -203,6 +246,8 @@ static inline int task_has_dl_policy(struct task_struct *p) */ #define SCHED_FLAG_SUGOV 0x10000000 +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) + static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) { #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL @@ -241,30 +286,6 @@ struct rt_bandwidth { void __dl_clear_params(struct task_struct *p); -/* - * To keep the bandwidth of -deadline tasks and groups under control - * we need some place where: - * - store the maximum -deadline bandwidth of the system (the group); - * - cache the fraction of that bandwidth that is currently allocated. - * - * This is all done in the data structure below. It is similar to the - * one used for RT-throttling (rt_bandwidth), with the main difference - * that, since here we are only interested in admission control, we - * do not decrease any runtime while the group "executes", neither we - * need a timer to replenish it. - * - * With respect to SMP, the bandwidth is given on a per-CPU basis, - * meaning that: - * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; - * - dl_total_bw array contains, in the i-eth element, the currently - * allocated bandwidth on the i-eth CPU. - * Moreover, groups consume bandwidth on each CPU, while tasks only - * consume bandwidth on the CPU they're running on. - * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw - * that will be shown the next time the proc or cgroup controls will - * be red. It on its turn can be changed by writing on its own - * control. - */ struct dl_bandwidth { raw_spinlock_t dl_runtime_lock; u64 dl_runtime; @@ -276,36 +297,30 @@ static inline int dl_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } +/* + * To keep the bandwidth of -deadline tasks under control + * we need some place where: + * - store the maximum -deadline bandwidth of each cpu; + * - cache the fraction of bandwidth that is currently allocated in + * each root domain; + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, bandwidth is given on a per root domain basis, + * meaning that: + * - bw (< 100%) is the deadline bandwidth of each CPU; + * - total_bw is the currently allocated bandwidth in each root domain; + */ struct dl_bw { raw_spinlock_t lock; u64 bw; u64 total_bw; }; -static inline void __dl_update(struct dl_bw *dl_b, s64 bw); - -static inline -void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) -{ - dl_b->total_bw -= tsk_bw; - __dl_update(dl_b, (s32)tsk_bw / cpus); -} - -static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) -{ - dl_b->total_bw += tsk_bw; - __dl_update(dl_b, -((s32)tsk_bw / cpus)); -} - -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) -{ - return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; -} - -extern void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); @@ -314,15 +329,11 @@ extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); -extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern bool dl_cpu_busy(unsigned int cpu); +extern int dl_cpu_busy(int cpu, struct task_struct *p); #ifdef CONFIG_CGROUP_SCHED -#include <linux/cgroup.h> -#include <linux/psi.h> - struct cfs_rq; struct rt_rq; @@ -334,11 +345,12 @@ struct cfs_bandwidth { ktime_t period; u64 quota; u64 runtime; + u64 burst; + u64 runtime_snap; s64 hierarchical_quota; u8 idle; u8 period_active; - u8 distribute_running; u8 slack_started; struct hrtimer period_timer; struct hrtimer slack_timer; @@ -347,7 +359,9 @@ struct cfs_bandwidth { /* Statistics: */ int nr_periods; int nr_throttled; + int nr_burst; u64 throttled_time; + u64 burst_time; #endif }; @@ -362,6 +376,9 @@ struct task_group { struct cfs_rq **cfs_rq; unsigned long shares; + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; + #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put @@ -449,8 +466,6 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); -extern void free_rt_sched_group(struct task_group *tg); -extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); @@ -464,13 +479,15 @@ extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_online_group(struct task_group *tg, struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); -extern void sched_offline_group(struct task_group *tg); +extern void sched_release_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern int sched_group_set_idle(struct task_group *tg, long idle); + #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); @@ -486,16 +503,64 @@ struct cfs_bandwidth { }; #endif /* CONFIG_CGROUP_SCHED */ +extern void unregister_rt_sched_group(struct task_group *tg); +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); + +/* + * u64_u32_load/u64_u32_store + * + * Use a copy of a u64 value to protect against data race. This is only + * applicable for 32-bits architectures. + */ +#ifdef CONFIG_64BIT +# define u64_u32_load_copy(var, copy) var +# define u64_u32_store_copy(var, copy, val) (var = val) +#else +# define u64_u32_load_copy(var, copy) \ +({ \ + u64 __val, __val_copy; \ + do { \ + __val_copy = copy; \ + /* \ + * paired with u64_u32_store_copy(), ordering access \ + * to var and copy. \ + */ \ + smp_rmb(); \ + __val = var; \ + } while (__val != __val_copy); \ + __val; \ +}) +# define u64_u32_store_copy(var, copy, val) \ +do { \ + typeof(val) __val = (val); \ + var = __val; \ + /* \ + * paired with u64_u32_load_copy(), ordering access to var and \ + * copy. \ + */ \ + smp_wmb(); \ + copy = __val; \ +} while (0) +#endif +# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) +# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned long runnable_weight; unsigned int nr_running; unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ u64 exec_clock; u64 min_vruntime; +#ifdef CONFIG_SCHED_CORE + unsigned int forceidle_seq; + u64 min_vruntime_fi; +#endif + #ifndef CONFIG_64BIT u64 min_vruntime_copy; #endif @@ -521,14 +586,14 @@ struct cfs_rq { */ struct sched_avg avg; #ifndef CONFIG_64BIT - u64 load_last_update_time_copy; + u64 last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; int nr; unsigned long load_avg; unsigned long util_avg; - unsigned long runnable_sum; + unsigned long runnable_avg; } removed; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -563,13 +628,20 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + /* Locally cached copy of our task_group's idle value */ + int idle; + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; + u64 throttled_pelt_idle; +#ifndef CONFIG_64BIT + u64 throttled_pelt_idle_copy; +#endif u64 throttled_clock; - u64 throttled_clock_task; - u64 throttled_clock_task_time; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; int throttled; int throttle_count; struct list_head throttled_list; @@ -601,8 +673,8 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - unsigned long rt_nr_total; + unsigned int rt_nr_migratory; + unsigned int rt_nr_total; int overloaded; struct plist_head pushable_tasks; @@ -616,7 +688,7 @@ struct rt_rq { raw_spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; + unsigned int rt_nr_boosted; struct rq *rq; struct task_group *tg; @@ -633,7 +705,7 @@ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ struct rb_root_cached root; - unsigned long dl_nr_running; + unsigned int dl_nr_running; #ifdef CONFIG_SMP /* @@ -647,7 +719,7 @@ struct dl_rq { u64 next; } earliest_dl; - unsigned long dl_nr_migratory; + unsigned int dl_nr_migratory; int overloaded; /* @@ -688,8 +760,30 @@ struct dl_rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) + +static inline void se_update_runnable(struct sched_entity *se) +{ + if (!entity_is_task(se)) + se->runnable_weight = se->my_q->h_nr_running; +} + +static inline long se_runnable(struct sched_entity *se) +{ + if (entity_is_task(se)) + return !!se->on_rq; + else + return se->runnable_weight; +} + #else #define entity_is_task(se) 1 + +static inline void se_update_runnable(struct sched_entity *se) {} + +static inline long se_runnable(struct sched_entity *se) +{ + return !!se->on_rq; +} #endif #ifdef CONFIG_SMP @@ -701,10 +795,6 @@ static inline long se_weight(struct sched_entity *se) return scale_load_down(se->load.weight); } -static inline long se_runnable(struct sched_entity *se) -{ - return scale_load_down(se->runnable_weight); -} static inline bool sched_asym_prefer(int a, int b) { @@ -755,6 +845,15 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; + /* + * Indicate whether a root_domain's dl_bw has been checked or + * updated. It's monotonously increasing value. + * + * Also, some corner cases, like 'wrap around' is dangerous, but given + * that u64 is 'big enough'. So that shouldn't be a concern. + */ + u64 visit_gen; + #ifdef HAVE_RT_PUSH_IPI /* * For IPI pull requests, loop across the rto_mask. @@ -835,8 +934,16 @@ struct uclamp_rq { unsigned int value; struct uclamp_bucket bucket[UCLAMP_BUCKETS]; }; + +DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ +struct rq; +struct balance_callback { + struct balance_callback *next; + void (*func)(struct rq *rq); +}; + /* * This is the main, per-CPU runqueue data structure. * @@ -846,7 +953,7 @@ struct uclamp_rq { */ struct rq { /* runqueue lock: */ - raw_spinlock_t lock; + raw_spinlock_t __lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -860,15 +967,17 @@ struct rq { #endif #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_SMP - unsigned long last_load_update_tick; unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; + call_single_data_t nohz_csd; #endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; - atomic_t nohz_flags; + atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ - unsigned long nr_load_updates; +#ifdef CONFIG_SMP + unsigned int ttwu_pending; +#endif u64 nr_switches; #ifdef CONFIG_UCLAMP_TASK @@ -894,7 +1003,7 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned long nr_uninterruptible; + unsigned int nr_uninterruptible; struct task_struct __rcu *curr; struct task_struct *idle; @@ -908,9 +1017,20 @@ struct rq { u64 clock_task ____cacheline_aligned; u64 clock_pelt; unsigned long lost_idle_time; + u64 clock_pelt_idle; + u64 clock_idle; +#ifndef CONFIG_64BIT + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; +#endif atomic_t nr_iowait; +#ifdef CONFIG_SCHED_DEBUG + u64 last_seen_need_resched_ns; + int ticks_without_resched; +#endif + #ifdef CONFIG_MEMBARRIER int membarrier_state; #endif @@ -922,8 +1042,9 @@ struct rq { unsigned long cpu_capacity; unsigned long cpu_capacity_orig; - struct callback_head *balance_callback; + struct balance_callback *balance_callback; + unsigned char nohz_idle_balance; unsigned char idle_balance; unsigned long misfit_task_load; @@ -944,12 +1065,22 @@ struct rq { #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif +#ifdef CONFIG_SCHED_THERMAL_PRESSURE + struct sched_avg avg_thermal; +#endif u64 idle_stamp; u64 avg_idle; + unsigned long wake_stamp; + u64 wake_avg_idle; + /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; + +#ifdef CONFIG_HOTPLUG_CPU + struct rcuwait hotplug_wait; #endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -967,10 +1098,10 @@ struct rq { #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP - int hrtick_csd_pending; call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; + ktime_t hrtick_time; #endif #ifdef CONFIG_SCHEDSTATS @@ -991,14 +1122,34 @@ struct rq { unsigned int ttwu_local; #endif -#ifdef CONFIG_SMP - struct llist_head wake_list; -#endif - #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; #endif + +#ifdef CONFIG_SMP + unsigned int nr_pinned; +#endif + unsigned int push_busy; + struct cpu_stop_work push_work; + +#ifdef CONFIG_SCHED_CORE + /* per rq */ + struct rq *core; + struct task_struct *core_pick; + unsigned int core_enabled; + unsigned int core_sched_seq; + struct rb_root core_tree; + + /* shared state -- careful with sched_core_cpu_deactivate() */ + unsigned int core_task_seq; + unsigned int core_pick_seq; + unsigned long core_cookie; + unsigned int core_forceidle_count; + unsigned int core_forceidle_seq; + unsigned int core_forceidle_occupation; + u64 core_forceidle_start; +#endif }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1026,6 +1177,213 @@ static inline int cpu_of(struct rq *rq) #endif } +#define MDF_PUSH 0x01 + +static inline bool is_migration_disabled(struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->migration_disabled; +#else + return false; +#endif +} + +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() this_cpu_ptr(&runqueues) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() raw_cpu_ptr(&runqueues) + +struct sched_group; +#ifdef CONFIG_SCHED_CORE +static inline struct cpumask *sched_group_span(struct sched_group *sg); + +DECLARE_STATIC_KEY_FALSE(__sched_core_enabled); + +static inline bool sched_core_enabled(struct rq *rq) +{ + return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled; +} + +static inline bool sched_core_disabled(void) +{ + return !static_branch_unlikely(&__sched_core_enabled); +} + +/* + * Be careful with this function; not for general use. The return value isn't + * stable unless you actually hold a relevant rq->__lock. + */ +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + if (sched_core_enabled(rq)) + return &rq->core->__lock; + + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + if (rq->core_enabled) + return &rq->core->__lock; + + return &rq->__lock; +} + +bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi); + +/* + * Helpers to check if the CPU's core cookie matches with the task's cookie + * when core scheduling is enabled. + * A special case is that the task's cookie always matches with CPU's core + * cookie if the CPU is in an idle core. + */ +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + return rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + bool idle_core = true; + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { + if (!available_idle_cpu(cpu)) { + idle_core = false; + break; + } + } + + /* + * A CPU in an idle core is always the best choice for tasks with + * cookies. + */ + return idle_core || rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) { + if (sched_core_cookie_match(cpu_rq(cpu), p)) + return true; + } + return false; +} + +static inline bool sched_core_enqueued(struct task_struct *p) +{ + return !RB_EMPTY_NODE(&p->core_node); +} + +extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); + +extern void sched_core_get(void); +extern void sched_core_put(void); + +#else /* !CONFIG_SCHED_CORE */ + +static inline bool sched_core_enabled(struct rq *rq) +{ + return false; +} + +static inline bool sched_core_disabled(void) +{ + return true; +} + +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + return true; +} +#endif /* CONFIG_SCHED_CORE */ + +static inline void lockdep_assert_rq_held(struct rq *rq) +{ + lockdep_assert_held(__rq_lockp(rq)); +} + +extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); +extern bool raw_spin_rq_trylock(struct rq *rq); +extern void raw_spin_rq_unlock(struct rq *rq); + +static inline void raw_spin_rq_lock(struct rq *rq) +{ + raw_spin_rq_lock_nested(rq, 0); +} + +static inline void raw_spin_rq_lock_irq(struct rq *rq) +{ + local_irq_disable(); + raw_spin_rq_lock(rq); +} + +static inline void raw_spin_rq_unlock_irq(struct rq *rq) +{ + raw_spin_rq_unlock(rq); + local_irq_enable(); +} + +static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) +{ + unsigned long flags; + local_irq_save(flags); + raw_spin_rq_lock(rq); + return flags; +} + +static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags) +{ + raw_spin_rq_unlock(rq); + local_irq_restore(flags); +} + +#define raw_spin_rq_lock_irqsave(rq, flags) \ +do { \ + flags = _raw_spin_rq_lock_irqsave(rq); \ +} while (0) #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); @@ -1040,21 +1398,59 @@ static inline void update_idle_core(struct rq *rq) static inline void update_idle_core(struct rq *rq) { } #endif -DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline struct task_struct *task_of(struct sched_entity *se) +{ + SCHED_WARN_ON(!entity_is_task(se)); + return container_of(se, struct task_struct, se); +} -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() this_cpu_ptr(&runqueues) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() raw_cpu_ptr(&runqueues) +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} -extern void update_rq_clock(struct rq *rq); +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} -static inline u64 __rq_clock_broken(struct rq *rq) +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) { - return READ_ONCE(rq->clock); + return grp->my_q; } +#else + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} +#endif + +extern void update_rq_clock(struct rq *rq); + /* * rq::clock_update_flags bits * @@ -1074,7 +1470,7 @@ static inline u64 __rq_clock_broken(struct rq *rq) * * if (rq-clock_update_flags >= RQCF_UPDATED) * - * to check if %RQCF_UPADTED is set. It'll never be shifted more than + * to check if %RQCF_UPDATED is set. It'll never be shifted more than * one position though, because the next rq_unpin_lock() will shift it * back. */ @@ -1093,7 +1489,7 @@ static inline void assert_clock_updated(struct rq *rq) static inline u64 rq_clock(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); assert_clock_updated(rq); return rq->clock; @@ -1101,25 +1497,43 @@ static inline u64 rq_clock(struct rq *rq) static inline u64 rq_clock_task(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); assert_clock_updated(rq); return rq->clock_task; } +/** + * By default the decay is the default pelt decay period. + * The decay shift can change the decay period in + * multiples of 32. + * Decay shift Decay period(ms) + * 0 32 + * 1 64 + * 2 128 + * 3 256 + * 4 512 + */ +extern int sched_thermal_decay_shift; + +static inline u64 rq_clock_thermal(struct rq *rq) +{ + return rq_clock_task(rq) >> sched_thermal_decay_shift; +} + static inline void rq_clock_skip_update(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rq->clock_update_flags |= RQCF_REQ_SKIP; } /* * See rt task throttling, which is the only time a skip - * request is cancelled. + * request is canceled. */ static inline void rq_clock_cancel_skipupdate(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rq->clock_update_flags &= ~RQCF_REQ_SKIP; } @@ -1136,13 +1550,28 @@ struct rq_flags { #endif }; +extern struct balance_callback balance_push_callback; + +/* + * Lockdep annotation that avoids accidental unlocks; it's like a + * sticky/continuous lockdep_assert_held(). + * + * This avoids code that has access to 'struct rq *rq' (basically everything in + * the scheduler) from accidentally unlocking the rq if they do not also have a + * copy of the (on-stack) 'struct rq_flags rf'. + * + * Also see Documentation/locking/lockdep-design.rst. + */ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); #ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; +#ifdef CONFIG_SMP + SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); +#endif #endif } @@ -1153,12 +1582,12 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) rf->clock_update_flags = RQCF_UPDATED; #endif - lockdep_unpin_lock(&rq->lock, rf->cookie); + lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); } static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { - lockdep_repin_lock(&rq->lock, rf->cookie); + lockdep_repin_lock(__rq_lockp(rq), rf->cookie); #ifdef CONFIG_SCHED_DEBUG /* @@ -1179,7 +1608,7 @@ static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); } static inline void @@ -1188,7 +1617,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(p->pi_lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } @@ -1196,7 +1625,7 @@ static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock_irqsave(&rq->lock, rf->flags); + raw_spin_rq_lock_irqsave(rq, rf->flags); rq_pin_lock(rq, rf); } @@ -1204,7 +1633,7 @@ static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock_irq(&rq->lock); + raw_spin_rq_lock_irq(rq); rq_pin_lock(rq, rf); } @@ -1212,24 +1641,16 @@ static inline void rq_lock(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); rq_pin_lock(rq, rf); } static inline void -rq_relock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); -} - -static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock_irqrestore(&rq->lock, rf->flags); + raw_spin_rq_unlock_irqrestore(rq, rf->flags); } static inline void @@ -1237,7 +1658,7 @@ rq_unlock_irq(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); } static inline void @@ -1245,7 +1666,7 @@ rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); } static inline struct rq * @@ -1269,12 +1690,14 @@ enum numa_topology_type { extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); -extern void sched_init_numa(void); +extern void sched_init_numa(int offline_node); +extern void sched_update_numa(int cpu, bool online); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); #else -static inline void sched_init_numa(void) { } +static inline void sched_init_numa(int offline_node) { } +static inline void sched_update_numa(int cpu, bool online) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) @@ -1307,21 +1730,24 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static inline void queue_balance_callback(struct rq *rq, - struct callback_head *head, + struct balance_callback *head, void (*func)(struct rq *rq)) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); - if (unlikely(head->next)) + /* + * Don't (re)queue an already queued item; nor queue anything when + * balance_push() is active, see the comment with + * balance_push_callback. + */ + if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; - head->func = (void (*)(struct callback_head *))func; + head->func = func; head->next = rq->balance_callback; rq->balance_callback = head; } -extern void sched_ttwu_pending(void); - #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ lockdep_is_held(&sched_domains_mutex)) @@ -1337,8 +1763,6 @@ extern void sched_ttwu_pending(void); for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) -#define for_each_lower_domain(sd) for (; sd; sd = sd->child) - /** * highest_flag_domain - Return highest sched_domain containing flag. * @cpu: The CPU whose highest level of sched domain is to @@ -1382,6 +1806,11 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +static __always_inline bool sched_asym_cpucap_active(void) +{ + return static_branch_unlikely(&sched_asym_cpucapacity); +} + struct sched_group_capacity { atomic_t ref; /* @@ -1398,7 +1827,7 @@ struct sched_group_capacity { int id; #endif - unsigned long cpumask[0]; /* Balance mask */ + unsigned long cpumask[]; /* Balance mask */ }; struct sched_group { @@ -1408,6 +1837,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; int asym_prefer_cpu; /* CPU of highest priority in group */ + int flags; /* * The CPUs this group covers. @@ -1416,7 +1846,7 @@ struct sched_group { * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) */ - unsigned long cpumask[0]; + unsigned long cpumask[]; }; static inline struct cpumask *sched_group_span(struct sched_group *sg) @@ -1432,45 +1862,50 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) return to_cpumask(sg->sgc->cpumask); } -/** - * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. - * @group: The group whose first CPU is to be returned. - */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ - return cpumask_first(sched_group_span(group)); -} - extern int group_balance_cpu(struct sched_group *sg); -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -void register_sched_domain_sysctl(void); +#ifdef CONFIG_SCHED_DEBUG +void update_sched_domain_debugfs(void); void dirty_sched_domain_sysctl(int cpu); -void unregister_sched_domain_sysctl(void); #else -static inline void register_sched_domain_sysctl(void) +static inline void update_sched_domain_debugfs(void) { } static inline void dirty_sched_domain_sysctl(int cpu) { } -static inline void unregister_sched_domain_sysctl(void) +#endif + +extern int sched_update_scaling(void); +#endif /* CONFIG_SMP */ + +#include "stats.h" + +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) + +extern void __sched_core_account_forceidle(struct rq *rq); + +static inline void sched_core_account_forceidle(struct rq *rq) { + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); } -#endif -extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); +extern void __sched_core_tick(struct rq *rq); -#else +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); +} -static inline void sched_ttwu_pending(void) { } +#else -static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } +static inline void sched_core_account_forceidle(struct rq *rq) {} -#endif /* CONFIG_SMP */ +static inline void sched_core_tick(struct rq *rq) {} -#include "stats.h" -#include "autogroup.h" +#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */ #ifdef CONFIG_CGROUP_SCHED @@ -1503,6 +1938,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; + p->se.depth = tg->se[cpu] ? tg->se[cpu]->depth + 1 : 0; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -1531,11 +1967,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. */ smp_wmb(); -#ifdef CONFIG_THREAD_INFO_IN_TASK - WRITE_ONCE(p->cpu, cpu); -#else WRITE_ONCE(task_thread_info(p)->cpu, cpu); -#endif p->wake_cpu = cpu; #endif } @@ -1544,7 +1976,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include <linux/static_key.h> # define const_debug __read_mostly #else # define const_debug const @@ -1560,7 +1991,7 @@ enum { #undef SCHED_FEAT -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * To support run-time toggling of sched features, all the translation units @@ -1568,6 +1999,7 @@ enum { */ extern const_debug unsigned int sysctl_sched_features; +#ifdef CONFIG_JUMP_LABEL #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -1580,7 +2012,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */ +#else /* !CONFIG_JUMP_LABEL */ + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* CONFIG_JUMP_LABEL */ + +#else /* !SCHED_DEBUG */ /* * Each translation unit has its own copy of sysctl_sched_features to allow @@ -1596,7 +2034,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */ +#endif /* SCHED_DEBUG */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; @@ -1619,7 +2057,7 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -static inline int task_running(struct rq *rq, struct task_struct *p) +static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP return p->on_cpu; @@ -1638,12 +2076,19 @@ static inline int task_on_rq_migrating(struct task_struct *p) return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; } -/* - * wake flags - */ -#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* Child wakeup after fork */ -#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ +/* Wake flags. The first three directly map to some SD flag value */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ + +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ + +#ifdef CONFIG_SMP +static_assert(WF_EXEC == SD_BALANCE_EXEC); +static_assert(WF_FORK == SD_BALANCE_FORK); +static_assert(WF_TTWU == SD_BALANCE_WAKE); +#endif /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1700,7 +2145,6 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) struct sched_class { - const struct sched_class *next; #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; @@ -1709,7 +2153,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); - bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); @@ -1720,16 +2164,22 @@ struct sched_class { #ifdef CONFIG_SMP int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + + struct task_struct * (*pick_task)(struct rq *rq); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, - const struct cpumask *newmask); + const struct cpumask *newmask, + u32 flags); void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + + struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); #endif void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); @@ -1738,7 +2188,7 @@ struct sched_class { /* * The switched_from() call is allowed to drop rq->lock, therefore we - * cannot assume the switched_from/switched_to pair is serliazed by + * cannot assume the switched_from/switched_to pair is serialized by * rq->lock. They are however serialized by p->pi_lock. */ void (*switched_from)(struct rq *this_rq, struct task_struct *task); @@ -1751,11 +2201,8 @@ struct sched_class { void (*update_curr)(struct rq *rq); -#define TASK_SET_GROUP 0 -#define TASK_MOVE_GROUP 1 - #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_change_group)(struct task_struct *p, int type); + void (*task_change_group)(struct task_struct *p); #endif }; @@ -1767,21 +2214,36 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) static inline void set_next_task(struct rq *rq, struct task_struct *next) { - WARN_ON_ONCE(rq->curr != next); next->sched_class->set_next_task(rq, next, false); } -#ifdef CONFIG_SMP -#define sched_class_highest (&stop_sched_class) -#else -#define sched_class_highest (&dl_sched_class) -#endif + +/* + * Helper to define a sched_class instance; each one is placed in a separate + * section which is ordered by the linker script: + * + * include/asm-generic/vmlinux.lds.h + * + * *CAREFUL* they are laid out in *REVERSE* order!!! + * + * Also enforce alignment on the instance, not the type, to guarantee layout. + */ +#define DEFINE_SCHED_CLASS(name) \ +const struct sched_class name##_sched_class \ + __aligned(__alignof__(struct sched_class)) \ + __section("__" #name "_sched_class") + +/* Defined in include/asm-generic/vmlinux.lds.h */ +extern struct sched_class __sched_class_highest[]; +extern struct sched_class __sched_class_lowest[]; #define for_class_range(class, _from, _to) \ - for (class = (_from); class != (_to); class = class->next) + for (class = (_from); class < (_to); class++) #define for_each_class(class) \ - for_class_range(class, sched_class_highest, NULL) + for_class_range(class, __sched_class_highest, __sched_class_lowest) + +#define sched_class_above(_a, _b) ((_a) < (_b)) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ -1812,13 +2274,39 @@ static inline bool sched_fair_runnable(struct rq *rq) extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); extern struct task_struct *pick_next_task_idle(struct rq *rq); +#define SCA_CHECK 0x01 +#define SCA_MIGRATE_DISABLE 0x02 +#define SCA_MIGRATE_ENABLE 0x04 +#define SCA_USER 0x08 + #ifdef CONFIG_SMP extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + +static inline struct task_struct *get_push_task(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + lockdep_assert_rq_held(rq); + + if (rq->push_busy) + return NULL; + + if (p->nr_cpus_allowed == 1) + return NULL; + + if (p->migration_disabled) + return NULL; + + rq->push_busy = true; + return get_task_struct(p); +} + +extern int push_cpu_stop(void *arg); #endif @@ -1864,16 +2352,17 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); -extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 +#define MAX_BW_BITS (64 - BW_SHIFT) +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1) unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); @@ -1890,12 +2379,7 @@ extern int __init sched_tick_offload_init(void); */ static inline void sched_update_tick_dependency(struct rq *rq) { - int cpu; - - if (!tick_nohz_full_enabled()) - return; - - cpu = cpu_of(rq); + int cpu = cpu_of(rq); if (!tick_nohz_full_cpu(cpu)) return; @@ -1915,6 +2399,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count) unsigned prev_nr = rq->nr_running; rq->nr_running = prev_nr + count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, count); + } #ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) { @@ -1929,6 +2416,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count) static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, -count); + } + /* Check if we still need preemption */ sched_update_tick_dependency(rq); } @@ -1938,9 +2429,32 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +#ifdef CONFIG_PREEMPT_RT +#define SCHED_NR_MIGRATE_BREAK 8 +#else +#define SCHED_NR_MIGRATE_BREAK 32 +#endif + extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_DEBUG +extern unsigned int sysctl_sched_latency; +extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_idle_min_granularity; +extern unsigned int sysctl_sched_wakeup_granularity; +extern int sysctl_resched_latency_warn_ms; +extern int sysctl_resched_latency_warn_once; + +extern unsigned int sysctl_sched_tunable_scaling; + +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_numa_balancing_hot_threshold; +#endif + #ifdef CONFIG_SCHED_HRTICK /* @@ -1950,17 +2464,39 @@ extern const_debug unsigned int sysctl_sched_migration_cost; */ static inline int hrtick_enabled(struct rq *rq) { - if (!sched_feat(HRTICK)) - return 0; if (!cpu_active(cpu_of(rq))) return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } +static inline int hrtick_enabled_fair(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtick_enabled(rq); +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + if (!sched_feat(HRTICK_DL)) + return 0; + return hrtick_enabled(rq); +} + void hrtick_start(struct rq *rq, u64 delay); #else +static inline int hrtick_enabled_fair(struct rq *rq) +{ + return 0; +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + return 0; +} + static inline int hrtick_enabled(struct rq *rq) { return 0; @@ -1968,7 +2504,24 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ +#ifndef arch_scale_freq_tick +static __always_inline +void arch_scale_freq_tick(void) +{ +} +#endif + #ifndef arch_scale_freq_capacity +/** + * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. + * @cpu: the CPU in question. + * + * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. + * + * f_curr + * ------ * SCHED_CAPACITY_SCALE + * f_max + */ static __always_inline unsigned long arch_scale_freq_capacity(int cpu) { @@ -1976,10 +2529,56 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +#ifdef CONFIG_SCHED_DEBUG +/* + * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to + * acquire rq lock instead of rq_lock(). So at the end of these two functions + * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of + * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning. + */ +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) +{ + rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPTION + rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); +#endif +} +#else +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} +#endif -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); +#ifdef CONFIG_SMP + +static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) +{ +#ifdef CONFIG_SCHED_CORE + /* + * In order to not have {0,2},{1,3} turn into into an AB-BA, + * order by core-id first and cpu-id second. + * + * Notably: + * + * double_rq_lock(0,3); will take core-0, core-1 lock + * double_rq_lock(1,2); will take core-1, core-0 lock + * + * when only cpu-id is considered. + */ + if (rq1->core->cpu < rq2->core->cpu) + return true; + if (rq1->core->cpu > rq2->core->cpu) + return false; + + /* + * __sched_core_flip() relies on SMT having cpu-id lock order. + */ +#endif + return rq1->cpu < rq2->cpu; +} + +extern void double_rq_lock(struct rq *rq1, struct rq *rq2); + +#ifdef CONFIG_PREEMPTION /* * fair double_lock_balance: Safely acquires both rq->locks in a fair @@ -1994,7 +2593,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); double_rq_lock(this_rq, busiest); return 1; @@ -2013,20 +2612,22 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - int ret = 0; - - if (unlikely(!raw_spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - raw_spin_unlock(&this_rq->lock); - raw_spin_lock(&busiest->lock); - raw_spin_lock_nested(&this_rq->lock, - SINGLE_DEPTH_NESTING); - ret = 1; - } else - raw_spin_lock_nested(&busiest->lock, - SINGLE_DEPTH_NESTING); + if (__rq_lockp(this_rq) == __rq_lockp(busiest) || + likely(raw_spin_rq_trylock(busiest))) { + double_rq_clock_clear_update(this_rq, busiest); + return 0; + } + + if (rq_order_less(this_rq, busiest)) { + raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); + double_rq_clock_clear_update(this_rq, busiest); + return 0; } - return ret; + + raw_spin_rq_unlock(this_rq); + double_rq_lock(this_rq, busiest); + + return 1; } #endif /* CONFIG_PREEMPTION */ @@ -2036,11 +2637,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) */ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) { - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work well under rq->lock */ - raw_spin_unlock(&this_rq->lock); - BUG_ON(1); - } + lockdep_assert_irqs_disabled(); return _double_lock_balance(this_rq, busiest); } @@ -2048,8 +2645,9 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - raw_spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); + if (__rq_lockp(this_rq) != __rq_lockp(busiest)) + raw_spin_rq_unlock(busiest); + lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_); } static inline void double_lock(spinlock_t *l1, spinlock_t *l2) @@ -2080,31 +2678,6 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) } /* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - raw_spin_lock(&rq1->lock); - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); - } else { - raw_spin_lock(&rq2->lock); - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); - } - } -} - -/* * double_rq_unlock - safely unlock two runqueues * * Note this does not restore interrupts like task_rq_unlock, @@ -2114,11 +2687,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - raw_spin_unlock(&rq1->lock); - if (rq1 != rq2) - raw_spin_unlock(&rq2->lock); + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_unlock(rq2); else __release(rq2->lock); + raw_spin_rq_unlock(rq1); } extern void set_rq_online (struct rq *rq); @@ -2137,10 +2710,11 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { - BUG_ON(!irqs_disabled()); - BUG_ON(rq1 != rq2); - raw_spin_lock(&rq1->lock); + WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE(rq1 != rq2); + raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ + double_rq_clock_clear_update(rq1, rq2); } /* @@ -2153,8 +2727,8 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - BUG_ON(rq1 != rq2); - raw_spin_unlock(&rq1->lock); + WARN_ON_ONCE(rq1 != rq2); + raw_spin_rq_unlock(rq1); __release(rq2->lock); } @@ -2164,7 +2738,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); #ifdef CONFIG_SCHED_DEBUG -extern bool sched_debug_enabled; +extern bool sched_debug_verbose; extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); @@ -2172,6 +2746,8 @@ extern void print_dl_stats(struct seq_file *m, int cpu); extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); + +extern void resched_latency_warn(int cpu, u64 latency); #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); @@ -2179,6 +2755,8 @@ extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, unsigned long tpf, unsigned long gsf, unsigned long gpf); #endif /* CONFIG_NUMA_BALANCING */ +#else +static inline void resched_latency_warn(int cpu, u64 latency) {} #endif /* CONFIG_SCHED_DEBUG */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); @@ -2191,11 +2769,19 @@ extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON #define NOHZ_BALANCE_KICK_BIT 0 #define NOHZ_STATS_KICK_BIT 1 +#define NOHZ_NEWILB_KICK_BIT 2 +#define NOHZ_NEXT_KICK_BIT 3 +/* Run rebalance_domains() */ #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) +/* Update blocked load */ #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) +/* Update blocked load when entering idle */ +#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT) +/* Update nohz.next_balance */ +#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT) -#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) @@ -2204,33 +2790,12 @@ extern void nohz_balance_exit_idle(struct rq *rq); static inline void nohz_balance_exit_idle(struct rq *rq) { } #endif - -#ifdef CONFIG_SMP -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); - int i; - - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - for_each_cpu_and(i, rd->span, cpu_active_mask) { - struct rq *rq = cpu_rq(i); - - rq->dl.extra_bw += bw; - } -} +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void nohz_run_idle_balance(int cpu); #else -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); - - dl->extra_bw += bw; -} +static inline void nohz_run_idle_balance(int cpu) { } #endif - #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { u64 total; @@ -2243,7 +2808,7 @@ DECLARE_PER_CPU(struct irqtime, cpu_irqtime); /* * Returns the irqtime minus the softirq time computed by ksoftirqd. - * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime + * Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime * and never move forward. */ static inline u64 irq_time_read(int cpu) @@ -2299,40 +2864,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ -#ifdef CONFIG_UCLAMP_TASK -unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); - -static __always_inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); - - if (p) { - min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); - max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX)); - } - - /* - * Since CPU's {min,max}_util clamps are MAX aggregated considering - * RUNNABLE tasks with _different_ clamps, we can end up with an - * inversion. Fix it now when the clamps are applied. - */ - if (unlikely(min_util >= max_util)) - return min_util; - - return clamp(util, min_util, max_util); -} -#else /* CONFIG_UCLAMP_TASK */ -static inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - return util; -} -#endif /* CONFIG_UCLAMP_TASK */ - #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant # define arch_scale_freq_invariant() true @@ -2346,29 +2877,41 @@ static inline unsigned long capacity_orig_of(int cpu) { return cpu_rq(cpu)->cpu_capacity_orig; } -#endif /** - * enum schedutil_type - CPU utilization type + * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency * @ENERGY_UTIL: Utilization used during energy calculation * * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time * need to be aggregated differently depending on the usage made of them. This - * enum is used within schedutil_freq_util() to differentiate the types of + * enum is used within effective_cpu_util() to differentiate the types of * utilization expected by the callers, and adjust the aggregation accordingly. */ -enum schedutil_type { +enum cpu_util_type { FREQUENCY_UTIL, ENERGY_UTIL, }; -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL - -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + enum cpu_util_type type, struct task_struct *p); +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the original capacity of @cpu is + * greater than or equal to task's deadline density right shifted by + * (BW_SHIFT - SCHED_CAPACITY_SHIFT) and false otherwise. + */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned long cap = arch_scale_cpu_capacity(cpu); + + return cap >= p->dl.dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT); +} + static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -2379,30 +2922,158 @@ static inline unsigned long cpu_util_dl(struct rq *rq) return READ_ONCE(rq->avg_dl.util_avg); } -static inline unsigned long cpu_util_cfs(struct rq *rq) +/** + * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks. + * @cpu: the CPU to get the utilization for. + * + * The unit of the return value must be the same as the one of CPU capacity + * so that CPU utilization can be compared with CPU capacity. + * + * CPU utilization is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on that CPU. + * It represents the amount of CPU capacity currently used by CFS tasks in + * the range [0..max CPU capacity] with max CPU capacity being the CPU + * capacity at f_max. + * + * The estimated CPU utilization is defined as the maximum between CPU + * utilization and sum of the estimated utilization of the currently + * runnable tasks on that CPU. It preserves a utilization "snapshot" of + * previously-executed tasks, which helps better deduce how busy a CPU will + * be when a long-sleeping task wakes up. The contribution to CPU utilization + * of such a task would be significantly decayed at this point of time. + * + * CPU utilization can be higher than the current CPU capacity + * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because + * of rounding errors as well as task migrations or wakeups of new tasks. + * CPU utilization has to be capped to fit into the [0..max CPU capacity] + * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) + * could be seen as over-utilized even though CPU1 has 20% of spare CPU + * capacity. CPU utilization is allowed to overshoot current CPU capacity + * though since this is useful for predicting the CPU capacity required + * after task migrations (scheduler-driven DVFS). + * + * Return: (Estimated) utilization for the specified CPU. + */ +static inline unsigned long cpu_util_cfs(int cpu) { - unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + struct cfs_rq *cfs_rq; + unsigned long util; + + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); if (sched_feat(UTIL_EST)) { util = max_t(unsigned long, util, - READ_ONCE(rq->cfs.avg.util_est.enqueued)); + READ_ONCE(cfs_rq->avg.util_est.enqueued)); } - return util; + return min(util, capacity_orig_of(cpu)); } static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } -#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) +#endif + +#ifdef CONFIG_UCLAMP_TASK +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); + +/** + * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ +static __always_inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { - return 0; + unsigned long min_util = 0; + unsigned long max_util = 0; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + if (p) { + min_util = uclamp_eff_value(p, UCLAMP_MIN); + max_util = uclamp_eff_value(p, UCLAMP_MAX); + + /* + * Ignore last runnable task's max clamp, as this task will + * reset it. Similarly, no need to read the rq's min clamp. + */ + if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) + goto out; + } + + min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value)); + max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value)); +out: + /* + * Since CPU's {min,max}_util clamps are MAX aggregated considering + * RUNNABLE tasks with _different_ clamps, we can end up with an + * inversion. Fix it now when the clamps are applied. + */ + if (unlikely(min_util >= max_util)) + return min_util; + + return clamp(util, min_util, max_util); } -#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +/* Is the rq being capped/throttled by uclamp_max? */ +static inline bool uclamp_rq_is_capped(struct rq *rq) +{ + unsigned long rq_util; + unsigned long max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return false; + + rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; +} + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} +#else /* CONFIG_UCLAMP_TASK */ +static inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) +{ + return util; +} + +static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } + +static inline bool uclamp_is_used(void) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) @@ -2492,3 +3163,24 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } #endif + +extern void swake_up_all_locked(struct swait_queue_head *q); +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#ifdef CONFIG_PREEMPT_DYNAMIC +extern int preempt_dynamic_mode; +extern int sched_dynamic_mode(const char *str); +extern void sched_dynamic_update(int mode); +#endif + +static inline void update_current_exec_runtime(struct task_struct *curr, + u64 now, u64 delta_exec) +{ + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = now; + cgroup_account_cputime(curr, delta_exec); +} + +#endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h new file mode 100644 index 000000000000..2eb23dd0f285 --- /dev/null +++ b/kernel/sched/smp.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Scheduler internal SMP callback types and methods between the scheduler + * and other internal parts of the core kernel: + */ + +extern void sched_ttwu_pending(void *arg); + +extern void send_call_function_single_ipi(int cpu); + +#ifdef CONFIG_SMP +extern void flush_smp_call_function_queue(void); +#else +static inline void flush_smp_call_function_queue(void) { } +#endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 750fb3c67eed..857f837f52cb 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -2,7 +2,110 @@ /* * /proc/schedstat implementation */ -#include "sched.h" + +void __update_stats_wait_start(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats) +{ + u64 wait_start, prev_wait_start; + + wait_start = rq_clock(rq); + prev_wait_start = schedstat_val(stats->wait_start); + + if (p && likely(wait_start > prev_wait_start)) + wait_start -= prev_wait_start; + + __schedstat_set(stats->wait_start, wait_start); +} + +void __update_stats_wait_end(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats) +{ + u64 delta = rq_clock(rq) - schedstat_val(stats->wait_start); + + if (p) { + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. + */ + __schedstat_set(stats->wait_start, delta); + + return; + } + + trace_sched_stat_wait(p, delta); + } + + __schedstat_set(stats->wait_max, + max(schedstat_val(stats->wait_max), delta)); + __schedstat_inc(stats->wait_count); + __schedstat_add(stats->wait_sum, delta); + __schedstat_set(stats->wait_start, 0); +} + +void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats) +{ + u64 sleep_start, block_start; + + sleep_start = schedstat_val(stats->sleep_start); + block_start = schedstat_val(stats->block_start); + + if (sleep_start) { + u64 delta = rq_clock(rq) - sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(stats->sleep_max))) + __schedstat_set(stats->sleep_max, delta); + + __schedstat_set(stats->sleep_start, 0); + __schedstat_add(stats->sum_sleep_runtime, delta); + + if (p) { + account_scheduler_latency(p, delta >> 10, 1); + trace_sched_stat_sleep(p, delta); + } + } + + if (block_start) { + u64 delta = rq_clock(rq) - block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(stats->block_max))) + __schedstat_set(stats->block_max, delta); + + __schedstat_set(stats->block_start, 0); + __schedstat_add(stats->sum_sleep_runtime, delta); + __schedstat_add(stats->sum_block_runtime, delta); + + if (p) { + if (p->in_iowait) { + __schedstat_add(stats->iowait_sum, delta); + __schedstat_inc(stats->iowait_count); + trace_sched_stat_iowait(p, delta); + } + + trace_sched_stat_blocked(p, delta); + + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(p), + delta >> 20); + } + account_scheduler_latency(p, delta >> 10, 0); + } + } +} /* * Current schedstat API version. @@ -74,7 +177,7 @@ static int show_schedstat(struct seq_file *seq, void *v) } /* - * This itererator needs some explanation. + * This iterator needs some explanation. * It returns 1 for the header position. * This means 2 is cpu 0. * In a hotplugged system some CPUs, including cpu 0, may be missing so we have diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index ba683fe81a6e..84a188913cc9 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -1,7 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_STATS_H +#define _KERNEL_STATS_H #ifdef CONFIG_SCHEDSTATS +extern struct static_key_false sched_schedstats; + /* * Expects runqueue lock to be held for atomicity of update */ @@ -25,7 +29,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) } static inline void -rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { if (rq) rq->rq_sched_info.run_delay += delta; @@ -40,9 +44,33 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) +void __update_stats_wait_start(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); + +void __update_stats_wait_end(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); +void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); + +static inline void +check_schedstat_required(void) +{ + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n"); +} + #else /* !CONFIG_SCHEDSTATS: */ + static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } -static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { } static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } # define schedstat_enabled() 0 # define __schedstat_inc(var) do { } while (0) @@ -53,9 +81,37 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_set(var, val) do { } while (0) # define schedstat_val(var) 0 # define schedstat_val_or_zero(var) 0 + +# define __update_stats_wait_start(rq, p, stats) do { } while (0) +# define __update_stats_wait_end(rq, p, stats) do { } while (0) +# define __update_stats_enqueue_sleeper(rq, p, stats) do { } while (0) +# define check_schedstat_required() do { } while (0) + #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_FAIR_GROUP_SCHED +struct sched_entity_stats { + struct sched_entity se; + struct sched_statistics stats; +} __no_randomize_layout; +#endif + +static inline struct sched_statistics * +__schedstats_from_se(struct sched_entity *se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + if (!entity_is_task(se)) + return &container_of(se, struct sched_entity_stats, se)->stats; +#endif + return &task_of(se)->stats; +} + #ifdef CONFIG_PSI +void psi_task_change(struct task_struct *task, int clear, int set); +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep); +void psi_account_irqtime(struct task_struct *task, u32 delta); + /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -69,8 +125,11 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) if (static_branch_likely(&psi_disabled)) return; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; + if (!wakeup || p->sched_psi_wake_requeue) { - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) set |= TSK_MEMSTALL; if (p->sched_psi_wake_requeue) p->sched_psi_wake_requeue = 0; @@ -84,20 +143,24 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) static inline void psi_dequeue(struct task_struct *p, bool sleep) { - int clear = TSK_RUNNING, set = 0; + int clear = TSK_RUNNING; if (static_branch_likely(&psi_disabled)) return; - if (!sleep) { - if (p->flags & PF_MEMSTALL) - clear |= TSK_MEMSTALL; - } else { - if (p->in_iowait) - set |= TSK_IOWAIT; - } + /* + * A voluntary sleep is a dequeue followed by a task switch. To + * avoid walking all ancestors twice, psi_task_switch() handles + * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. + * Do nothing here. + */ + if (sleep) + return; - psi_task_change(p, clear, set); + if (p->in_memstall) + clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); + + psi_task_change(p, clear, 0); } static inline void psi_ttwu_dequeue(struct task_struct *p) @@ -109,14 +172,14 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) * deregister its sleep-persistent psi states from the old * queue, and let psi_enqueue() know it has to requeue. */ - if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + if (unlikely(p->in_iowait || p->in_memstall)) { struct rq_flags rf; struct rq *rq; int clear = 0; if (p->in_iowait) clear |= TSK_IOWAIT; - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) clear |= TSK_MEMSTALL; rq = __task_rq_lock(p, &rf); @@ -126,45 +189,45 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) } } -static inline void psi_task_tick(struct rq *rq) +static inline void psi_sched_switch(struct task_struct *prev, + struct task_struct *next, + bool sleep) { if (static_branch_likely(&psi_disabled)) return; - if (unlikely(rq->curr->flags & PF_MEMSTALL)) - psi_memstall_tick(rq->curr, cpu_of(rq)); + psi_task_switch(prev, next, sleep); } + #else /* CONFIG_PSI */ static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} static inline void psi_dequeue(struct task_struct *p, bool sleep) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} -static inline void psi_task_tick(struct rq *rq) {} +static inline void psi_sched_switch(struct task_struct *prev, + struct task_struct *next, + bool sleep) {} +static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO -static inline void sched_info_reset_dequeued(struct task_struct *t) -{ - t->sched_info.last_queued = 0; -} - /* * We are interested in knowing how long it was from the *first* time a * task was queued to the time that it finally hit a CPU, we call this routine * from dequeue_task() to account for possible rq->clock skew across CPUs. The * delta taken on each CPU would annul the skew. */ -static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) +static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(rq), delta = 0; + unsigned long long delta = 0; - if (sched_info_on()) { - if (t->sched_info.last_queued) - delta = now - t->sched_info.last_queued; - } - sched_info_reset_dequeued(t); + if (!t->sched_info.last_queued) + return; + + delta = rq_clock(rq) - t->sched_info.last_queued; + t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; - rq_sched_info_dequeued(rq, delta); + rq_sched_info_dequeue(rq, delta); } /* @@ -174,11 +237,14 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) */ static void sched_info_arrive(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(rq), delta = 0; + unsigned long long now, delta = 0; - if (t->sched_info.last_queued) - delta = now - t->sched_info.last_queued; - sched_info_reset_dequeued(t); + if (!t->sched_info.last_queued) + return; + + now = rq_clock(rq); + delta = now - t->sched_info.last_queued; + t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; @@ -189,14 +255,12 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) /* * This function is only called from enqueue_task(), but also only updates * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. + * sched_info_dequeue() will clear that stamp when appropriate. */ -static inline void sched_info_queued(struct rq *rq, struct task_struct *t) +static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t) { - if (sched_info_on()) { - if (!t->sched_info.last_queued) - t->sched_info.last_queued = rq_clock(rq); - } + if (!t->sched_info.last_queued) + t->sched_info.last_queued = rq_clock(rq); } /* @@ -204,7 +268,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) * due, typically, to expiring its time slice (this may also be called when * switching to the idle task). Now we can calculate how long we ran. * Also, if the process is still in the TASK_RUNNING state, call - * sched_info_queued() to mark that it has now again started waiting on + * sched_info_enqueue() to mark that it has now again started waiting on * the runqueue. */ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) @@ -213,8 +277,8 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) rq_sched_info_depart(rq, delta); - if (t->state == TASK_RUNNING) - sched_info_queued(rq, t); + if (task_is_running(t)) + sched_info_enqueue(rq, t); } /* @@ -223,7 +287,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { /* * prev now departs the CPU. It's not interesting to record @@ -237,18 +301,10 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct sched_info_arrive(rq, next); } -static inline void -sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) -{ - if (sched_info_on()) - __sched_info_switch(rq, prev, next); -} - #else /* !CONFIG_SCHED_INFO: */ -# define sched_info_queued(rq, t) do { } while (0) -# define sched_info_reset_dequeued(t) do { } while (0) -# define sched_info_dequeued(rq, t) do { } while (0) -# define sched_info_depart(rq, t) do { } while (0) -# define sched_info_arrive(rq, next) do { } while (0) +# define sched_info_enqueue(rq, t) do { } while (0) +# define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ + +#endif /* _KERNEL_STATS_H */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4c9e9975684f..85590599b4d6 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,11 +7,10 @@ * * See kernel/stop_machine.c */ -#include "sched.h" #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } @@ -34,15 +33,24 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir stop->se.exec_start = rq_clock_task(rq); } -static struct task_struct *pick_next_task_stop(struct rq *rq) +static struct task_struct *pick_task_stop(struct rq *rq) { if (!sched_stop_runnable(rq)) return NULL; - set_next_task_stop(rq, rq->stop, true); return rq->stop; } +static struct task_struct *pick_next_task_stop(struct rq *rq) +{ + struct task_struct *p = pick_task_stop(rq); + + if (p) + set_next_task_stop(rq, p, true); + + return p; +} + static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { @@ -63,20 +71,17 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *curr = rq->curr; - u64 delta_exec; + u64 now, delta_exec; - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + now = rq_clock_task(rq); + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); - - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); + schedstat_set(curr->stats.exec_max, + max(curr->stats.exec_max, delta_exec)); - curr->se.exec_start = rq_clock_task(rq); - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); } /* @@ -102,12 +107,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) BUG(); /* how!?, what priority? */ } -static unsigned int -get_rr_interval_stop(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_stop(struct rq *rq) { } @@ -115,8 +114,7 @@ static void update_curr_stop(struct rq *rq) /* * Simple, special scheduling class for the per-CPU stop tasks: */ -const struct sched_class stop_sched_class = { - .next = &dl_sched_class, +DEFINE_SCHED_CLASS(stop) = { .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, @@ -130,14 +128,13 @@ const struct sched_class stop_sched_class = { #ifdef CONFIG_SMP .balance = balance_stop, + .pick_task = pick_task_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif .task_tick = task_tick_stop, - .get_rr_interval = get_rr_interval_stop, - .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e83a3f8449f6..76b9b796e695 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,7 +2,6 @@ /* * <linux/swait.h> (simple wait queues ) implementation: */ -#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) @@ -32,6 +31,19 @@ void swake_up_locked(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_locked); +/* + * Wake up all waiters. This is an interface which is solely exposed for + * completions and not for general usage. + * + * It is intentionally different from swake_up_all() to allow usage from + * hard interrupt context and interrupt disabled regions. + */ +void swake_up_all_locked(struct swait_queue_head *q) +{ + while (!list_empty(&q->task_list)) + swake_up_locked(q); +} + void swake_up_one(struct swait_queue_head *q) { unsigned long flags; @@ -69,7 +81,7 @@ void swake_up_all(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_all); -static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) { wait->task = current; if (list_empty(&wait->task_list)) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dfb64c08a407..8739c2a5a54e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2,7 +2,6 @@ /* * Scheduler topology setup/handling methods */ -#include "sched.h" DEFINE_MUTEX(sched_domains_mutex); @@ -14,33 +13,33 @@ static cpumask_var_t sched_domains_tmpmask2; static int __init sched_debug_setup(char *str) { - sched_debug_enabled = true; + sched_debug_verbose = true; return 0; } -early_param("sched_debug", sched_debug_setup); +early_param("sched_verbose", sched_debug_setup); static inline bool sched_debug(void) { - return sched_debug_enabled; + return sched_debug_verbose; } +#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, +const struct sd_flag_debug sd_flag_debug[] = { +#include <linux/sched/sd_flags.h> +}; +#undef SD_FLAG + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { struct sched_group *group = sd->groups; + unsigned long flags = sd->flags; + unsigned int idx; cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); - return -1; - } - printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); @@ -51,6 +50,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + unsigned int flag = BIT(idx); + unsigned int meta_flags = sd_flag_debug[idx].meta_flags; + + if ((meta_flags & SDF_SHARED_CHILD) && sd->child && + !(sd->child->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in child\n", + sd_flag_debug[idx].name); + + if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && + !(sd->parent->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", + sd_flag_debug[idx].name); + } + printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { @@ -59,7 +73,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpumask_weight(sched_group_span(group))) { + if (cpumask_empty(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; @@ -116,7 +130,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_debug_enabled) + if (!sched_debug_verbose) return; if (!sd) { @@ -137,7 +151,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ -# define sched_debug_enabled 0 +# define sched_debug_verbose 0 # define sched_domain_debug(sd, cpu) do { } while (0) static inline bool sched_debug(void) { @@ -145,23 +159,22 @@ static inline bool sched_debug(void) } #endif /* CONFIG_SCHED_DEBUG */ +/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | +static const unsigned int SD_DEGENERATE_GROUPS_MASK = +#include <linux/sched/sd_flags.h> +0; +#undef SD_FLAG + static int sd_degenerate(struct sched_domain *sd) { if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUCAPACITY | - SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { - if (sd->groups != sd->groups->next) - return 0; - } + if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && + (sd->groups != sd->groups->next)) + return 0; /* Following flags don't use groups */ if (sd->flags & (SD_WAKE_AFFINE)) @@ -182,19 +195,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 0; /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } + if (parent->groups == parent->groups->next) + pflags &= ~SD_DEGENERATE_GROUPS_MASK; + if (~cflags & pflags) return 0; @@ -203,13 +206,22 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_STATIC_KEY_FALSE(sched_energy_present); -unsigned int sysctl_sched_energy_aware = 1; +static unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +void rebuild_sched_domains_energy(void) +{ + mutex_lock(&sched_energy_mutex); + sched_energy_update = true; + rebuild_sched_domains(); + sched_energy_update = false; + mutex_unlock(&sched_energy_mutex); +} + #ifdef CONFIG_PROC_SYSCTL -int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int sched_energy_aware_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; @@ -219,17 +231,33 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); - if (state != sysctl_sched_energy_aware) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = 1; - rebuild_sched_domains(); - sched_energy_update = 0; - mutex_unlock(&sched_energy_mutex); - } + if (state != sysctl_sched_energy_aware) + rebuild_sched_domains_energy(); } return ret; } + +static struct ctl_table sched_energy_aware_sysctls[] = { + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sched_energy_aware_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_energy_aware_sysctls); + return 0; +} + +late_initcall(sched_energy_aware_sysctl_init); #endif static void free_pd(struct perf_domain *pd) @@ -282,10 +310,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map, printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); while (pd) { - printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }", + printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", cpumask_first(perf_domain_span(pd)), cpumask_pr_args(perf_domain_span(pd)), - em_pd_nr_cap_states(pd->em_pd)); + em_pd_nr_perf_states(pd->em_pd)); pd = pd->next; } @@ -317,31 +345,33 @@ static void sched_energy_set(bool has_eas) * EAS can be used on a root domain if it meets all the following conditions: * 1. an Energy Model (EM) is available; * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. - * 3. the EM complexity is low enough to keep scheduling overheads low; - * 4. schedutil is driving the frequency of all CPUs of the rd; + * 3. no SMT is detected. + * 4. the EM complexity is low enough to keep scheduling overheads low; + * 5. schedutil is driving the frequency of all CPUs of the rd; + * 6. frequency invariance support is present; * * The complexity of the Energy Model is defined as: * - * C = nr_pd * (nr_cpus + nr_cs) + * C = nr_pd * (nr_cpus + nr_ps) * * with parameters defined as: * - nr_pd: the number of performance domains * - nr_cpus: the number of CPUs - * - nr_cs: the sum of the number of capacity states of all performance + * - nr_ps: the sum of the number of performance states of all performance * domains (for example, on a system with 2 performance domains, - * with 10 capacity states each, nr_cs = 2 * 10 = 20). + * with 10 performance states each, nr_ps = 2 * 10 = 20). * * It is generally not a good idea to use such a model in the wake-up path on * very complex platforms because of the associated scheduling overheads. The * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs - * with per-CPU DVFS and less than 8 capacity states each, for example. + * with per-CPU DVFS and less than 8 performance states each, for example. */ #define EM_MAX_COMPLEXITY 2048 extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { - int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); + int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; @@ -360,6 +390,21 @@ static bool build_perf_domains(const struct cpumask *cpu_map) goto free; } + /* EAS definitely does *not* handle SMT */ + if (sched_smt_active()) { + pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", + cpumask_pr_args(cpu_map)); + goto free; + } + + if (!arch_scale_freq_invariant()) { + if (sched_debug()) { + pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", + cpumask_pr_args(cpu_map)); + } + goto free; + } + for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ if (find_pd(pd, i)) @@ -386,15 +431,15 @@ static bool build_perf_domains(const struct cpumask *cpu_map) pd = tmp; /* - * Count performance domains and capacity states for the + * Count performance domains and performance states for the * complexity check. */ nr_pd++; - nr_cs += em_pd_nr_cap_states(pd->em_pd); + nr_ps += em_pd_nr_perf_states(pd->em_pd); } /* Bail out if the Energy Model complexity is too high. */ - if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) { + if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", cpumask_pr_args(cpu_map)); goto free; @@ -442,7 +487,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) struct root_domain *old_rd = NULL; unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); if (rq->rd) { old_rd = rq->rd; @@ -468,7 +513,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); if (old_rd) call_rcu(&old_rd->rcu, free_rootdomain); @@ -501,9 +546,10 @@ static int init_rootdomain(struct root_domain *rd) #ifdef HAVE_RT_PUSH_IPI rd->rto_cpu = -1; raw_spin_lock_init(&rd->rto_lock); - init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); + rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif + rd->visit_gen = 0; init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; @@ -649,7 +695,7 @@ static void update_top_cache_domain(int cpu) sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); - sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY); + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); } @@ -689,8 +735,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp = sd; sd = sd->parent; destroy_sched_domain(tmp); - if (sd) + if (sd) { + struct sched_group *sg = sd->groups; + + /* + * sched groups hold the flags of the child sched + * domain for convenience. Clear such flags since + * the child is being destroyed. + */ + do { + sg->flags = 0; + } while (sg != sd->groups); + sd->child = NULL; + } } sched_domain_debug(sd, cpu); @@ -886,10 +944,12 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) return NULL; sg_span = sched_group_span(sg); - if (sd->child) + if (sd->child) { cpumask_copy(sg_span, sched_domain_span(sd->child)); - else + sg->flags = sd->child->flags; + } else { cpumask_copy(sg_span, sched_domain_span(sd)); + } atomic_inc(&sg->ref); return sg; @@ -904,7 +964,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, int cpu; build_balance_mask(sd, sg, mask); - cpu = cpumask_first_and(sched_group_span(sg), mask); + cpu = cpumask_first(mask); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); if (atomic_inc_return(&sg->sgc->ref) == 1) @@ -923,6 +983,31 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; } +static struct sched_domain * +find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling) +{ + /* + * The proper descendant would be the one whose child won't span out + * of sd + */ + while (sibling->child && + !cpumask_subset(sched_domain_span(sibling->child), + sched_domain_span(sd))) + sibling = sibling->child; + + /* + * As we are referencing sgc across different topology level, we need + * to go down to skip those sched_domains which don't contribute to + * scheduling because they will be degenerated in cpu_attach_domain + */ + while (sibling->child && + cpumask_equal(sched_domain_span(sibling->child), + sched_domain_span(sibling))) + sibling = sibling->child; + + return sibling; +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -956,6 +1041,41 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; + /* + * Usually we build sched_group by sibling's child sched_domain + * But for machines whose NUMA diameter are 3 or above, we move + * to build sched_group by sibling's proper descendant's child + * domain because sibling's child sched_domain will span out of + * the sched_domain being built as below. + * + * Smallest diameter=3 topology is: + * + * node 0 1 2 3 + * 0: 10 20 30 40 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 40 30 20 10 + * + * 0 --- 1 --- 2 --- 3 + * + * NUMA-3 0-3 N/A N/A 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-2 0-2 0-3 0-3 1-3 + * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} + * + * NUMA-1 0-1 0-2 1-3 2-3 + * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} + * + * NUMA-0 0 1 2 3 + * + * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the + * group span isn't a subset of the domain span. + */ + if (sibling->child && + !cpumask_subset(sched_domain_span(sibling->child), span)) + sibling = find_descended_sibling(sd, sibling); + sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; @@ -963,7 +1083,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); - init_overlap_sched_group(sd, sg); + init_overlap_sched_group(sibling, sg); if (!first) first = sg; @@ -1079,6 +1199,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); + sg->flags = child->flags; } else { cpumask_set_cpu(cpu, sched_group_span(sg)); cpumask_set_cpu(cpu, group_balance_mask(sg)); @@ -1177,6 +1298,116 @@ next: } /* + * Asymmetric CPU capacity bits + */ +struct asym_cap_data { + struct list_head link; + unsigned long capacity; + unsigned long cpus[]; +}; + +/* + * Set of available CPUs grouped by their corresponding capacities + * Each list entry contains a CPU mask reflecting CPUs that share the same + * capacity. + * The lifespan of data is unlimited. + */ +static LIST_HEAD(asym_cap_list); + +#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) + +/* + * Verify whether there is any CPU capacity asymmetry in a given sched domain. + * Provides sd_flags reflecting the asymmetry scope. + */ +static inline int +asym_cpu_capacity_classify(const struct cpumask *sd_span, + const struct cpumask *cpu_map) +{ + struct asym_cap_data *entry; + int count = 0, miss = 0; + + /* + * Count how many unique CPU capacities this domain spans across + * (compare sched_domain CPUs mask with ones representing available + * CPUs capacities). Take into account CPUs that might be offline: + * skip those. + */ + list_for_each_entry(entry, &asym_cap_list, link) { + if (cpumask_intersects(sd_span, cpu_capacity_span(entry))) + ++count; + else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry))) + ++miss; + } + + WARN_ON_ONCE(!count && !list_empty(&asym_cap_list)); + + /* No asymmetry detected */ + if (count < 2) + return 0; + /* Some of the available CPU capacity values have not been detected */ + if (miss) + return SD_ASYM_CPUCAPACITY; + + /* Full asymmetry */ + return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL; + +} + +static inline void asym_cpu_capacity_update_data(int cpu) +{ + unsigned long capacity = arch_scale_cpu_capacity(cpu); + struct asym_cap_data *entry = NULL; + + list_for_each_entry(entry, &asym_cap_list, link) { + if (capacity == entry->capacity) + goto done; + } + + entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL); + if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n")) + return; + entry->capacity = capacity; + list_add(&entry->link, &asym_cap_list); +done: + __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); +} + +/* + * Build-up/update list of CPUs grouped by their capacities + * An update requires explicit request to rebuild sched domains + * with state indicating CPU topology changes. + */ +static void asym_cpu_capacity_scan(void) +{ + struct asym_cap_data *entry, *next; + int cpu; + + list_for_each_entry(entry, &asym_cap_list, link) + cpumask_clear(cpu_capacity_span(entry)); + + for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) + asym_cpu_capacity_update_data(cpu); + + list_for_each_entry_safe(entry, next, &asym_cap_list, link) { + if (cpumask_empty(cpu_capacity_span(entry))) { + list_del(&entry->link); + kfree(entry); + } + } + + /* + * Only one capacity value has been detected i.e. this system is symmetric. + * No need to keep this data around. + */ + if (list_is_singular(&asym_cap_list)) { + entry = list_first_entry(&asym_cap_list, typeof(*entry), link); + list_del(&entry->link); + kfree(entry); + } +} + +/* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ @@ -1221,13 +1452,13 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, case sa_rootdomain: if (!atomic_read(&d->rd->refcount)) free_rootdomain(&d->rd->rcu); - /* Fall through */ + fallthrough; case sa_sd: free_percpu(d->sd); - /* Fall through */ + fallthrough; case sa_sd_storage: __sdt_free(cpu_map); - /* Fall through */ + fallthrough; case sa_none: break; } @@ -1281,7 +1512,6 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; -int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; #endif /* @@ -1294,7 +1524,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; * SD_SHARE_CPUCAPACITY - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1305,17 +1534,17 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) + SD_ASYM_PACKING) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, - struct sched_domain *child, int dflags, int cpu) + struct sched_domain *child, int cpu) { struct sd_data *sdd = &tl->data; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); int sd_id, sd_weight, sd_flags = 0; + struct cpumask *sd_span; #ifdef CONFIG_NUMA /* @@ -1330,21 +1559,17 @@ sd_init(struct sched_domain_topology_level *tl, sd_flags = (*tl->sd_flags)(); if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; - - /* Apply detected topology flags */ - sd_flags |= dflags; + sd_flags &= TOPOLOGY_SD_FLAGS; *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, - .busy_factor = 32, - .imbalance_pct = 125, + .busy_factor = 16, + .imbalance_pct = 117, .cache_nice_tries = 0, - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE + .flags = 1*SD_BALANCE_NEWIDLE | 1*SD_BALANCE_EXEC | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE @@ -1360,32 +1585,29 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, .max_newidle_lb_cost = 0, - .next_decay_max_lb_cost = jiffies, + .last_decay_max_lb_cost = jiffies, .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, #endif }; - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - sd_id = cpumask_first(sched_domain_span(sd)); + sd_span = sched_domain_span(sd); + cpumask_and(sd_span, cpu_map, tl->mask(cpu)); + sd_id = cpumask_first(sd_span); + + sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); + + WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) == + (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), + "CPU capacity asymmetry not supported on SMT\n"); /* * Convert topological properties into behaviour. */ - - if (sd->flags & SD_ASYM_CPUCAPACITY) { - struct sched_domain *t = sd; - - /* - * Don't attempt to spread across CPUs of different capacities. - */ - if (sd->child) - sd->child->flags &= ~SD_PREFER_SIBLING; - - for_each_lower_domain(t) - t->flags |= SD_BALANCE_WAKE; - } + /* Don't attempt to spread across CPUs of different capacities. */ + if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) + sd->child->flags &= ~SD_PREFER_SIBLING; if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; @@ -1433,6 +1655,11 @@ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif + +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, +#endif + #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif @@ -1442,6 +1669,7 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology_saved; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) @@ -1452,6 +1680,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) return; sched_domain_topology = tl; + sched_domain_topology_saved = NULL; } #ifdef CONFIG_NUMA @@ -1475,8 +1704,12 @@ static void sched_numa_warn(const char *str) for (i = 0; i < nr_node_ids; i++) { printk(KERN_WARNING " "); - for (j = 0; j < nr_node_ids; j++) - printk(KERN_CONT "%02d ", node_distance(i,j)); + for (j = 0; j < nr_node_ids; j++) { + if (!node_state(i, N_CPU) || !node_state(j, N_CPU)) + printk(KERN_CONT "(%02d) ", node_distance(i,j)); + else + printk(KERN_CONT " %02d ", node_distance(i,j)); + } printk(KERN_CONT "\n"); } printk(KERN_WARNING "\n"); @@ -1484,19 +1717,34 @@ static void sched_numa_warn(const char *str) bool find_numa_distance(int distance) { - int i; + bool found = false; + int i, *distances; if (distance == node_distance(0, 0)) return true; + rcu_read_lock(); + distances = rcu_dereference(sched_domains_numa_distance); + if (!distances) + goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { - if (sched_domains_numa_distance[i] == distance) - return true; + if (distances[i] == distance) { + found = true; + break; + } } +unlock: + rcu_read_unlock(); - return false; + return found; } +#define for_each_cpu_node_but(n, nbut) \ + for_each_node_state(n, N_CPU) \ + if (n == nbut) \ + continue; \ + else + /* * A system can have three types of NUMA topology: * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system @@ -1516,7 +1764,7 @@ bool find_numa_distance(int distance) * there is an intermediary node C, which is < N hops away from both * nodes A and B, the system is a glueless mesh. */ -static void init_numa_topology_type(void) +static void init_numa_topology_type(int offline_node) { int a, b, c, n; @@ -1527,14 +1775,14 @@ static void init_numa_topology_type(void) return; } - for_each_online_node(a) { - for_each_online_node(b) { + for_each_cpu_node_but(a, offline_node) { + for_each_cpu_node_but(b, offline_node) { /* Find two nodes furthest removed from each other. */ if (node_distance(a, b) < n) continue; /* Is there an intermediary node between a and b? */ - for_each_online_node(c) { + for_each_cpu_node_but(c, offline_node) { if (node_distance(a, c) < n && node_distance(b, c) < n) { sched_numa_topology_type = @@ -1547,68 +1795,67 @@ static void init_numa_topology_type(void) return; } } + + pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n"); + sched_numa_topology_type = NUMA_DIRECT; } -void sched_init_numa(void) -{ - int next_distance, curr_distance = node_distance(0, 0); - struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); - if (!sched_domains_numa_distance) - return; +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) - /* Includes NUMA identity node at level 0. */ - sched_domains_numa_distance[level++] = curr_distance; - sched_domains_numa_levels = level; +void sched_init_numa(int offline_node) +{ + struct sched_domain_topology_level *tl; + unsigned long *distance_map; + int nr_levels = 0; + int i, j; + int *distances; + struct cpumask ***masks; /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. */ - next_distance = curr_distance; - for (i = 0; i < nr_node_ids; i++) { - for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); + if (!distance_map) + return; - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); + bitmap_zero(distance_map, NR_DISTANCE_VALUES); + for_each_cpu_node_but(i, offline_node) { + for_each_cpu_node_but(j, offline_node) { + int distance = node_distance(i, j); - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { + sched_numa_warn("Invalid distance value range"); + bitmap_free(distance_map); + return; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + bitmap_set(distance_map, distance, 1); } + } + /* + * We can now figure out how many unique distance values there are and + * allocate memory accordingly. + */ + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; + distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); + if (!distances) { + bitmap_free(distance_map); + return; + } + + for (i = 0, j = 0; i < nr_levels; i++, j++) { + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); + distances[i] = j; } + rcu_assign_pointer(sched_domains_numa_distance, distances); + + bitmap_free(distance_map); /* - * 'level' contains the number of unique distances + * 'nr_levels' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers. @@ -1617,36 +1864,40 @@ void sched_init_numa(void) /* * Here, we should temporarily reset sched_domains_numa_levels to 0. * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be + * the array will contain less then 'nr_levels' members. This could be * dangerous when we use it to iterate array sched_domains_numa_masks[][] * in other functions. * - * We reset it to 'level' at the end of this function. + * We reset it to 'nr_levels' at the end of this function. */ sched_domains_numa_levels = 0; - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); - if (!sched_domains_numa_masks) + masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); + if (!masks) return; /* * Now for each level, construct a mask per node which contains all * CPUs of nodes that are that many hops away from us. */ - for (i = 0; i < level; i++) { - sched_domains_numa_masks[i] = - kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); - if (!sched_domains_numa_masks[i]) + for (i = 0; i < nr_levels; i++) { + masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!masks[i]) return; - for (j = 0; j < nr_node_ids; j++) { + for_each_cpu_node_but(j, offline_node) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + int k; + if (!mask) return; - sched_domains_numa_masks[i][j] = mask; + masks[i][j] = mask; + + for_each_cpu_node_but(k, offline_node) { + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + sched_numa_warn("Node-distance not symmetric"); - for_each_node(k) { if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -1654,11 +1905,12 @@ void sched_init_numa(void) } } } + rcu_assign_pointer(sched_domains_numa_masks, masks); /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level + 1) * + tl = kzalloc((i + nr_levels + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -1681,7 +1933,7 @@ void sched_init_numa(void) /* * .. and append 'j' levels of NUMA goodness. */ - for (j = 1; j < level; i++, j++) { + for (j = 1; j < nr_levels; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, @@ -1691,12 +1943,67 @@ void sched_init_numa(void) }; } + sched_domain_topology_saved = sched_domain_topology; sched_domain_topology = tl; - sched_domains_numa_levels = level; - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + sched_domains_numa_levels = nr_levels; + WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); + + init_numa_topology_type(offline_node); +} + + +static void sched_reset_numa(void) +{ + int nr_levels, *distances; + struct cpumask ***masks; + + nr_levels = sched_domains_numa_levels; + sched_domains_numa_levels = 0; + sched_max_numa_distance = 0; + sched_numa_topology_type = NUMA_DIRECT; + distances = sched_domains_numa_distance; + rcu_assign_pointer(sched_domains_numa_distance, NULL); + masks = sched_domains_numa_masks; + rcu_assign_pointer(sched_domains_numa_masks, NULL); + if (distances || masks) { + int i, j; + + synchronize_rcu(); + kfree(distances); + for (i = 0; i < nr_levels && masks; i++) { + if (!masks[i]) + continue; + for_each_node(j) + kfree(masks[i][j]); + kfree(masks[i]); + } + kfree(masks); + } + if (sched_domain_topology_saved) { + kfree(sched_domain_topology); + sched_domain_topology = sched_domain_topology_saved; + sched_domain_topology_saved = NULL; + } +} + +/* + * Call with hotplug lock held + */ +void sched_update_numa(int cpu, bool online) +{ + int node; + + node = cpu_to_node(cpu); + /* + * Scheduler NUMA topology is updated when the first CPU of a + * node is onlined or the last CPU of a node is offlined. + */ + if (cpumask_weight(cpumask_of_node(node)) != 1) + return; - init_numa_topology_type(); + sched_reset_numa(); + sched_init_numa(online ? NUMA_NO_NODE : node); } void sched_domains_numa_masks_set(unsigned int cpu) @@ -1706,6 +2013,10 @@ void sched_domains_numa_masks_set(unsigned int cpu) for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { + if (!node_state(j, N_CPU)) + continue; + + /* Set ourselves in the remote node's masks */ if (node_distance(j, node) <= sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } @@ -1717,8 +2028,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu) int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + for (j = 0; j < nr_node_ids; j++) { + if (sched_domains_numa_masks[i][j]) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } } } @@ -1732,14 +2045,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu) */ int sched_numa_find_closest(const struct cpumask *cpus, int cpu) { - int i, j = cpu_to_node(cpu); + int i, j = cpu_to_node(cpu), found = nr_cpu_ids; + struct cpumask ***masks; + rcu_read_lock(); + masks = rcu_dereference(sched_domains_numa_masks); + if (!masks) + goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { - cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]); - if (cpu < nr_cpu_ids) - return cpu; + if (!masks[i][j]) + break; + cpu = cpumask_any_and(cpus, masks[i][j]); + if (cpu < nr_cpu_ids) { + found = cpu; + break; + } } - return nr_cpu_ids; +unlock: + rcu_read_unlock(); + + return found; } #endif /* CONFIG_NUMA */ @@ -1851,9 +2176,9 @@ static void __sdt_free(const struct cpumask *cpu_map) static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int dflags, int cpu) + struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu); + struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); if (child) { sd->level = child->level + 1; @@ -1916,65 +2241,6 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl, } /* - * Find the sched_domain_topology_level where all CPU capacities are visible - * for all CPUs. - */ -static struct sched_domain_topology_level -*asym_cpu_capacity_level(const struct cpumask *cpu_map) -{ - int i, j, asym_level = 0; - bool asym = false; - struct sched_domain_topology_level *tl, *asym_tl = NULL; - unsigned long cap; - - /* Is there any asymmetry? */ - cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); - - for_each_cpu(i, cpu_map) { - if (arch_scale_cpu_capacity(i) != cap) { - asym = true; - break; - } - } - - if (!asym) - return NULL; - - /* - * Examine topology from all CPU's point of views to detect the lowest - * sched_domain_topology_level where a highest capacity CPU is visible - * to everyone. - */ - for_each_cpu(i, cpu_map) { - unsigned long max_capacity = arch_scale_cpu_capacity(i); - int tl_id = 0; - - for_each_sd_topology(tl) { - if (tl_id < asym_level) - goto next_level; - - for_each_cpu_and(j, tl->mask(i), cpu_map) { - unsigned long capacity; - - capacity = arch_scale_cpu_capacity(j); - - if (capacity <= max_capacity) - continue; - - max_capacity = capacity; - asym_level = tl_id; - asym_tl = tl; - } -next_level: - tl_id++; - } - } - - return asym_tl; -} - - -/* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs */ @@ -1986,7 +2252,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; - struct sched_domain_topology_level *tl_asym; bool has_asym = false; if (WARN_ON(cpumask_empty(cpu_map))) @@ -1996,25 +2261,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (alloc_state != sa_rootdomain) goto error; - tl_asym = asym_cpu_capacity_level(cpu_map); - /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; sd = NULL; for_each_sd_topology(tl) { - int dflags = 0; - - if (tl == tl_asym) { - dflags |= SD_ASYM_CPUCAPACITY; - has_asym = true; - } if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) goto error; - sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); + sd = build_sched_domain(tl, cpu_map, attr, sd, i); + + has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; @@ -2039,6 +2298,64 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + /* + * Calculate an allowed NUMA imbalance such that LLCs do not get + * imbalanced. + */ + for_each_cpu(i, cpu_map) { + unsigned int imb = 0; + unsigned int imb_span = 1; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + struct sched_domain *child = sd->child; + + if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child && + (child->flags & SD_SHARE_PKG_RESOURCES)) { + struct sched_domain __rcu *top_p; + unsigned int nr_llcs; + + /* + * For a single LLC per node, allow an + * imbalance up to 12.5% of the node. This is + * arbitrary cutoff based two factors -- SMT and + * memory channels. For SMT-2, the intent is to + * avoid premature sharing of HT resources but + * SMT-4 or SMT-8 *may* benefit from a different + * cutoff. For memory channels, this is a very + * rough estimate of how many channels may be + * active and is based on recent CPUs with + * many cores. + * + * For multiple LLCs, allow an imbalance + * until multiple tasks would share an LLC + * on one node while LLCs on another node + * remain idle. This assumes that there are + * enough logical CPUs per LLC to avoid SMT + * factors and that there is a correlation + * between LLCs and memory channels. + */ + nr_llcs = sd->span_weight / child->span_weight; + if (nr_llcs == 1) + imb = sd->span_weight >> 3; + else + imb = nr_llcs; + imb = max(1U, imb); + sd->imb_numa_nr = imb; + + /* Set span based on the first NUMA domain. */ + top_p = sd->parent; + while (top_p && !(top_p->flags & SD_NUMA)) { + top_p = top_p->parent; + } + imb_span = top_p ? top_p->span_weight : sd->span_weight; + } else { + int factor = max(1U, (sd->span_weight / imb_span)); + + sd->imb_numa_nr = imb * factor; + } + } + } + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) @@ -2067,7 +2384,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); - if (rq && sched_debug_enabled) { + if (rq && sched_debug_verbose) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); } @@ -2085,7 +2402,7 @@ static cpumask_var_t *doms_cur; /* Number of sched domains in 'doms_cur': */ static int ndoms_cur; -/* Attribues of custom domains in 'doms_cur' */ +/* Attributes of custom domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* @@ -2143,13 +2460,13 @@ int sched_init_domains(const struct cpumask *cpu_map) zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); arch_update_cpu_topology(); + asym_cpu_capacity_scan(); ndoms_cur = 1; doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; - cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN)); err = build_sched_domains(doms_cur[0], NULL); - register_sched_domain_sysctl(); return err; } @@ -2224,11 +2541,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], lockdep_assert_held(&sched_domains_mutex); - /* Always unregister in case we don't destroy any domains: */ - unregister_sched_domain_sysctl(); - /* Let the architecture update CPU core mappings: */ new_topology = arch_update_cpu_topology(); + /* Trigger rebuilding CPU capacity asymmetry data */ + if (new_topology) + asym_cpu_capacity_scan(); if (!doms_new) { WARN_ON_ONCE(dattr_new); @@ -2237,7 +2554,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], if (doms_new) { n = 1; cpumask_and(doms_new[0], cpu_active_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); } } else { n = ndoms_new; @@ -2272,7 +2589,7 @@ match1: n = 0; doms_new = &fallback_doms; cpumask_and(doms_new[0], cpu_active_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); } /* Build new domains: */ @@ -2315,7 +2632,7 @@ match3: dattr_cur = dattr_new; ndoms_cur = ndoms_new; - register_sched_domain_sysctl(); + update_sched_domain_debugfs(); } /* diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index ba059fbfc53a..9860bb9a847c 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,7 +4,6 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ -#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { @@ -37,6 +36,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue } EXPORT_SYMBOL(add_wait_queue_exclusive); +void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL_GPL(add_wait_queue_priority); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -57,7 +67,11 @@ EXPORT_SYMBOL(remove_wait_queue); /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. + * number) then we wake that number of exclusive tasks, and potentially all + * the non-exclusive tasks. Normally, exclusive tasks will be at the end of + * the list and any non-exclusive tasks will be woken first. A priority task + * may be at the head of the list, and can consume the event without any other + * tasks being woken. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns @@ -223,6 +237,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +void __wake_up_pollfree(struct wait_queue_head *wq_head) +{ + __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE)); + /* POLLFREE must have cleared the queue. */ + WARN_ON_ONCE(waitqueue_active(wq_head)); +} + /* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any @@ -249,17 +270,22 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent } EXPORT_SYMBOL(prepare_to_wait); -void +/* Returns true if we are the first waiter in the queue, false otherwise. */ +bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; + bool was_empty = false; wq_entry->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - if (list_empty(&wq_entry->entry)) + if (list_empty(&wq_entry->entry)) { + was_empty = list_empty(&wq_head->head); __add_wait_queue_entry_tail(wq_head, wq_entry); + } set_current_state(state); spin_unlock_irqrestore(&wq_head->lock, flags); + return was_empty; } EXPORT_SYMBOL(prepare_to_wait_exclusive); @@ -389,7 +415,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wq_entry->entry); + list_del_init_careful(&wq_entry->entry); return ret; } diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 02ce292b9bc0..0b1cd985dc27 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only + /* * The implementation of the wait_bit*() and related waiting APIs: */ -#include "sched.h" #define WAIT_TABLE_BITS 8 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) @@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) ret = (*action)(&wbq_entry->key, mode); - } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); finish_wait(wq_head, &wbq_entry->wq_entry); |