aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-01-28 10:07:09 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2020-01-28 10:07:09 -0800
commitc677124e631d97130e4ff7db6e10acdfb7a82321 (patch)
tree514d9ed975d58dc077ee73a2469583ccc95a2aa1 /kernel/sched
parentMerge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (diff)
parentsched/fair: Define sched_idle_cpu() only for SMP configurations (diff)
downloadlinux-dev-c677124e631d97130e4ff7db6e10acdfb7a82321.tar.xz
linux-dev-c677124e631d97130e4ff7db6e10acdfb7a82321.zip
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "These were the main changes in this cycle: - More -rt motivated separation of CONFIG_PREEMPT and CONFIG_PREEMPTION. - Add more low level scheduling topology sanity checks and warnings to filter out nonsensical topologies that break scheduling. - Extend uclamp constraints to influence wakeup CPU placement - Make the RT scheduler more aware of asymmetric topologies and CPU capacities, via uclamp metrics, if CONFIG_UCLAMP_TASK=y - Make idle CPU selection more consistent - Various fixes, smaller cleanups, updates and enhancements - please see the git log for details" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (58 commits) sched/fair: Define sched_idle_cpu() only for SMP configurations sched/topology: Assert non-NUMA topology masks don't (partially) overlap idle: fix spelling mistake "iterrupts" -> "interrupts" sched/fair: Remove redundant call to cpufreq_update_util() sched/psi: create /proc/pressure and /proc/pressure/{io|memory|cpu} only when psi enabled sched/fair: Fix sgc->{min,max}_capacity calculation for SD_OVERLAP sched/fair: calculate delta runnable load only when it's needed sched/cputime: move rq parameter in irqtime_account_process_tick stop_machine: Make stop_cpus() static sched/debug: Reset watchdog on all CPUs while processing sysrq-t sched/core: Fix size of rq::uclamp initialization sched/uclamp: Fix a bug in propagating uclamp value in new cgroups sched/fair: Load balance aggressively for SCHED_IDLE CPUs sched/fair : Improve update_sd_pick_busiest for spare capacity case watchdog: Remove soft_lockup_hrtimer_cnt and related code sched/rt: Make RT capacity-aware sched/fair: Make EAS wakeup placement consider uclamp restrictions sched/fair: Make task_fits_capacity() consider uclamp restrictions sched/uclamp: Rename uclamp_util_with() into uclamp_rq_util_with() sched/uclamp: Make uclamp util helpers use and return UL values ...
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/clock.c6
-rw-r--r--kernel/sched/core.c34
-rw-r--r--kernel/sched/cpufreq_schedutil.c2
-rw-r--r--kernel/sched/cpupri.c25
-rw-r--r--kernel/sched/cpupri.h4
-rw-r--r--kernel/sched/cputime.c15
-rw-r--r--kernel/sched/debug.c11
-rw-r--r--kernel/sched/fair.c171
-rw-r--r--kernel/sched/idle.c2
-rw-r--r--kernel/sched/pelt.c20
-rw-r--r--kernel/sched/psi.c10
-rw-r--r--kernel/sched/rt.c83
-rw-r--r--kernel/sched/sched.h24
-rw-r--r--kernel/sched/topology.c39
-rw-r--r--kernel/sched/wait_bit.c1
15 files changed, 298 insertions, 149 deletions
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 1152259a4ca0..12bca64dff73 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -370,7 +370,7 @@ u64 sched_clock_cpu(int cpu)
if (sched_clock_stable())
return sched_clock() + __sched_clock_offset;
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return sched_clock();
preempt_disable_notrace();
@@ -393,7 +393,7 @@ void sched_clock_tick(void)
if (sched_clock_stable())
return;
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return;
lockdep_assert_irqs_disabled();
@@ -460,7 +460,7 @@ void __init sched_clock_init(void)
u64 sched_clock_cpu(int cpu)
{
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return 0;
return sched_clock();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 90e4b00ace89..fc1dfc007604 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -919,17 +919,17 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
return uc_req;
}
-unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
/* Task currently refcounted: use back-annotated (effective) value */
if (p->uclamp[clamp_id].active)
- return p->uclamp[clamp_id].value;
+ return (unsigned long)p->uclamp[clamp_id].value;
uc_eff = uclamp_eff_get(p, clamp_id);
- return uc_eff.value;
+ return (unsigned long)uc_eff.value;
}
/*
@@ -1253,7 +1253,8 @@ static void __init init_uclamp(void)
mutex_init(&uclamp_mutex);
for_each_possible_cpu(cpu) {
- memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
+ memset(&cpu_rq(cpu)->uclamp, 0,
+ sizeof(struct uclamp_rq)*UCLAMP_CNT);
cpu_rq(cpu)->uclamp_flags = 0;
}
@@ -4504,7 +4505,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
- int old_prio, delta;
+ int old_prio;
struct rq_flags rf;
struct rq *rq;
@@ -4538,19 +4539,18 @@ void set_user_nice(struct task_struct *p, long nice)
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
- delta = p->prio - old_prio;
- if (queued) {
+ if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
- resched_curr(rq);
- }
if (running)
set_next_task(rq, p);
+
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ p->sched_class->prio_changed(rq, p, old_prio);
+
out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -7100,6 +7100,12 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent)
sched_online_group(tg, parent);
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ /* Propagate the effective uclamp value for the new group */
+ cpu_util_update_eff(css);
+#endif
+
return 0;
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 9b8916fd00a2..7fbaee24c824 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -238,7 +238,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
*/
util = util_cfs + cpu_util_rt(rq);
if (type == FREQUENCY_UTIL)
- util = uclamp_util_with(rq, util, p);
+ util = uclamp_rq_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index b7abca987d94..1a2719e1350a 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -46,6 +46,8 @@ static int convert_prio(int prio)
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
+ * @fitness_fn: A pointer to a function to do custom checks whether the CPU
+ * fits a specific criteria so that we only return those CPUs.
*
* Note: This function returns the recommended CPUs as calculated during the
* current invocation. By the time the call returns, the CPUs may have in
@@ -57,7 +59,8 @@ static int convert_prio(int prio)
* Return: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
- struct cpumask *lowest_mask)
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu))
{
int idx = 0;
int task_pri = convert_prio(p->prio);
@@ -98,6 +101,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
continue;
if (lowest_mask) {
+ int cpu;
+
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
/*
@@ -108,7 +113,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
* condition, simply act as though we never hit this
* priority level and continue on.
*/
- if (cpumask_any(lowest_mask) >= nr_cpu_ids)
+ if (cpumask_empty(lowest_mask))
+ continue;
+
+ if (!fitness_fn)
+ return 1;
+
+ /* Ensure the capacity of the CPUs fit the task */
+ for_each_cpu(cpu, lowest_mask) {
+ if (!fitness_fn(p, cpu))
+ cpumask_clear_cpu(cpu, lowest_mask);
+ }
+
+ /*
+ * If no CPU at the current priority can fit the task
+ * continue looking
+ */
+ if (cpumask_empty(lowest_mask))
continue;
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 7dc20a3232e7..32dd520db11f 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -18,7 +18,9 @@ struct cpupri {
};
#ifdef CONFIG_SMP
-int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu));
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index d43318a489f2..cff3e656566d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -355,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int ticks)
+ int ticks)
{
u64 other, cputime = TICK_NSEC * ticks;
@@ -381,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime);
- } else if (p == rq->idle) {
+ } else if (p == this_rq()->idle) {
account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
account_guest_time(p, cputime);
@@ -392,14 +392,12 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
static void irqtime_account_idle_ticks(int ticks)
{
- struct rq *rq = this_rq();
-
- irqtime_account_process_tick(current, 0, rq, ticks);
+ irqtime_account_process_tick(current, 0, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int nr_ticks) { }
+ int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
@@ -473,13 +471,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
void account_process_tick(struct task_struct *p, int user_tick)
{
u64 cputime, steal;
- struct rq *rq = this_rq();
if (vtime_accounting_enabled_this_cpu())
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq, 1);
+ irqtime_account_process_tick(p, user_tick, 1);
return;
}
@@ -493,7 +490,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (user_tick)
account_user_time(p, cputime);
- else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
account_system_time(p, HARDIRQ_OFFSET, cputime);
else
account_idle_time(cputime);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f7e4579e746c..879d3ccf3806 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -751,9 +751,16 @@ void sysrq_sched_debug_show(void)
int cpu;
sched_debug_header(NULL);
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
+ /*
+ * Need to reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI or stop_machine.
+ */
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
print_cpu(NULL, cpu);
-
+ }
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ba749f579714..fe4e0d775375 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -801,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p)
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se, 0);
+ attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
@@ -3114,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
+ if (&rq->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -3366,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
-
- se->avg.runnable_load_sum = runnable_sum;
- se->avg.runnable_load_avg = runnable_load_avg;
if (se->on_rq) {
+ delta_sum = runnable_load_sum -
+ se_weight(se) * se->avg.runnable_load_sum;
+ delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
}
+
+ se->avg.runnable_load_sum = runnable_sum;
+ se->avg.runnable_load_avg = runnable_load_avg;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3520,7 +3521,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
@@ -3556,7 +3557,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
- cfs_rq_util_change(cfs_rq, flags);
+ cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
@@ -3614,7 +3615,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*
* IOW we're enqueueing a task on a new CPU.
*/
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
+ attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);
} else if (decayed) {
@@ -3711,6 +3712,20 @@ static inline unsigned long task_util_est(struct task_struct *p)
return max(task_util(p), _task_util_est(p));
}
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return clamp(task_util_est(p),
+ uclamp_eff_value(p, UCLAMP_MIN),
+ uclamp_eff_value(p, UCLAMP_MAX));
+}
+#else
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return task_util_est(p);
+}
+#endif
+
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
@@ -3822,7 +3837,7 @@ done:
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
- return fits_capacity(task_util_est(p), capacity);
+ return fits_capacity(uclamp_task_util(p), capacity);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -3857,7 +3872,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
@@ -5196,6 +5211,20 @@ static inline void update_overutilized_status(struct rq *rq)
static inline void update_overutilized_status(struct rq *rq) { }
#endif
+/* Runqueue only has SCHED_IDLE tasks enqueued */
+static int sched_idle_rq(struct rq *rq)
+{
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ rq->nr_running);
+}
+
+#ifdef CONFIG_SMP
+static int sched_idle_cpu(int cpu)
+{
+ return sched_idle_rq(cpu_rq(cpu));
+}
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5310,6 +5339,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
int idle_h_nr_running = task_has_idle_policy(p);
+ bool was_sched_idle = sched_idle_rq(rq);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5356,6 +5386,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
sub_nr_running(rq, 1);
+ /* balance early to pull high priority tasks */
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
+ rq->next_balance = jiffies;
+
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -5378,15 +5412,6 @@ static struct {
#endif /* CONFIG_NO_HZ_COMMON */
-/* CPU only has SCHED_IDLE tasks enqueued */
-static int sched_idle_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
- rq->nr_running);
-}
-
static unsigned long cpu_load(struct rq *rq)
{
return cfs_rq_load_avg(&rq->cfs);
@@ -5588,7 +5613,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
- int shallowest_idle_cpu = -1, si_cpu = -1;
+ int shallowest_idle_cpu = -1;
int i;
/* Check if we have any choice: */
@@ -5597,6 +5622,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ if (sched_idle_cpu(i))
+ return i;
+
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5619,12 +5647,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
- if (sched_idle_cpu(i)) {
- si_cpu = i;
- continue;
- }
-
+ } else if (shallowest_idle_cpu == -1) {
load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
@@ -5633,11 +5656,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
}
}
- if (shallowest_idle_cpu != -1)
- return shallowest_idle_cpu;
- if (si_cpu != -1)
- return si_cpu;
- return least_loaded_cpu;
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5790,7 +5809,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
*/
static int select_idle_smt(struct task_struct *p, int target)
{
- int cpu, si_cpu = -1;
+ int cpu;
if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5798,13 +5817,11 @@ static int select_idle_smt(struct task_struct *p, int target)
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- if (available_idle_cpu(cpu))
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
- if (si_cpu == -1 && sched_idle_cpu(cpu))
- si_cpu = cpu;
}
- return si_cpu;
+ return -1;
}
#else /* CONFIG_SCHED_SMT */
@@ -5828,12 +5845,13 @@ static inline int select_idle_smt(struct task_struct *p, int target)
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
int this = smp_processor_id();
- int cpu, nr = INT_MAX, si_cpu = -1;
+ int cpu, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5859,15 +5877,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = cpu_clock(this);
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
- return si_cpu;
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
- continue;
- if (available_idle_cpu(cpu))
+ return -1;
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
break;
- if (si_cpu == -1 && sched_idle_cpu(cpu))
- si_cpu = cpu;
}
time = cpu_clock(this) - time;
@@ -6268,9 +6284,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- /* Skip CPUs that will be overutilized. */
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
+ spare_cap = cpu_cap - util;
+
+ /*
+ * Skip CPUs that cannot satisfy the capacity request.
+ * IOW, placing the task there would make the CPU
+ * overutilized. Take uclamp into account to see how
+ * much capacity we can get out of the CPU; this is
+ * aligned with schedutil_cpu_util().
+ */
+ util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
if (!fits_capacity(util, cpu_cap))
continue;
@@ -6285,7 +6310,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* Find the CPU with the maximum spare capacity in
* the performance domain
*/
- spare_cap = cpu_cap - util;
if (spare_cap > max_spare_cap) {
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
@@ -7780,29 +7804,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/
for_each_cpu(cpu, sched_group_span(sdg)) {
- struct sched_group_capacity *sgc;
- struct rq *rq = cpu_rq(cpu);
+ unsigned long cpu_cap = capacity_of(cpu);
- /*
- * build_sched_domains() -> init_sched_groups_capacity()
- * gets here before we've attached the domains to the
- * runqueues.
- *
- * Use capacity_of(), which is set irrespective of domains
- * in update_cpu_capacity().
- *
- * This avoids capacity from being 0 and
- * causing divide-by-zero issues on boot.
- */
- if (unlikely(!rq->sd)) {
- capacity += capacity_of(cpu);
- } else {
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
- }
-
- min_capacity = min(capacity, min_capacity);
- max_capacity = max(capacity, max_capacity);
+ capacity += cpu_cap;
+ min_capacity = min(cpu_cap, min_capacity);
+ max_capacity = max(cpu_cap, max_capacity);
}
} else {
/*
@@ -8168,14 +8174,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_has_spare:
/*
- * Select not overloaded group with lowest number of
- * idle cpus. We could also compare the spare capacity
- * which is more stable but it can end up that the
- * group has less spare capacity but finally more idle
+ * Select not overloaded group with lowest number of idle cpus
+ * and highest number of running tasks. We could also compare
+ * the spare capacity which is more stable but it can end up
+ * that the group has less spare capacity but finally more idle
* CPUs which means less opportunity to pull tasks.
*/
- if (sgs->idle_cpus >= busiest->idle_cpus)
+ if (sgs->idle_cpus > busiest->idle_cpus)
+ return false;
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
return false;
+
break;
}
@@ -9529,6 +9539,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -9565,7 +9576,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
@@ -9581,9 +9592,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
spin_unlock(&balancing);
@@ -10333,6 +10345,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
+ if (rq->cfs.nr_running == 1)
+ return;
+
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
@@ -10423,7 +10438,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
- attach_entity_load_avg(cfs_rq, se, 0);
+ attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ffa959e91227..b743bf38f08f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -158,7 +158,7 @@ static void cpuidle_idle_call(void)
/*
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
- * activity happens here and in iterrupts (if any). In that case bypass
+ * activity happens here and in interrupts (if any). In that case bypass
* the cpuidle governor and go stratight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index a96db50d40e0..bd006b79b360 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -129,8 +129,20 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
* Step 2
*/
delta %= 1024;
- contrib = __accumulate_pelt_segments(periods,
- 1024 - sa->period_contrib, delta);
+ if (load) {
+ /*
+ * This relies on the:
+ *
+ * if (!load)
+ * runnable = running = 0;
+ *
+ * clause from ___update_load_sum(); this results in
+ * the below usage of @contrib to dissapear entirely,
+ * so no point in calculating it.
+ */
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
}
sa->period_contrib = delta;
@@ -205,7 +217,9 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
* this happens during idle_balance() which calls
- * update_blocked_averages()
+ * update_blocked_averages().
+ *
+ * Also see the comment in accumulate_sum().
*/
if (!load)
runnable = running = 0;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ce8f6748678a..db7b50bba3f1 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1280,10 +1280,12 @@ static const struct file_operations psi_cpu_fops = {
static int __init psi_proc_init(void)
{
- proc_mkdir("pressure", NULL);
- proc_create("pressure/io", 0, NULL, &psi_io_fops);
- proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
- proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+ if (psi_enable) {
+ proc_mkdir("pressure", NULL);
+ proc_create("pressure/io", 0, NULL, &psi_io_fops);
+ proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
+ proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+ }
return 0;
}
module_init(psi_proc_init);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e591d40fd645..4043abe45459 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -437,6 +437,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->on_rq;
}
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the uclamp
+ * settings.
+ *
+ * This check is only important for heterogeneous systems where uclamp_min value
+ * is higher than the capacity of a @cpu. For non-heterogeneous system this
+ * function will always return true.
+ *
+ * The function will return true if the capacity of the @cpu is >= the
+ * uclamp_min and false otherwise.
+ *
+ * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
+ * > uclamp_max.
+ */
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned int min_cap;
+ unsigned int max_cap;
+ unsigned int cpu_cap;
+
+ /* Only heterogeneous systems can benefit from this check */
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return true;
+
+ min_cap = uclamp_eff_value(p, UCLAMP_MIN);
+ max_cap = uclamp_eff_value(p, UCLAMP_MAX);
+
+ cpu_cap = capacity_orig_of(cpu);
+
+ return cpu_cap >= min(min_cap, max_cap);
+}
+#else
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ return true;
+}
+#endif
+
#ifdef CONFIG_RT_GROUP_SCHED
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
@@ -1391,6 +1430,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
+ bool test;
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1422,10 +1462,16 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
*
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
+ *
+ * We take into account the capacity of the CPU to ensure it fits the
+ * requirement of the task - which is only important on heterogeneous
+ * systems like big.LITTLE.
*/
- if (curr && unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ test = curr &&
+ unlikely(rt_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
+
+ if (test || !rt_task_fits_capacity(p, cpu)) {
int target = find_lowest_rq(p);
/*
@@ -1449,15 +1495,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL))
return;
/*
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (p->nr_cpus_allowed != 1
- && cpupri_find(&rq->rd->cpupri, p, NULL))
+ if (p->nr_cpus_allowed != 1 &&
+ cpupri_find(&rq->rd->cpupri, p, NULL, NULL))
return;
/*
@@ -1601,7 +1647,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, p->cpus_ptr))
+ cpumask_test_cpu(cpu, p->cpus_ptr) &&
+ rt_task_fits_capacity(p, cpu))
return 1;
return 0;
@@ -1643,7 +1690,8 @@ static int find_lowest_rq(struct task_struct *task)
if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
- if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask,
+ rt_task_fits_capacity))
return -1; /* No targets found */
/*
@@ -2147,12 +2195,14 @@ skip:
*/
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
- !test_tsk_need_resched(rq->curr) &&
- p->nr_cpus_allowed > 1 &&
- (dl_task(rq->curr) || rt_task(rq->curr)) &&
- (rq->curr->nr_cpus_allowed < 2 ||
- rq->curr->prio <= p->prio))
+ bool need_to_push = !task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ p->nr_cpus_allowed > 1 &&
+ (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ rq->curr->prio <= p->prio);
+
+ if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq)))
push_rt_tasks(rq);
}
@@ -2224,7 +2274,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+ bool need_to_push = rq->rt.overloaded ||
+ !rt_task_fits_capacity(p, cpu_of(rq));
+
+ if (p->nr_cpus_allowed > 1 && need_to_push)
rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 280a3c735935..1a88dc8ad11b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2300,14 +2300,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
-unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static __always_inline
-unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
- struct task_struct *p)
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
{
- unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
- unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+ unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
+ unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
if (p) {
min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
@@ -2324,18 +2324,10 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
return clamp(util, min_util, max_util);
}
-
-static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
-{
- return uclamp_util_with(rq, util, NULL);
-}
#else /* CONFIG_UCLAMP_TASK */
-static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
- struct task_struct *p)
-{
- return util;
-}
-static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+static inline
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
{
return util;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6ec1e595b1d4..dfb64c08a407 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1880,6 +1880,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
}
/*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, int cpu)
+{
+ int i;
+
+ /* NUMA levels are allowed to overlap */
+ if (tl->flags & SDTL_OVERLAP)
+ return true;
+
+ /*
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
+ */
+ for_each_cpu(i, cpu_map) {
+ if (i == cpu)
+ continue;
+ /*
+ * We should 'and' all those masks with 'cpu_map' to exactly
+ * match the topology we're about to build, but that can only
+ * remove CPUs, which only lessens our ability to detect
+ * overlaps
+ */
+ if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+ cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+ return false;
+ }
+
+ return true;
+}
+
+/*
* Find the sched_domain_topology_level where all CPU capacities are visible
* for all CPUs.
*/
@@ -1975,6 +2011,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
has_asym = true;
}
+ if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+ goto error;
+
sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
if (tl == sched_domain_topology)
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 45eba18a2898..02ce292b9bc0 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -179,6 +179,7 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int
.bit_nr = -1,
},
.wq_entry = {
+ .flags = flags,
.private = current,
.func = var_wake_function,
.entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),