path: root/kernel/sched/fair.c
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--   kernel/sched/fair.c   548
1 files changed, 388 insertions, 160 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 20aa234ffe04..fb469b26b00a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -268,33 +268,11 @@ const struct sched_class fair_sched_class;
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- SCHED_WARN_ON(!entity_is_task(se));
- return container_of(se, struct task_struct, se);
-}
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return grp->my_q;
-}
-
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
if (!path)
@@ -455,33 +433,9 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#else /* !CONFIG_FAIR_GROUP_SCHED */
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- return container_of(se, struct task_struct, se);
-}
-
#define for_each_sched_entity(se) \
for (; se; se = NULL)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return &task_rq(p)->cfs;
-}
-
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- struct task_struct *p = task_of(se);
- struct rq *rq = task_rq(p);
-
- return &rq->cfs;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return NULL;
-}
-
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
if (path)
@@ -1039,11 +993,14 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
+ unsigned int state;
- if (tsk->state & TASK_INTERRUPTIBLE)
+ /* XXX racy against TTWU */
+ state = READ_ONCE(tsk->__state);
+ if (state & TASK_INTERRUPTIBLE)
__schedstat_set(se->statistics.sleep_start,
rq_clock(rq_of(cfs_rq)));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
+ if (state & TASK_UNINTERRUPTIBLE)
__schedstat_set(se->statistics.block_start,
rq_clock(rq_of(cfs_rq)));
}
@@ -1107,7 +1064,7 @@ struct numa_group {
static struct numa_group *deref_task_numa_group(struct task_struct *p)
{
return rcu_dereference_check(p->numa_group, p == current ||
- (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+ (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
}
static struct numa_group *deref_curr_numa_group(struct task_struct *p)
@@ -3139,7 +3096,7 @@ void reweight_task(struct task_struct *p, int prio)
*
*                     tg->weight * grq->load.weight
*   ge->load.weight = -----------------------------              (1)
- *                       \Sum grq->load.weight
+ *                       \Sum grq->load.weight
*
* Now, because computing that sum is prohibitively expensive to compute (been
* there, done that) we approximate it with this average stuff. The average
@@ -3153,7 +3110,7 @@ void reweight_task(struct task_struct *p, int prio)
*
*                     tg->weight * grq->avg.load_avg
*   ge->load.weight = ------------------------------             (3)
- *                             tg->load_avg
+ *                             tg->load_avg
*
* Where: tg->load_avg ~= \Sum grq->avg.load_avg
*
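
A rough worked example of approximation (3) above (numbers invented for illustration, not taken from the patch): with tg->weight = 1024 and two runqueues contributing grq->avg.load_avg of 512 and 256, tg->load_avg ~= 768, so the group entities get roughly 682 and 341, preserving the 2:1 load ratio. A minimal userspace sketch of that arithmetic:

#include <stdio.h>

/* Stand-alone model of eq. (3); not kernel code. */
static unsigned long ge_weight(unsigned long tg_weight,
                               unsigned long grq_load_avg,
                               unsigned long tg_load_avg)
{
    return tg_weight * grq_load_avg / tg_load_avg;
}

int main(void)
{
    unsigned long tg_weight = 1024, load[2] = { 512, 256 };
    unsigned long tg_load_avg = load[0] + load[1];

    for (int i = 0; i < 2; i++)
        printf("cpu%d: ge->load.weight ~= %lu\n", i,
               ge_weight(tg_weight, load[i], tg_load_avg));
    return 0;
}
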
@@ -3169,7 +3126,7 @@ void reweight_task(struct task_struct *p, int prio)
*
*                     tg->weight * grq->load.weight
*   ge->load.weight = ----------------------------- = tg->weight (4)
- *                        grp->load.weight
+ *                        grp->load.weight
*
* That is, the sum collapses because all other CPUs are idle; the UP scenario.
*
@@ -3188,7 +3145,7 @@ void reweight_task(struct task_struct *p, int prio)
*
*                     tg->weight * grq->load.weight
*   ge->load.weight = -----------------------------              (6)
- *                            tg_load_avg'
+ *                            tg_load_avg'
*
* Where:
*
@@ -3298,6 +3255,61 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
#ifdef CONFIG_SMP
#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
+ * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
+ * bottom-up, we only have to test whether the cfs_rq before us on the list
+ * is our child.
+ * If cfs_rq is not on the list, test whether a child needs to be added to
+ * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
+ */
+static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
+{
+ struct cfs_rq *prev_cfs_rq;
+ struct list_head *prev;
+
+ if (cfs_rq->on_list) {
+ prev = cfs_rq->leaf_cfs_rq_list.prev;
+ } else {
+ struct rq *rq = rq_of(cfs_rq);
+
+ prev = rq->tmp_alone_branch;
+ }
+
+ prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
+
+ return (prev_cfs_rq->tg->parent == cfs_rq->tg);
+}
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load.weight)
+ return false;
+
+ if (cfs_rq->avg.load_sum)
+ return false;
+
+ if (cfs_rq->avg.util_sum)
+ return false;
+
+ if (cfs_rq->avg.runnable_sum)
+ return false;
+
+ if (child_cfs_rq_on_list(cfs_rq))
+ return false;
+
+ /*
+ * _avg must be null when _sum is null because _avg = _sum / divider.
+ * Make sure that rounding and/or propagation of PELT values never
+ * break this.
+ */
+ SCHED_WARN_ON(cfs_rq->avg.load_avg ||
+ cfs_rq->avg.util_avg ||
+ cfs_rq->avg.runnable_avg);
+
+ return true;
+}
+
/**
* update_tg_load_avg - update the tg's load avg
* @cfs_rq: the cfs_rq whose avg changed
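
For intuition on the child_cfs_rq_on_list() helper added above (my reading of its comment, not part of the patch): with a hierarchy root <- A <- A1 fully enqueued, list_add_leaf_cfs_rq() keeps the leaf list ordered A1, A, root, children always sitting immediately before their parents. A is therefore only treated as decayed if, in addition to its load and PELT sums being zero, the entry just before it is not one of its children; and if A is not on the list at all, the same check is applied at rq->tmp_alone_branch, where a partially built child branch would be spliced in.
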
@@ -3499,10 +3511,9 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
unsigned long load_avg;
u64 load_sum = 0;
- s64 delta_sum;
u32 divider;
if (!runnable_sum)
@@ -3549,13 +3560,16 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
load_sum = (s64)se_weight(se) * runnable_sum;
load_avg = div_s64(load_sum, divider);
- delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
- delta_avg = load_avg - se->avg.load_avg;
-
se->avg.load_sum = runnable_sum;
+
+ delta = load_avg - se->avg.load_avg;
+ if (!delta)
+ return;
+
se->avg.load_avg = load_avg;
- add_positive(&cfs_rq->avg.load_avg, delta_avg);
- add_positive(&cfs_rq->avg.load_sum, delta_sum);
+
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
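
A toy model of why the hunk above re-derives load_sum from load_avg instead of subtracting a delta_sum (userspace sketch with made-up numbers; divider stands in for the PELT divider): independent rounding on the two fields can leave a residue in *_sum after *_avg has reached zero, which is exactly what the SCHED_WARN_ON() in cfs_rq_is_decayed() would trip on. Re-computing the sum as avg * divider keeps _avg = _sum / divider true by construction.

#include <stdio.h>

int main(void)
{
    const unsigned long long divider = 47742;   /* stand-in for the PELT divider */
    unsigned long long removed_avg = 100;

    /* The rq accumulated slightly more _sum than _avg * divider
     * (rounding / period_contrib effects). */
    unsigned long long avg = 100, sum = 100 * divider + 500;

    avg -= removed_avg;

    unsigned long long old_sum = sum - removed_avg * divider;  /* subtract an estimate */
    unsigned long long new_sum = avg * divider;                /* re-sync from avg */

    printf("avg=%llu old_sum=%llu new_sum=%llu\n", avg, old_sum, new_sum);
    /* avg=0 old_sum=500 new_sum=0 */
    return 0;
}
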
@@ -3671,15 +3685,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
r = removed_load;
sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * divider);
+ sa->load_sum = sa->load_avg * divider;
r = removed_util;
sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * divider);
+ sa->util_sum = sa->util_avg * divider;
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
- sub_positive(&sa->runnable_sum, r * divider);
+ sa->runnable_sum = sa->runnable_avg * divider;
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -3766,11 +3780,17 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
+
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -3902,7 +3922,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
- return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+ return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
}
static inline unsigned long task_util_est(struct task_struct *p)
@@ -4002,7 +4022,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
* Reset EWMA on utilization increases, the moving average is used only
* to smooth utilization decreases.
*/
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+ ue.enqueued = task_util(p);
if (sched_feat(UTIL_EST_FASTUP)) {
if (ue.ewma < ue.enqueued) {
ue.ewma = ue.enqueued;
@@ -4051,6 +4071,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
ue.ewma += last_ewma_diff;
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
+ ue.enqueued |= UTIL_AVG_UNCHANGED;
WRITE_ONCE(p->se.avg.util_est, ue);
trace_sched_util_est_se_tp(&p->se);
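
For context on the UTIL_AVG_UNCHANGED handling above (assuming, as in kernel/sched/pelt.h, that the flag lives in the least significant bit of util_est.enqueued): readers now mask the bit out and the writer sets it once after the EWMA update, so the flag no longer inflates the estimate. A minimal sketch of the bit handling, with the flag value assumed to be 0x1:

#include <stdio.h>

#define UTIL_AVG_UNCHANGED 0x1    /* assumed LSB flag, as in kernel/sched/pelt.h */

int main(void)
{
    unsigned int enqueued = 300;

    enqueued |= UTIL_AVG_UNCHANGED;                          /* set once at store time */
    unsigned int estimate = enqueued & ~UTIL_AVG_UNCHANGED;  /* strip it when reading */

    printf("stored=%u estimate=%u\n", enqueued, estimate);   /* stored=301 estimate=300 */
    return 0;
}
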
@@ -4085,6 +4106,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
#else /* CONFIG_SMP */
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ return true;
+}
+
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
#define DO_ATTACH 0x0
@@ -4419,6 +4445,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ clear_buddies(cfs_rq, se);
+
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
@@ -4478,7 +4506,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
- if (cfs_rq->skip == se) {
+ if (cfs_rq->skip && cfs_rq->skip == se) {
struct sched_entity *second;
if (se == curr) {
@@ -4505,8 +4533,6 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
se = cfs_rq->last;
}
- clear_buddies(cfs_rq, se);
-
return se;
}
@@ -4628,8 +4654,11 @@ static inline u64 sched_cfs_bandwidth_slice(void)
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
- if (cfs_b->quota != RUNTIME_INF)
- cfs_b->runtime = cfs_b->quota;
+ if (unlikely(cfs_b->quota == RUNTIME_INF))
+ return;
+
+ cfs_b->runtime += cfs_b->quota;
+ cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
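
A rough userspace model of the refill rule added above (sketch only; quota/burst/runtime mirror the fields in the patch, the numbers are made up, and the RUNTIME_INF early return is omitted): unused runtime can now carry over between periods, but never beyond quota + burst.

#include <stdio.h>

struct bw_model {
    long long quota, burst, runtime;
};

static void refill(struct bw_model *b)
{
    b->runtime += b->quota;
    if (b->runtime > b->quota + b->burst)
        b->runtime = b->quota + b->burst;
}

int main(void)
{
    struct bw_model b = { .quota = 100, .burst = 50, .runtime = 0 };

    refill(&b);          /* period 1: runtime = 100 */
    b.runtime -= 30;     /* the group only consumed 30 */
    refill(&b);          /* period 2: 70 + 100, clamped to 150 */
    printf("runtime after two periods: %lld\n", b.runtime);  /* 150 */
    return 0;
}
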
@@ -4743,8 +4772,8 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
- /* Add cfs_rq with already running entity in the list */
- if (cfs_rq->nr_running >= 1)
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
list_add_leaf_cfs_rq(cfs_rq);
}
@@ -4990,6 +5019,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
cfs_b->nr_periods += overrun;
+ /* Refill extra burst quota even if cfs_b->idle */
+ __refill_cfs_bandwidth_runtime(cfs_b);
+
/*
* idle depends on !throttled (for the case of a large deficit), and if
* we're going inactive then everything else can be deferred
@@ -4997,8 +5029,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
if (cfs_b->idle && !throttled)
goto out_deactivate;
- __refill_cfs_bandwidth_runtime(cfs_b);
-
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
@@ -5248,6 +5278,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (new < max_cfs_quota_period) {
cfs_b->period = ns_to_ktime(new);
cfs_b->quota *= 2;
+ cfs_b->burst *= 2;
pr_warn_ratelimited(
"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
@@ -5279,6 +5310,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
+ cfs_b->burst = 0;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
@@ -5328,7 +5360,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5347,7 +5379,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5935,11 +5967,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ struct rq *rq = cpu_rq(i);
+
+ if (!sched_core_cookie_match(rq, p))
+ continue;
+
if (sched_idle_cpu(i))
return i;
if (available_idle_cpu(i)) {
- struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
/*
@@ -6025,9 +6061,10 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return new_cpu;
}
-static inline int __select_idle_cpu(int cpu)
+static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
+ sched_cpu_cookie_match(cpu_rq(cpu), p))
return cpu;
return -1;
@@ -6097,7 +6134,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
int cpu;
if (!static_branch_likely(&sched_smt_present))
- return __select_idle_cpu(core);
+ return __select_idle_cpu(core, p);
for_each_cpu(cpu, cpu_smt_mask(core)) {
if (!available_idle_cpu(cpu)) {
@@ -6153,7 +6190,7 @@ static inline bool test_idle_cores(int cpu, bool def)
static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
- return __select_idle_cpu(core);
+ return __select_idle_cpu(core, p);
}
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
@@ -6172,9 +6209,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
+ struct rq *this_rq = this_rq();
int this = smp_processor_id();
struct sched_domain *this_sd;
- u64 time;
+ u64 time = 0;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6184,12 +6222,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
if (sched_feat(SIS_PROP) && !has_idle_core) {
u64 avg_cost, avg_idle, span_avg;
+ unsigned long now = jiffies;
/*
- * Due to large variance we need a large fuzz factor;
- * hackbench in particularly is sensitive here.
+ * If we're busy, the assumption that the last idle period
+ * predicts the future is flawed; age away the remaining
+ * predicted idle time.
*/
- avg_idle = this_rq()->avg_idle / 512;
+ if (unlikely(this_rq->wake_stamp < now)) {
+ while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
+ this_rq->wake_stamp++;
+ this_rq->wake_avg_idle >>= 1;
+ }
+ }
+
+ avg_idle = this_rq->wake_avg_idle;
avg_cost = this_sd->avg_scan_cost + 1;
span_avg = sd->span_weight * avg_idle;
@@ -6210,17 +6257,24 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
} else {
if (!--nr)
return -1;
- idle_cpu = __select_idle_cpu(cpu);
+ idle_cpu = __select_idle_cpu(cpu, p);
if ((unsigned int)idle_cpu < nr_cpumask_bits)
break;
}
}
if (has_idle_core)
- set_idle_cores(this, false);
+ set_idle_cores(target, false);
if (sched_feat(SIS_PROP) && !has_idle_core) {
time = cpu_clock(this) - time;
+
+ /*
+ * Account for the scan cost of wakeups against the average
+ * idle time.
+ */
+ this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
+
update_avg(&this_sd->avg_scan_cost, time);
}
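
My reading of the SIS_PROP changes above, as a stand-alone sketch (variable names are only borrowed for illustration): while the CPU stays busy, the remembered wake_avg_idle is halved once per jiffy since wake_stamp, and each wakeup scan then consumes part of whatever prediction remains, so back-to-back wakeups on a busy CPU scan progressively fewer CPUs.

#include <stdio.h>

int main(void)
{
    unsigned long wake_stamp = 100, now = 104;     /* 4 jiffies behind */
    unsigned long long wake_avg_idle = 80000;      /* predicted idle time, ns */

    /* Age the prediction: halve once per elapsed jiffy while it is non-zero. */
    while (wake_stamp < now && wake_avg_idle) {
        wake_stamp++;
        wake_avg_idle >>= 1;
    }
    printf("aged wake_avg_idle = %llu\n", wake_avg_idle);      /* 80000 >> 4 = 5000 */

    /* Charge the cost of this scan against the prediction. */
    unsigned long long scan_time = 3000;
    wake_avg_idle -= scan_time < wake_avg_idle ? scan_time : wake_avg_idle;
    printf("after charging the scan = %llu\n", wake_avg_idle); /* 2000 */
    return 0;
}
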
@@ -6288,6 +6342,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
task_util = uclamp_task_util(p);
}
+ /*
+ * per-cpu select_idle_mask usage
+ */
+ lockdep_assert_irqs_disabled();
+
if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
asym_fits_capacity(task_util, target))
return target;
@@ -6563,8 +6622,11 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
struct cpumask *pd_mask = perf_domain_span(pd);
unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
unsigned long max_util = 0, sum_util = 0;
+ unsigned long _cpu_cap = cpu_cap;
int cpu;
+ _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
+
/*
* The capacity state of CPUs of the current rd can be driven by CPUs
* of another rd if they belong to the same pd. So, account for the
@@ -6600,8 +6662,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* is already enough to scale the EM reported power
* consumption at the (eventually clamped) cpu_capacity.
*/
- sum_util += effective_cpu_util(cpu, util_running, cpu_cap,
- ENERGY_UTIL, NULL);
+ cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
+ ENERGY_UTIL, NULL);
+
+ sum_util += min(cpu_util, _cpu_cap);
/*
* Performance domain frequency: utilization clamping
@@ -6612,10 +6676,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
*/
cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
+ max_util = max(max_util, min(cpu_util, _cpu_cap));
}
- return em_cpu_energy(pd->em_pd, max_util, sum_util);
+ return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
}
/*
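
To make the new _cpu_cap clamping above concrete (illustrative numbers only): if the first CPU of the performance domain has an arch capacity of 1024 and a thermal pressure of 224, _cpu_cap is 800, so a raw effective_cpu_util() of 900 contributes only min(900, 800) = 800 to sum_util and max_util, and the same 800 is handed to em_cpu_energy() as the allowed capacity, so, as I read it, the energy estimate is not based on a frequency the throttled CPUs cannot currently reach.
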
@@ -6661,15 +6725,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ int cpu, best_energy_cpu = prev_cpu, target = -1;
unsigned long cpu_cap, util, base_energy = 0;
- int cpu, best_energy_cpu = prev_cpu;
struct sched_domain *sd;
struct perf_domain *pd;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
if (!pd || READ_ONCE(rd->overutilized))
- goto fail;
+ goto unlock;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6679,7 +6743,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
- goto fail;
+ goto unlock;
+
+ target = prev_cpu;
sync_entity_load_avg(&p->se);
if (!task_util_est(p))
@@ -6687,13 +6753,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ bool compute_prev_delta = false;
unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
- /* Compute the 'base' energy of the pd, without @p */
- base_energy_pd = compute_energy(p, -1, pd);
- base_energy += base_energy_pd;
-
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
@@ -6714,26 +6777,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!fits_capacity(util, cpu_cap))
continue;
- /* Always use prev_cpu as a candidate. */
if (cpu == prev_cpu) {
- prev_delta = compute_energy(p, prev_cpu, pd);
- prev_delta -= base_energy_pd;
- best_delta = min(best_delta, prev_delta);
- }
-
- /*
- * Find the CPU with the maximum spare capacity in
- * the performance domain
- */
- if (spare_cap > max_spare_cap) {
+ /* Always use prev_cpu as a candidate. */
+ compute_prev_delta = true;
+ } else if (spare_cap > max_spare_cap) {
+ /*
+ * Find the CPU with the maximum spare capacity
+ * in the performance domain.
+ */
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
}
}
- /* Evaluate the energy impact of using this CPU. */
- if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
+ if (max_spare_cap_cpu < 0 && !compute_prev_delta)
+ continue;
+
+ /* Compute the 'base' energy of the pd, without @p */
+ base_energy_pd = compute_energy(p, -1, pd);
+ base_energy += base_energy_pd;
+
+ /* Evaluate the energy impact of using prev_cpu. */
+ if (compute_prev_delta) {
+ prev_delta = compute_energy(p, prev_cpu, pd);
+ if (prev_delta < base_energy_pd)
+ goto unlock;
+ prev_delta -= base_energy_pd;
+ best_delta = min(best_delta, prev_delta);
+ }
+
+ /* Evaluate the energy impact of using max_spare_cap_cpu. */
+ if (max_spare_cap_cpu >= 0) {
cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+ if (cur_delta < base_energy_pd)
+ goto unlock;
cur_delta -= base_energy_pd;
if (cur_delta < best_delta) {
best_delta = cur_delta;
@@ -6741,25 +6818,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
}
}
-unlock:
rcu_read_unlock();
/*
* Pick the best CPU if prev_cpu cannot be used, or if it saves at
* least 6% of the energy used by prev_cpu.
*/
- if (prev_delta == ULONG_MAX)
- return best_energy_cpu;
-
- if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
- return best_energy_cpu;
+ if ((prev_delta == ULONG_MAX) ||
+ (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
+ target = best_energy_cpu;
- return prev_cpu;
+ return target;
-fail:
+unlock:
rcu_read_unlock();
- return -1;
+ return target;
}
/*
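
A quick sanity check on the '6%' comment above: the shift by 4 divides by 16, so (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4) requires the candidate to save more than 1/16 ~= 6.25% of the total energy estimated for leaving the task on prev_cpu. With made-up numbers base_energy = 1000, prev_delta = 200 and best_delta = 100, the saving of 100 beats the threshold of 1200 / 16 = 75 and the best-energy CPU wins; with best_delta = 150 the saving of 50 falls short and target stays prev_cpu.
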
@@ -6771,8 +6845,6 @@ fail:
* certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
*
* Returns the target CPU number.
- *
- * preempt must be disabled.
*/
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
@@ -6785,6 +6857,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
/* SD_flags and WF_flags share the first nibble */
int sd_flag = wake_flags & 0xF;
+ /*
+ * required for stable ->cpus_allowed
+ */
+ lockdep_assert_held(&p->pi_lock);
if (wake_flags & WF_TTWU) {
record_wakee(p);
@@ -6849,7 +6925,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* min_vruntime -- the latter is done by enqueue_entity() when placing
* the task on the new runqueue.
*/
- if (p->state == TASK_WAKING) {
+ if (READ_ONCE(p->__state) == TASK_WAKING) {
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 min_vruntime;
@@ -6874,7 +6950,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
* rq->lock and can modify state directly.
*/
- lockdep_assert_held(&task_rq(p)->lock);
+ lockdep_assert_rq_held(task_rq(p));
detach_entity_cfs_rq(&p->se);
} else {
@@ -7078,6 +7154,39 @@ preempt:
set_last_buddy(se);
}
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_fair(struct rq *rq)
+{
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+
+again:
+ cfs_rq = &rq->cfs;
+ if (!cfs_rq->nr_running)
+ return NULL;
+
+ do {
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
+ if (curr) {
+ if (curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto again;
+ }
+
+ se = pick_next_entity(cfs_rq, curr);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ return task_of(se);
+}
+#endif
+
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -7501,7 +7610,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
if (p->sched_class != &fair_sched_class)
return 0;
@@ -7523,6 +7632,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (sysctl_sched_migration_cost == -1)
return 1;
+
+ /*
+ * Don't migrate task if the task's cookie does not match
+ * with the destination CPU's core cookie.
+ */
+ if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
+ return 1;
+
if (sysctl_sched_migration_cost == 0)
return 0;
@@ -7599,7 +7716,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
/*
* We do not migrate tasks that are:
@@ -7688,7 +7805,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
*/
static void detach_task(struct task_struct *p, struct lb_env *env)
{
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
@@ -7704,7 +7821,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
list_for_each_entry_reverse(p,
&env->src_rq->cfs_tasks, se.group_node) {
@@ -7740,7 +7857,7 @@ static int detach_tasks(struct lb_env *env)
struct task_struct *p;
int detached = 0;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
/*
* Source run queue has been emptied by another CPU, clear
@@ -7870,7 +7987,7 @@ next:
*/
static void attach_task(struct rq *rq, struct task_struct *p)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
BUG_ON(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
@@ -7990,23 +8107,6 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load.weight)
- return false;
-
- if (cfs_rq->avg.load_sum)
- return false;
-
- if (cfs_rq->avg.util_sum)
- return false;
-
- if (cfs_rq->avg.runnable_sum)
- return false;
-
- return true;
-}
-
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct cfs_rq *cfs_rq, *pos;
@@ -8030,7 +8130,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
- update_load_avg(cfs_rq_of(se), se, 0);
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
@@ -8853,6 +8953,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
p->cpus_ptr))
continue;
+ /* Skip over this group if no cookie matched */
+ if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
+ continue;
+
local_group = cpumask_test_cpu(this_cpu,
sched_group_span(group));
@@ -9781,7 +9885,7 @@ more_balance:
if (need_active_balance(&env)) {
unsigned long flags;
- raw_spin_lock_irqsave(&busiest->lock, flags);
+ raw_spin_rq_lock_irqsave(busiest, flags);
/*
* Don't kick the active_load_balance_cpu_stop,
@@ -9789,8 +9893,7 @@ more_balance:
* moved to this_cpu:
*/
if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
- raw_spin_unlock_irqrestore(&busiest->lock,
- flags);
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
goto out_one_pinned;
}
@@ -9807,7 +9910,7 @@ more_balance:
busiest->push_cpu = this_cpu;
active_balance = 1;
}
- raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
@@ -10592,6 +10695,14 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
u64 curr_cost = 0;
update_misfit_status(NULL, this_rq);
+
+ /*
+ * There is a task waiting to run. No need to search for one.
+ * Return 0; the task will be enqueued when switching to idle.
+ */
+ if (this_rq->ttwu_pending)
+ return 0;
+
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -10624,7 +10735,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
goto out;
}
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
update_blocked_averages(this_cpu);
rcu_read_lock();
@@ -10657,12 +10768,13 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
- if (pulled_task || this_rq->nr_running > 0)
+ if (pulled_task || this_rq->nr_running > 0 ||
+ this_rq->ttwu_pending)
break;
}
rcu_read_unlock();
- raw_spin_lock(&this_rq->lock);
+ raw_spin_rq_lock(this_rq);
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
@@ -10755,6 +10867,119 @@ static void rq_offline_fair(struct rq *rq)
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_CORE
+static inline bool
+__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
+{
+ u64 slice = sched_slice(cfs_rq_of(se), se);
+ u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+
+ return (rtime * min_nr_tasks > slice);
+}
+
+#define MIN_NR_TASKS_DURING_FORCEIDLE 2
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ /*
+ * If runqueue has only one task which used up its slice and
+ * if the sibling is forced idle, then trigger schedule to
+ * give forced idle task a chance.
+ *
+ * sched_slice() considers only this active rq and it gets the
+ * whole slice. But during force idle, we have siblings acting
+ * like a single runqueue and hence we need to consider runnable
+ * tasks on this CPU and the forced idle CPU. Ideally, we should
+ * go through the forced idle rq, but that would be a perf hit.
+ * We can assume that the forced idle CPU has at least
+ * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
+ * if we need to give up the CPU.
+ */
+ if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+ __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
+ resched_curr(rq);
+}
+
+/*
+ * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ */
+static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (forceidle) {
+ if (cfs_rq->forceidle_seq == fi_seq)
+ break;
+ cfs_rq->forceidle_seq = fi_seq;
+ }
+
+ cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ }
+}
+
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
+{
+ struct sched_entity *se = &p->se;
+
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+{
+ struct rq *rq = task_rq(a);
+ struct sched_entity *sea = &a->se;
+ struct sched_entity *seb = &b->se;
+ struct cfs_rq *cfs_rqa;
+ struct cfs_rq *cfs_rqb;
+ s64 delta;
+
+ SCHED_WARN_ON(task_rq(b)->core != rq->core);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Find an se in the hierarchy for tasks a and b, such that the se's
+ * are immediate siblings.
+ */
+ while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
+ int sea_depth = sea->depth;
+ int seb_depth = seb->depth;
+
+ if (sea_depth >= seb_depth)
+ sea = parent_entity(sea);
+ if (sea_depth <= seb_depth)
+ seb = parent_entity(seb);
+ }
+
+ se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
+ se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
+
+ cfs_rqa = sea->cfs_rq;
+ cfs_rqb = seb->cfs_rq;
+#else
+ cfs_rqa = &task_rq(a)->cfs;
+ cfs_rqb = &task_rq(b)->cfs;
+#endif
+
+ /*
+ * Find delta after normalizing se's vruntime with its cfs_rq's
+ * min_vruntime_fi, which would have been updated in prior calls
+ * to se_fi_update().
+ */
+ delta = (s64)(sea->vruntime - seb->vruntime) +
+ (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+
+ return delta > 0;
+}
+#else
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
+#endif
+
/*
* scheduler tick hitting a task of our scheduling class.
*
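
A worked example of the normalized comparison in cfs_prio_less() above (numbers invented): suppose task a sits on a sibling runqueue whose matching cfs_rq has min_vruntime_fi = 1000 and a's vruntime is 1300, while b's cfs_rq has min_vruntime_fi = 5000 and b's vruntime is 5200. Then delta = (1300 - 5200) + (5000 - 1000) = 100 > 0: relative to its own queue, a has received more service than b, so the function returns true (a is the lower-priority pick), even though the raw vruntimes are not comparable across runqueues.
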
@@ -10778,6 +11003,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
+
+ task_tick_core(rq, curr);
}
/*
@@ -10863,7 +11090,7 @@ static inline bool vruntime_normalized(struct task_struct *p)
* waiting for actually being woken up by sched_ttwu_pending().
*/
if (!se->sum_exec_runtime ||
- (p->state == TASK_WAKING && p->sched_remote_wakeup))
+ (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
return true;
return false;
@@ -11149,9 +11376,9 @@ void unregister_fair_sched_group(struct task_group *tg)
rq = cpu_rq(cpu);
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
}
}
@@ -11273,6 +11500,7 @@ DEFINE_SCHED_CLASS(fair) = {
#ifdef CONFIG_SMP
.balance = balance_fair,
+ .pick_task = pick_task_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,