Diffstat (limited to 'kernel/sched/fair.c')
 -rw-r--r-- kernel/sched/fair.c | 1827
 1 file changed, 1070 insertions(+), 757 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e476f6d9435..e4a0b8bd941c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,7 +20,41 @@
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
+#include <linux/energy_model.h>
+#include <linux/mmap_lock.h>
+#include <linux/hugetlb_inline.h>
+#include <linux/jiffies.h>
+#include <linux/mm_api.h>
+#include <linux/highmem.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/nohz.h>
+
+#include <linux/cpuidle.h>
+#include <linux/interrupt.h>
+#include <linux/memory-tiers.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/ratelimit.h>
+#include <linux/task_work.h>
+
+#include <asm/switch_to.h>
+
+#include <linux/sched/cond_resched.h>
+
#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -141,7 +175,37 @@ int __weak arch_asym_cpu_priority(int cpu)
*
* (default: 5 msec, units: microseconds)
*/
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_fair_sysctls[] = {
+ {
+ .procname = "sched_child_runs_first",
+ .data = &sysctl_sched_child_runs_first,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .procname = "sched_cfs_bandwidth_slice_us",
+ .data = &sysctl_sched_cfs_bandwidth_slice,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+#endif
+ {}
+};
+
+static int __init sched_fair_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_fair_sysctls);
+ return 0;
+}
+late_initcall(sched_fair_sysctl_init);
#endif
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -281,19 +345,6 @@ const struct sched_class fair_sched_class;
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (!path)
- return;
-
- if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
- autogroup_path(cfs_rq->tg, path, len);
- else if (cfs_rq && cfs_rq->tg->css.cgroup)
- cgroup_path(cfs_rq->tg->css.cgroup, path, len);
- else
- strlcpy(path, "(null)", len);
-}
-
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -461,12 +512,6 @@ static int se_is_idle(struct sched_entity *se)
#define for_each_sched_entity(se) \
for (; se; se = NULL)
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (path)
- strlcpy(path, "(null)", len);
-}
-
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
return true;
@@ -568,11 +613,8 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
}
/* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+ u64_u32_store(cfs_rq->min_vruntime,
+ max_vruntime(cfs_rq->min_vruntime, vruntime));
}
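The u64_u32_store() above (and its u64_u32_load() counterpart) replaces the open-coded min_vruntime_copy/smp_wmb() pattern removed in this hunk. The helpers live in kernel/sched/sched.h (added elsewhere in this series); a rough sketch of them, reconstructed here for reference rather than taken from this hunk, is:

#ifdef CONFIG_64BIT
# define u64_u32_store_copy(var, copy, val)	(var = val)
# define u64_u32_load_copy(var, copy)		var
#else
# define u64_u32_store_copy(var, copy, val)				\
do {									\
	typeof(val) __val = (val);					\
	var = __val;							\
	/* paired with u64_u32_load_copy(): order var vs. copy */	\
	smp_wmb();							\
	copy = __val;							\
} while (0)
# define u64_u32_load_copy(var, copy)					\
({									\
	u64 __val, __val_copy;						\
	do {								\
		__val_copy = copy;					\
		/* paired with u64_u32_store_copy() */			\
		smp_rmb();						\
		__val = var;						\
	} while (__val != __val_copy);					\
	__val;								\
})
#endif
/* the shadow copy is named <var>_copy */
#define u64_u32_store(var, val)	u64_u32_store_copy(var, var##_copy, val)
#define u64_u32_load(var)	u64_u32_load_copy(var, var##_copy)

On 64-bit kernels both wrappers collapse to a plain store/load; on 32-bit the writer publishes the value before its shadow copy and the reader retries until both agree, which is the same pattern the deleted #ifndef CONFIG_64BIT block open-coded.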
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -758,8 +800,6 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static void attach_entity_cfs_rq(struct sched_entity *se);
-
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
@@ -794,20 +834,6 @@ void post_init_entity_util_avg(struct task_struct *p)
long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
- if (cap > 0) {
- if (cfs_rq->avg.util_avg != 0) {
- sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
- sa->util_avg /= (cfs_rq->avg.load_avg + 1);
-
- if (sa->util_avg > cap)
- sa->util_avg = cap;
- } else {
- sa->util_avg = cap;
- }
- }
-
- sa->runnable_avg = sa->util_avg;
-
if (p->sched_class != &fair_sched_class) {
/*
* For !fair tasks do:
@@ -823,7 +849,19 @@ void post_init_entity_util_avg(struct task_struct *p)
return;
}
- attach_entity_cfs_rq(se);
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ }
+
+ sa->runnable_avg = sa->util_avg;
}
#else /* !CONFIG_SMP */
@@ -1011,6 +1049,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+#ifdef CONFIG_NUMA
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long
+adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
+{
+ /*
+ * Allow a NUMA imbalance if busy CPUs is less than the maximum
+ * threshold. Above this threshold, individual tasks may be contending
+ * for both memory bandwidth and any shared HT resources. This is an
+ * approximation as the number of running tasks may not be related to
+ * the number of busy CPUs due to sched_setaffinity.
+ */
+ if (dst_running > imb_numa_nr)
+ return imbalance;
+
+ /*
+ * Allow a small imbalance based on a simple pair of communicating
+ * tasks that remain local when the destination is lightly loaded.
+ */
+ if (imbalance <= NUMA_IMBALANCE_MIN)
+ return 0;
+
+ return imbalance;
+}
+#endif /* CONFIG_NUMA */
+
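To see how the two checks in adjust_numa_imbalance() interact (numbers here are illustrative, not from the patch): on a domain with imb_numa_nr = 4, a pair of communicating tasks where the destination node already runs 3 tasks (dst_running = 3) and the computed imbalance is 2 yields a return value of 0, so the pair is allowed to stay local. Once dst_running exceeds imb_numa_nr, or the imbalance exceeds NUMA_IMBALANCE_MIN, the raw imbalance is returned and regular load balancing applies.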
#ifdef CONFIG_NUMA_BALANCING
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
@@ -1026,6 +1091,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/* The page with hint page fault latency < threshold in ms is considered hot */
+unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+
struct numa_group {
refcount_t refcount;
@@ -1259,10 +1330,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng)
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
- int maxdist, bool task)
+ int lim_dist, bool task)
{
unsigned long score = 0;
- int node;
+ int node, max_dist;
/*
* All nodes are directly connected, and the same distance
@@ -1271,6 +1342,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
if (sched_numa_topology_type == NUMA_DIRECT)
return 0;
+ /* sched_max_numa_distance may be changed in parallel. */
+ max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
* which should be ok given the number of nodes rarely exceeds 8.
@@ -1283,7 +1356,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* The furthest away nodes in the system are not interesting
* for placement; nid was already counted.
*/
- if (dist == sched_max_numa_distance || node == nid)
+ if (dist >= max_dist || node == nid)
continue;
/*
@@ -1293,8 +1366,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* "hoplimit", only nodes closer by than "hoplimit" are part
* of each group. Skip other nodes.
*/
- if (sched_numa_topology_type == NUMA_BACKPLANE &&
- dist >= maxdist)
+ if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
continue;
/* Add up the faults from nearby nodes. */
@@ -1312,8 +1384,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* This seems to result in good task placement.
*/
if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
- faults *= (sched_max_numa_distance - dist);
- faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ faults *= (max_dist - dist);
+ faults /= (max_dist - LOCAL_DISTANCE);
}
score += faults;
@@ -1367,6 +1439,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
+/*
+ * If memory tiering mode is enabled, cpupid of slow memory page is
+ * used to record scan time instead of CPU and PID. When tiering mode
+ * is disabled at run time, the scan time (in cpupid) will be
+ * interpreted as CPU and PID. So the CPU needs to be checked to avoid
+ * out-of-bounds array access.
+ */
+static inline bool cpupid_valid(int cpupid)
+{
+ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
+}
+
+/*
+ * For memory tiering mode, if there are enough free pages (more than
+ * enough watermark defined here) in fast memory node, to take full
+ * advantage of fast memory capacity, all recently accessed slow
+ * memory pages will be migrated to fast memory node without
+ * considering hot threshold.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+ int z;
+ unsigned long enough_wmark;
+
+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
+ pgdat->node_present_pages >> 4);
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_watermark_ok(zone, 0,
+ wmark_pages(zone, WMARK_PROMO) + enough_wmark,
+ ZONE_MOVABLE, 0))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * For memory tiering mode, when page tables are scanned, the scan
+ * time will be recorded in struct page in addition to making the page
+ * PROT_NONE for slow memory pages. So when the page is accessed, in
+ * hint page fault handler, the hint page fault latency is calculated
+ * via,
+ *
+ * hint page fault latency = hint page fault time - scan time
+ *
+ * The smaller the hint page fault latency, the higher the possibility
+ * for the page to be hot.
+ */
+static int numa_hint_fault_latency(struct page *page)
+{
+ int last_time, time;
+
+ time = jiffies_to_msecs(jiffies);
+ last_time = xchg_page_access_time(page, time);
+
+ return (time - last_time) & PAGE_ACCESS_TIME_MASK;
+}
+
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency. So we provide a mechanism to rate limit
+ * the number of pages that we try to promote.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long nr_cand;
+ unsigned int now, start;
+
+ now = jiffies_to_msecs(jiffies);
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ start = pgdat->nbp_rl_start;
+ if (now - start > MSEC_PER_SEC &&
+ cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+ pgdat->nbp_rl_nr_cand = nr_cand;
+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+ return true;
+ return false;
+}
+
+#define NUMA_MIGRATION_ADJUST_STEPS 16
+
+static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
+ unsigned long rate_limit,
+ unsigned int ref_th)
+{
+ unsigned int now, start, th_period, unit_th, th;
+ unsigned long nr_cand, ref_cand, diff_cand;
+
+ now = jiffies_to_msecs(jiffies);
+ th_period = sysctl_numa_balancing_scan_period_max;
+ start = pgdat->nbp_th_start;
+ if (now - start > th_period &&
+ cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+ ref_cand = rate_limit *
+ sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+ unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
+ th = pgdat->nbp_threshold ? : ref_th;
+ if (diff_cand > ref_cand * 11 / 10)
+ th = max(th - unit_th, unit_th);
+ else if (diff_cand < ref_cand * 9 / 10)
+ th = min(th + unit_th, ref_th * 2);
+ pgdat->nbp_th_nr_cand = nr_cand;
+ pgdat->nbp_threshold = th;
+ }
+}
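To make the adjustment step concrete (illustrative arithmetic based on the defaults above, not text from the patch): with ref_th = sysctl_numa_balancing_hot_threshold = 1000 ms, unit_th = 2 * 1000 / 16 = 125 ms. Once per sysctl_numa_balancing_scan_period_max window the hot threshold is lowered by 125 ms (never below 125 ms) when the promotion candidates observed in that window exceed 110% of what the rate limit allows, raised by 125 ms (never above 2 * ref_th = 2000 ms) when they fall below 90%, and left unchanged otherwise.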
+
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
@@ -1374,9 +1560,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ /*
+ * The pages in slow memory node should be migrated according
+ * to hot/cold instead of private/shared.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !node_is_toptier(src_nid)) {
+ struct pglist_data *pgdat;
+ unsigned long rate_limit;
+ unsigned int latency, th, def_th;
+
+ pgdat = NODE_DATA(dst_nid);
+ if (pgdat_free_space_enough(pgdat)) {
+ /* workload changed, reset hot threshold */
+ pgdat->nbp_threshold = 0;
+ return true;
+ }
+
+ def_th = sysctl_numa_balancing_hot_threshold;
+ rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+ (20 - PAGE_SHIFT);
+ numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
+
+ th = pgdat->nbp_threshold ? : def_th;
+ latency = numa_hint_fault_latency(page);
+ if (latency >= th)
+ return false;
+
+ return !numa_promotion_rate_limit(pgdat, rate_limit,
+ thp_nr_pages(page));
+ }
+
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
+ return false;
+
/*
* Allow first faults or private faults to migrate immediately early in
* the lifetime of a task. The magic number 4 is based on waiting for
@@ -1489,6 +1710,7 @@ struct task_numa_env {
int src_cpu, src_nid;
int dst_cpu, dst_nid;
+ int imb_numa_nr;
struct numa_stats src_stats, dst_stats;
@@ -1502,9 +1724,6 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
-static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int dst_weight);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -1525,11 +1744,11 @@ numa_type numa_classify(unsigned int imbalance_pct,
#ifdef CONFIG_SCHED_SMT
/* Forward declarations of select_idle_sibling helpers */
-static inline bool test_idle_cores(int cpu, bool def);
+static inline bool test_idle_cores(int cpu);
static inline int numa_idle_core(int idle_core, int cpu)
{
if (!static_branch_likely(&sched_smt_present) ||
- idle_core >= 0 || !test_idle_cores(cpu, false))
+ idle_core >= 0 || !test_idle_cores(cpu))
return idle_core;
/*
@@ -1569,7 +1788,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
- ns->util += cpu_util(cpu);
+ ns->util += cpu_util_cfs(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
@@ -1745,6 +1964,15 @@ static bool task_numa_compare(struct task_numa_env *env,
*/
cur_ng = rcu_dereference(cur->numa_group);
if (cur_ng == p_ng) {
+ /*
+ * Do not swap within a group or between tasks that have
+ * no group if there is spare capacity. Swapping does
+ * not address the load imbalance and helps one task at
+ * the cost of punishing another.
+ */
+ if (env->dst_stats.node_type == node_has_spare)
+ goto unlock;
+
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
/*
@@ -1885,7 +2113,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running,
- env->dst_stats.weight);
+ env->imb_numa_nr);
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -1950,8 +2178,10 @@ static int task_numa_migrate(struct task_struct *p)
*/
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
- if (sd)
+ if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ env.imb_numa_nr = sd->imb_numa_nr;
+ }
rcu_read_unlock();
/*
@@ -1986,7 +2216,7 @@ static int task_numa_migrate(struct task_struct *p)
*/
ng = deref_curr_numa_group(p);
if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -2084,13 +2314,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group)
unsigned long faults, max_faults = 0;
int nid, active_nodes = 0;
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults > max_faults)
max_faults = faults;
}
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults * ACTIVE_NODE_FRACTION > max_faults)
active_nodes++;
@@ -2244,7 +2474,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
dist = sched_max_numa_distance;
- for_each_online_node(node) {
+ for_each_node_state(node, N_CPU) {
score = group_weight(p, node, dist);
if (score > max_score) {
max_score = score;
@@ -2263,7 +2493,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
* inside the highest scoring group of nodes. The nodemask tricks
* keep the complexity of the search down.
*/
- nodes = node_online_map;
+ nodes = node_states[N_CPU];
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
nodemask_t max_group = NODE_MASK_NONE;
@@ -2402,6 +2632,21 @@ static void task_numa_placement(struct task_struct *p)
}
}
+ /* Cannot migrate task to CPU-less node */
+ if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
+ int near_nid = max_nid;
+ int distance, near_distance = INT_MAX;
+
+ for_each_node_state(nid, N_CPU) {
+ distance = node_distance(max_nid, nid);
+ if (distance < near_distance) {
+ near_nid = nid;
+ near_distance = distance;
+ }
+ }
+ max_nid = near_nid;
+ }
+
if (ng) {
numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
@@ -2507,7 +2752,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!join)
return;
- BUG_ON(irqs_disabled());
+ WARN_ON_ONCE(irqs_disabled());
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
@@ -2592,6 +2837,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (!p->mm)
return;
+ /*
+ * NUMA faults statistics are unnecessary for the slow memory
+ * node for memory tiering mode.
+ */
+ if (!node_is_toptier(mem_node) &&
+ (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
+ !cpupid_valid(last_cpupid)))
+ return;
+
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
@@ -2672,6 +2926,7 @@ static void task_numa_work(struct callback_head *work)
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
u64 runtime = p->se.sum_exec_runtime;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
@@ -2728,13 +2983,16 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
- vma = find_vma(mm, start);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
- vma = mm->mmap;
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
}
- for (; vma; vma = vma->vm_next) {
+
+ for (; vma; vma = mas_find(&mas, ULONG_MAX)) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
continue;
@@ -2823,9 +3081,12 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
p->node_stamp = 0;
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_migrate_retry = 0;
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
+ p->numa_pages_migrated = 0;
+ p->total_numa_faults = 0;
RCU_INIT_POINTER(p->numa_group, NULL);
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
@@ -2863,7 +3124,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
/*
* We don't care about NUMA placement if we don't have memory.
*/
- if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
+ if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
/*
@@ -3029,9 +3290,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u32 divider = get_pelt_divider(&se->avg);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
#else
static inline void
@@ -3078,6 +3341,8 @@ void reweight_task(struct task_struct *p, int prio)
load->inv_weight = sched_prio_to_wmult[prio];
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
/*
@@ -3188,8 +3453,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-
/*
* Recomputes the group entity based on the current state of its group
* runqueue.
@@ -3240,13 +3503,41 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
- * See cpu_util().
+ * See cpu_util_cfs().
*/
cpufreq_update_util(rq, flags);
}
}
#ifdef CONFIG_SMP
+static inline bool load_avg_is_decayed(struct sched_avg *sa)
+{
+ if (sa->load_sum)
+ return false;
+
+ if (sa->util_sum)
+ return false;
+
+ if (sa->runnable_sum)
+ return false;
+
+ /*
+ * _avg must be null when _sum are null because _avg = _sum / divider
+ * Make sure that rounding and/or propagation of PELT values never
+ * break this.
+ */
+ SCHED_WARN_ON(sa->load_avg ||
+ sa->util_avg ||
+ sa->runnable_avg);
+
+ return true;
+}
+
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return u64_u32_load_copy(cfs_rq->avg.last_update_time,
+ cfs_rq->last_update_time_copy);
+}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
@@ -3279,27 +3570,12 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (cfs_rq->load.weight)
return false;
- if (cfs_rq->avg.load_sum)
- return false;
-
- if (cfs_rq->avg.util_sum)
- return false;
-
- if (cfs_rq->avg.runnable_sum)
+ if (!load_avg_is_decayed(&cfs_rq->avg))
return false;
if (child_cfs_rq_on_list(cfs_rq))
return false;
- /*
- * _avg must be null when _sum are null because _avg = _sum / divider
- * Make sure that rounding and/or propagation of PELT values never
- * break this.
- */
- SCHED_WARN_ON(cfs_rq->avg.load_avg ||
- cfs_rq->avg.util_avg ||
- cfs_rq->avg.runnable_avg);
-
return true;
}
@@ -3357,32 +3633,13 @@ void set_task_rq_fair(struct sched_entity *se,
if (!(se->avg.last_update_time && prev))
return;
-#ifndef CONFIG_64BIT
- {
- u64 p_last_update_time_copy;
- u64 n_last_update_time_copy;
-
- do {
- p_last_update_time_copy = prev->load_last_update_time_copy;
- n_last_update_time_copy = next->load_last_update_time_copy;
-
- smp_rmb();
-
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
+ p_last_update_time = cfs_rq_last_update_time(prev);
+ n_last_update_time = cfs_rq_last_update_time(next);
- } while (p_last_update_time != p_last_update_time_copy ||
- n_last_update_time != n_last_update_time_copy);
- }
-#else
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
-#endif
__update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
-
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
@@ -3450,15 +3707,14 @@ void set_task_rq_fair(struct sched_entity *se,
* XXX: only do this for the part of runnable > running ?
*
*/
-
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
- u32 divider;
+ long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ u32 new_sum, divider;
/* Nothing to update */
- if (!delta)
+ if (!delta_avg)
return;
/*
@@ -3467,23 +3723,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * divider;
+ new_sum = se->avg.util_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.util_sum;
+ se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ add_positive(&cfs_rq->avg.util_avg, delta_avg);
+ add_positive(&cfs_rq->avg.util_sum, delta_sum);
+
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
- u32 divider;
+ long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ u32 new_sum, divider;
/* Nothing to update */
- if (!delta)
+ if (!delta_avg)
return;
/*
@@ -3494,19 +3757,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
- se->avg.runnable_sum = se->avg.runnable_avg * divider;
+ new_sum = se->avg.runnable_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
+ se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
unsigned long load_avg;
u64 load_sum = 0;
+ s64 delta_sum;
u32 divider;
if (!runnable_sum)
@@ -3533,7 +3802,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* assuming all tasks are equally runnable.
*/
if (scale_load_down(gcfs_rq->load.weight)) {
- load_sum = div_s64(gcfs_rq->avg.load_sum,
+ load_sum = div_u64(gcfs_rq->avg.load_sum,
scale_load_down(gcfs_rq->load.weight));
}
@@ -3550,19 +3819,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
- load_sum = (s64)se_weight(se) * runnable_sum;
- load_avg = div_s64(load_sum, divider);
-
- se->avg.load_sum = runnable_sum;
+ load_sum = se_weight(se) * runnable_sum;
+ load_avg = div_u64(load_sum, divider);
- delta = load_avg - se->avg.load_avg;
- if (!delta)
+ delta_avg = load_avg - se->avg.load_avg;
+ if (!delta_avg)
return;
- se->avg.load_avg = load_avg;
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
- add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+ se->avg.load_sum = runnable_sum;
+ se->avg.load_avg = load_avg;
+ add_positive(&cfs_rq->avg.load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.load_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3642,18 +3914,100 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_NO_HZ_COMMON
+static inline void migrate_se_pelt_lag(struct sched_entity *se)
+{
+ u64 throttled = 0, now, lut;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+ bool is_idle;
+
+ if (load_avg_is_decayed(&se->avg))
+ return;
+
+ cfs_rq = cfs_rq_of(se);
+ rq = rq_of(cfs_rq);
+
+ rcu_read_lock();
+ is_idle = is_idle_task(rcu_dereference(rq->curr));
+ rcu_read_unlock();
+
+ /*
+ * The lag estimation comes with a cost we don't want to pay all the
+ * time. Hence, limiting to the case where the source CPU is idle and
+ * we know we are at the greatest risk to have an outdated clock.
+ */
+ if (!is_idle)
+ return;
+
+ /*
+ * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
+ *
+ * last_update_time (the cfs_rq's last_update_time)
+ * = cfs_rq_clock_pelt()@cfs_rq_idle
+ * = rq_clock_pelt()@cfs_rq_idle
+ * - cfs->throttled_clock_pelt_time@cfs_rq_idle
+ *
+ * cfs_idle_lag (delta between rq's update and cfs_rq's update)
+ * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
+ *
+ * rq_idle_lag (delta between now and rq's update)
+ * = sched_clock_cpu() - rq_clock()@rq_idle
+ *
+ * We can then write:
+ *
+ * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
+ * sched_clock_cpu() - rq_clock()@rq_idle
+ * Where:
+ * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
+ * rq_clock()@rq_idle is rq->clock_idle
+ * cfs->throttled_clock_pelt_time@cfs_rq_idle
+ * is cfs_rq->throttled_pelt_idle
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
+ /* The clock has been stopped for throttling */
+ if (throttled == U64_MAX)
+ return;
+#endif
+ now = u64_u32_load(rq->clock_pelt_idle);
+ /*
+ * Paired with _update_idle_rq_clock_pelt(). It ensures that, in the
+ * worst case, we observe the old clock_pelt_idle value together with
+ * the new clock_idle, which leads to an underestimation. The opposite
+ * would lead to an overestimation.
+ */
+ smp_rmb();
+ lut = cfs_rq_last_update_time(cfs_rq);
+
+ now -= throttled;
+ if (now < lut)
+ /*
+ * cfs_rq->avg.last_update_time is more recent than our
+ * estimation, let's use it.
+ */
+ now = lut;
+ else
+ now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
+
+ __update_load_avg_blocked_se(now, se);
+}
+#else
+static void migrate_se_pelt_lag(struct sched_entity *se) {}
+#endif
+
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_pelt()
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
- * avg. The immediate corollary is that all (fair) tasks must be attached, see
- * post_init_entity_util_avg().
+ * avg. The immediate corollary is that all (fair) tasks must be attached.
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
- * Returns true if the load decayed or we removed load.
+ * Return: true if the load decayed or we removed load.
*
* Since both these conditions indicate a changed cfs_rq->avg.load we should
* call update_tg_load_avg() when this function returns true.
@@ -3678,15 +4032,32 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
r = removed_load;
sub_positive(&sa->load_avg, r);
- sa->load_sum = sa->load_avg * divider;
+ sub_positive(&sa->load_sum, r * divider);
+ /* See sa->util_sum below */
+ sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
r = removed_util;
sub_positive(&sa->util_avg, r);
- sa->util_sum = sa->util_avg * divider;
+ sub_positive(&sa->util_sum, r * divider);
+ /*
+ * Because of rounding, se->util_sum might end up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum become null whereas
+ * cfs->util_avg is not.
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
- sa->runnable_sum = sa->runnable_avg * divider;
+ sub_positive(&sa->runnable_sum, r * divider);
+ /* See sa->util_sum above */
+ sa->runnable_sum = max_t(u32, sa->runnable_sum,
+ sa->runnable_avg * PELT_MIN_DIVIDER);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -3699,12 +4070,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
}
decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
-
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->load_last_update_time_copy = sa->last_update_time;
-#endif
-
+ u64_u32_store_copy(sa->last_update_time,
+ cfs_rq->last_update_time_copy,
+ sa->last_update_time);
return decayed;
}
@@ -3744,11 +4112,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
se->avg.runnable_sum = se->avg.runnable_avg * divider;
- se->avg.load_sum = divider;
- if (se_weight(se)) {
- se->avg.load_sum =
- div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
- }
+ se->avg.load_sum = se->avg.load_avg * divider;
+ if (se_weight(se) < se->avg.load_sum)
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
+ else
+ se->avg.load_sum = 1;
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
@@ -3773,17 +4141,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- /*
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
- * See ___update_load_avg() for details.
- */
- u32 divider = get_pelt_divider(&cfs_rq->avg);
-
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -3798,6 +4167,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
#define UPDATE_TG 0x1
#define SKIP_AGE_LOAD 0x2
#define DO_ATTACH 0x4
+#define DO_DETACH 0x8
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -3827,6 +4197,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
+ } else if (flags & DO_DETACH) {
+ /*
+ * DO_DETACH means we're here from dequeue_entity()
+ * and we are migrating task out of the CPU.
+ */
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq);
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
@@ -3835,27 +4212,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
}
}
-#ifndef CONFIG_64BIT
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- u64 last_update_time_copy;
- u64 last_update_time;
-
- do {
- last_update_time_copy = cfs_rq->load_last_update_time_copy;
- smp_rmb();
- last_update_time = cfs_rq->avg.last_update_time;
- } while (last_update_time != last_update_time_copy);
-
- return last_update_time;
-}
-#else
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->avg.last_update_time;
-}
-#endif
-
/*
* Synchronize entity load avg of dequeued entity without locking
* the previous rq.
@@ -3880,8 +4236,8 @@ static void remove_entity_load_avg(struct sched_entity *se)
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
- * post_init_entity_util_avg() which will have added things to the
- * cfs_rq, so we can remove unconditionally.
+ * enqueue_task_fair() which will have added things to the cfs_rq,
+ * so we can remove unconditionally.
*/
sync_entity_load_avg(se);
@@ -4070,14 +4426,15 @@ done:
trace_sched_util_est_se_tp(&p->se);
}
-static inline int task_fits_capacity(struct task_struct *p, long capacity)
+static inline int task_fits_capacity(struct task_struct *p,
+ unsigned long capacity)
{
return fits_capacity(uclamp_task_util(p), capacity);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ if (!sched_asym_cpucap_active())
return;
if (!p || p->nr_cpus_allowed == 1) {
@@ -4107,6 +4464,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
#define DO_ATTACH 0x0
+#define DO_DETACH 0x0
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
@@ -4249,7 +4607,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_running of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
@@ -4269,16 +4628,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- /*
- * When bandwidth control is enabled, cfs might have been removed
- * because of a parent been throttled but cfs->nr_running > 1. Try to
- * add it unconditionally.
- */
- if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
- list_add_leaf_cfs_rq(cfs_rq);
-
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_running == 1) {
check_enqueue_throttle(cfs_rq);
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+ }
}
static void __clear_buddies_last(struct sched_entity *se)
@@ -4331,6 +4685,11 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ int action = UPDATE_TG;
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
+ action |= DO_DETACH;
+
/*
* Update run-time statistics of the 'current'.
*/
@@ -4339,12 +4698,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Subtract its load from the cfs_rq->runnable_avg.
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_running of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_load_avg(cfs_rq, se, action);
se_update_runnable(se);
update_stats_dequeue_fair(cfs_rq, se, flags);
@@ -4378,6 +4738,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
+
+ if (cfs_rq->nr_running == 0)
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
}
/*
@@ -4759,11 +5122,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
- cfs_rq->throttled_clock_task;
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
/* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
+ if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
@@ -4777,7 +5140,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
/* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
}
cfs_rq->throttle_count++;
@@ -4893,11 +5256,18 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
- /* Nothing to run but something to decay (on_list)? Complete the branch */
if (!cfs_rq->load.weight) {
- if (cfs_rq->on_list)
- goto unthrottle_throttle;
- return;
+ if (!cfs_rq->on_list)
+ return;
+ /*
+ * Nothing to run but something to decay (on_list)?
+ * Complete the branch.
+ */
+ for_each_sched_entity(se) {
+ if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
+ break;
+ }
+ goto unthrottle_throttle;
}
task_delta = cfs_rq->h_nr_running;
@@ -4935,31 +5305,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(qcfs_rq))
- list_add_leaf_cfs_rq(qcfs_rq);
}
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, task_delta);
unthrottle_throttle:
- /*
- * The cfs_rq_throttled() breaks in the above iteration can result in
- * incomplete leaf list maintenance, resulting in triggering the
- * assertion below.
- */
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(qcfs_rq))
- break;
- }
-
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -5221,7 +5572,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
pcfs_rq = tg->parent->cfs_rq[cpu];
cfs_rq->throttle_count = pcfs_rq->throttle_count;
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
@@ -5509,11 +5860,9 @@ static inline void hrtick_update(struct rq *rq)
#endif
#ifdef CONFIG_SMP
-static inline unsigned long cpu_util(int cpu);
-
static inline bool cpu_overutilized(int cpu)
{
- return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
+ return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
}
static inline void update_overutilized_status(struct rq *rq)
@@ -5616,13 +5965,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
}
/* At this point se is NULL and we are at root level*/
@@ -5646,21 +5988,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_overutilized_status(rq);
enqueue_throttle:
- if (cfs_bandwidth_used()) {
- /*
- * When bandwidth control is enabled; the cfs_rq_throttled()
- * breaks in the above iteration can result in incomplete
- * leaf list maintenance, resulting in triggering the assertion
- * below.
- */
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(cfs_rq))
- break;
- }
- }
-
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -5746,8 +6073,8 @@ dequeue_throttle:
#ifdef CONFIG_SMP
/* Working cpumask for: load_balance, load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -6113,7 +6440,7 @@ static inline void set_idle_cores(int cpu, int val)
WRITE_ONCE(sds->has_idle_cores, val);
}
-static inline bool test_idle_cores(int cpu, bool def)
+static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
@@ -6121,7 +6448,7 @@ static inline bool test_idle_cores(int cpu, bool def)
if (sds)
return READ_ONCE(sds->has_idle_cores);
- return def;
+ return false;
}
/*
@@ -6137,7 +6464,7 @@ void __update_idle_core(struct rq *rq)
int cpu;
rcu_read_lock();
- if (test_idle_cores(core, true))
+ if (test_idle_cores(core))
goto unlock;
for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -6163,9 +6490,6 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
bool idle = true;
int cpu;
- if (!static_branch_likely(&sched_smt_present))
- return __select_idle_cpu(core, p);
-
for_each_cpu(cpu, cpu_smt_mask(core)) {
if (!available_idle_cpu(cpu)) {
idle = false;
@@ -6192,13 +6516,12 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_smt(struct task_struct *p, int target)
{
int cpu;
- for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
- !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
+ if (cpu == target)
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
@@ -6213,9 +6536,9 @@ static inline void set_idle_cores(int cpu, int val)
{
}
-static inline bool test_idle_cores(int cpu, bool def)
+static inline bool test_idle_cores(int cpu)
{
- return def;
+ return false;
}
static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
@@ -6223,7 +6546,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
return __select_idle_cpu(core, p);
}
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static inline int select_idle_smt(struct task_struct *p, int target)
{
return -1;
}
@@ -6237,23 +6560,24 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
{
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
+ struct sched_domain_shared *sd_share;
struct rq *this_rq = this_rq();
int this = smp_processor_id();
- struct sched_domain *this_sd;
+ struct sched_domain *this_sd = NULL;
u64 time = 0;
- this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- if (!this_sd)
- return -1;
-
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
if (sched_feat(SIS_PROP) && !has_idle_core) {
u64 avg_cost, avg_idle, span_avg;
unsigned long now = jiffies;
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+ return -1;
+
/*
* If we're busy, the assumption that the last idle period
* predicts the future is flawed; age away the remaining
@@ -6278,6 +6602,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
time = cpu_clock(this);
}
+ if (sched_feat(SIS_UTIL)) {
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+ if (sd_share) {
+ /* because !--nr is the condition to stop scan */
+ nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+ /* overloaded LLC is unlikely to have idle cpu/core */
+ if (nr == 1)
+ return -1;
+ }
+ }
+
for_each_cpu_wrap(cpu, cpus, target + 1) {
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -6296,7 +6631,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
if (has_idle_core)
set_idle_cores(target, false);
- if (sched_feat(SIS_PROP) && !has_idle_core) {
+ if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
time = cpu_clock(this) - time;
/*
@@ -6323,7 +6658,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
int cpu, best_cpu = -1;
struct cpumask *cpus;
- cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
task_util = uclamp_task_util(p);
@@ -6345,9 +6680,9 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}
-static inline bool asym_fits_capacity(int task_util, int cpu)
+static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
{
- if (static_branch_unlikely(&sched_asym_cpucapacity))
+ if (sched_asym_cpucap_active())
return fits_capacity(task_util, capacity_of(cpu));
return true;
@@ -6367,13 +6702,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* On asymmetric system, update task utilization because we will check
* that the task fits with cpu's capacity.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ if (sched_asym_cpucap_active()) {
sync_entity_load_avg(&p->se);
task_util = uclamp_task_util(p);
}
/*
- * per-cpu select_idle_mask usage
+ * per-cpu select_rq_mask usage
*/
lockdep_assert_irqs_disabled();
@@ -6398,8 +6733,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* pattern is IO completions.
*/
if (is_per_cpu_kthread(current) &&
+ in_task() &&
prev == smp_processor_id() &&
- this_rq()->nr_running <= 1) {
+ this_rq()->nr_running <= 1 &&
+ asym_fits_capacity(task_util, prev)) {
return prev;
}
@@ -6419,7 +6756,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* For asymmetric CPU capacity systems, our domain of interest is
* sd_asym_cpucapacity rather than sd_llc.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ if (sched_asym_cpucap_active()) {
sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
@@ -6440,10 +6777,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
if (sched_smt_active()) {
- has_idle_core = test_idle_cores(target, false);
+ has_idle_core = test_idle_cores(target);
if (!has_idle_core && cpus_share_cache(prev, target)) {
- i = select_idle_smt(p, sd, prev);
+ i = select_idle_smt(p, prev);
if ((unsigned int)i < nr_cpumask_bits)
return i;
}
@@ -6456,56 +6793,66 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
-/**
- * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
- * @cpu: the CPU to get the utilization of
- *
- * The unit of the return value must be the one of capacity so we can compare
- * the utilization with the capacity of the CPU that is available for CFS task
- * (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * The estimated utilization of a CPU is defined to be the maximum between its
- * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
- * currently RUNNABLE on that CPU.
- * This allows to properly represent the expected utilization of a CPU which
- * has just got a big task running since a long sleep period. At the same time
- * however it preserves the benefits of the "blocked utilization" in
- * describing the potential for other tasks waking up on the same CPU.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- *
- * Return: the (estimated) utilization for the specified CPU
- */
-static inline unsigned long cpu_util(int cpu)
+/*
+ * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
+ * (@dst_cpu = -1) or migrated to @dst_cpu.
+ */
+static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
- struct cfs_rq *cfs_rq;
- unsigned int util;
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
+ /*
+ * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
+ * contribution. If @p migrates from another CPU to @cpu add its
+ * contribution. In all the other cases @cpu is not impacted by the
+ * migration so its util_avg is already correct.
+ */
+ if (task_cpu(p) == cpu && dst_cpu != cpu)
+ lsub_positive(&util, task_util(p));
+ else if (task_cpu(p) != cpu && dst_cpu == cpu)
+ util += task_util(p);
+
+ if (sched_feat(UTIL_EST)) {
+ unsigned long util_est;
+
+ util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+ /*
+ * During wake-up @p isn't enqueued yet and doesn't contribute
+ * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+ * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
+ * has been enqueued.
+ *
+ * During exec (@dst_cpu = -1) @p is enqueued and does
+ * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+ * Remove it to "simulate" cpu_util without @p's contribution.
+ *
+ * Despite the task_on_rq_queued(@p) check there is still a
+ * small window for a possible race when an exec
+ * select_task_rq_fair() races with LB's detach_task().
+ *
+ * detach_task()
+ * deactivate_task()
+ * p->on_rq = TASK_ON_RQ_MIGRATING;
+ * -------------------------------- A
+ * dequeue_task() \
+ * dequeue_task_fair() + Race Time
+ * util_est_dequeue() /
+ * -------------------------------- B
+ *
+ * The additional check "current == p" is required to further
+ * reduce the race window.
+ */
+ if (dst_cpu == cpu)
+ util_est += _task_util_est(p);
+ else if (unlikely(task_on_rq_queued(p) || current == p))
+ lsub_positive(&util_est, _task_util_est(p));
- if (sched_feat(UTIL_EST))
- util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+ util = max(util, util_est);
+ }
- return min_t(unsigned long, util, capacity_orig_of(cpu));
+ return min(util, capacity_orig_of(cpu));
}
/*
@@ -6523,175 +6870,104 @@ static inline unsigned long cpu_util(int cpu)
*/
static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
- struct cfs_rq *cfs_rq;
- unsigned int util;
-
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util(cpu);
+ return cpu_util_cfs(cpu);
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
-
- /* Discount task's util from CPU's util */
- lsub_positive(&util, task_util(p));
+ return cpu_util_next(cpu, p, -1);
+}
- /*
- * Covered cases:
- *
- * a) if *p is the only task sleeping on this CPU, then:
- * cpu_util (== task_util) > util_est (== 0)
- * and thus we return:
- * cpu_util_without = (cpu_util - task_util) = 0
- *
- * b) if other tasks are SLEEPING on this CPU, which is now exiting
- * IDLE, then:
- * cpu_util >= task_util
- * cpu_util > util_est (== 0)
- * and thus we discount *p's blocked utilization to return:
- * cpu_util_without = (cpu_util - task_util) >= 0
- *
- * c) if other tasks are RUNNABLE on that CPU and
- * util_est > cpu_util
- * then we use util_est since it returns a more restrictive
- * estimation of the spare capacity on that CPU, by just
- * considering the expected utilization of tasks already
- * runnable on that CPU.
- *
- * Cases a) and b) are covered by the above code, while case c) is
- * covered by the following code when estimated utilization is
- * enabled.
- */
- if (sched_feat(UTIL_EST)) {
- unsigned int estimated =
- READ_ONCE(cfs_rq->avg.util_est.enqueued);
+/*
+ * energy_env - Utilization landscape for energy estimation.
+ * @task_busy_time: Utilization contribution by the task for which we test the
+ * placement. Given by eenv_task_busy_time().
+ * @pd_busy_time: Utilization of the whole perf domain without the task
+ * contribution. Given by eenv_pd_busy_time().
+ * @cpu_cap: Maximum CPU capacity for the perf domain.
+ * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
+ */
+struct energy_env {
+ unsigned long task_busy_time;
+ unsigned long pd_busy_time;
+ unsigned long cpu_cap;
+ unsigned long pd_cap;
+};
- /*
- * Despite the following checks we still have a small window
- * for a possible race, when an execl's select_task_rq_fair()
- * races with LB's detach_task():
- *
- * detach_task()
- * p->on_rq = TASK_ON_RQ_MIGRATING;
- * ---------------------------------- A
- * deactivate_task() \
- * dequeue_task() + RaceTime
- * util_est_dequeue() /
- * ---------------------------------- B
- *
- * The additional check on "current == p" it's required to
- * properly fix the execl regression and it helps in further
- * reducing the chances for the above race.
- */
- if (unlikely(task_on_rq_queued(p) || current == p))
- lsub_positive(&estimated, _task_util_est(p));
+/*
+ * Compute the task busy time for compute_energy(). This time cannot be
+ * injected directly into effective_cpu_util() because of the IRQ scaling.
+ * The latter only makes sense on the CPU where the task has most recently
+ * run.
+ */
+static inline void eenv_task_busy_time(struct energy_env *eenv,
+ struct task_struct *p, int prev_cpu)
+{
+ unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
+ unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
- util = max(util, estimated);
- }
+ if (unlikely(irq >= max_cap))
+ busy_time = max_cap;
+ else
+ busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
- /*
- * Utilization (estimated) can exceed the CPU capacity, thus let's
- * clamp to the maximum CPU capacity to ensure consistency with
- * the cpu_util call.
- */
- return min_t(unsigned long, util, capacity_orig_of(cpu));
+ eenv->task_busy_time = busy_time;
}
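
As a hedged illustration of the clamping above: scale_irq_capacity() is assumed here to scale a utilization by the fraction of the CPU not consumed by IRQs, i.e. util * (max - irq) / max. The helper below is a stand-alone model for illustration, not the kernel implementation.

/*
 * Stand-alone model of eenv_task_busy_time(), assuming
 * scale_irq_capacity(util, irq, max) ~= util * (max - irq) / max.
 */
static unsigned long task_busy_time_model(unsigned long task_util_est,
					  unsigned long irq_util,
					  unsigned long max_cap)
{
	if (irq_util >= max_cap)
		return max_cap;

	return task_util_est * (max_cap - irq_util) / max_cap;
}

/* e.g. task_util_est = 200, irq = 102 (~10% of 1024) -> busy_time ~= 180 */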
/*
- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
- * to @dst_cpu.
+ * Compute the perf_domain (PD) busy time for compute_energy(). It is based
+ * on the utilization of each CPU in @pd_cpus, but doesn't take utilization
+ * clamping into account since the ratio (utilization / cpu_capacity) is
+ * already enough to scale the EM-reported power consumption at the
+ * (eventually clamped) cpu_capacity.
+ *
+ * The contribution of the task @p for which we want to estimate the
+ * energy cost is removed (by cpu_util_next()) and must be calculated
+ * separately (see eenv_task_busy_time). This ensures:
+ *
+ * - A stable PD utilization, no matter which CPU of that PD we want to place
+ * the task on.
+ *
+ * - A fair comparison between CPUs as the task contribution (task_util())
+ * will always be the same no matter which CPU utilization we rely on
+ * (util_avg or util_est).
+ *
+ * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
+ * exceed @eenv->pd_cap.
*/
-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+static inline void eenv_pd_busy_time(struct energy_env *eenv,
+ struct cpumask *pd_cpus,
+ struct task_struct *p)
{
- struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
- unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
-
- /*
- * If @p migrates from @cpu to another, remove its contribution. Or,
- * if @p migrates from another CPU to @cpu, add its contribution. In
- * the other cases, @cpu is not impacted by the migration, so the
- * util_avg should already be correct.
- */
- if (task_cpu(p) == cpu && dst_cpu != cpu)
- lsub_positive(&util, task_util(p));
- else if (task_cpu(p) != cpu && dst_cpu == cpu)
- util += task_util(p);
-
- if (sched_feat(UTIL_EST)) {
- util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+ unsigned long busy_time = 0;
+ int cpu;
- /*
- * During wake-up, the task isn't enqueued yet and doesn't
- * appear in the cfs_rq->avg.util_est.enqueued of any rq,
- * so just add it (if needed) to "simulate" what will be
- * cpu_util() after the task has been enqueued.
- */
- if (dst_cpu == cpu)
- util_est += _task_util_est(p);
+ for_each_cpu(cpu, pd_cpus) {
+ unsigned long util = cpu_util_next(cpu, p, -1);
- util = max(util, util_est);
+ busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
}
- return min(util, capacity_orig_of(cpu));
+ eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
}
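
Worked example of the invariant described above: for a performance domain of two CPUs whose utilization without @p (cpu_util_next(cpu, p, -1)) is 300 and 200, pd_busy_time comes out around 500 (plus any other class's utilization that effective_cpu_util() may fold in), regardless of which CPU of the domain is later tried as @dst_cpu. The task's own contribution is only added back in compute_energy(), and the sum is clamped so that busy_time never exceeds pd_cap.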
/*
- * compute_energy(): Estimates the energy that @pd would consume if @p was
- * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of @pd's CPUs after the task migration, and uses the Energy Model
- * to compute what would be the energy if we decided to actually migrate that
- * task.
+ * Compute the maximum utilization for compute_energy() when the task @p
+ * is placed on the cpu @dst_cpu.
+ *
+ * Returns the maximum utilization among @eenv->cpus. This utilization can't
+ * exceed @eenv->cpu_cap.
*/
-static long
-compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+static inline unsigned long
+eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+ struct task_struct *p, int dst_cpu)
{
- struct cpumask *pd_mask = perf_domain_span(pd);
- unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
- unsigned long max_util = 0, sum_util = 0;
- unsigned long _cpu_cap = cpu_cap;
+ unsigned long max_util = 0;
int cpu;
- _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
-
- /*
- * The capacity state of CPUs of the current rd can be driven by CPUs
- * of another rd if they belong to the same pd. So, account for the
- * utilization of these CPUs too by masking pd with cpu_online_mask
- * instead of the rd span.
- *
- * If an entire pd is outside of the current rd, it will not appear in
- * its pd list and will not be accounted by compute_energy().
- */
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
- unsigned long cpu_util, util_running = util_freq;
- struct task_struct *tsk = NULL;
-
- /*
- * When @p is placed on @cpu:
- *
- * util_running = max(cpu_util, cpu_util_est) +
- * max(task_util, _task_util_est)
- *
- * while cpu_util_next is: max(cpu_util + task_util,
- * cpu_util_est + _task_util_est)
- */
- if (cpu == dst_cpu) {
- tsk = p;
- util_running =
- cpu_util_next(cpu, p, -1) + task_util_est(p);
- }
-
- /*
- * Busy time computation: utilization clamping is not
- * required since the ratio (sum_util / cpu_capacity)
- * is already enough to scale the EM reported power
- * consumption at the (eventually clamped) cpu_capacity.
- */
- cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
- ENERGY_UTIL, NULL);
-
- sum_util += min(cpu_util, _cpu_cap);
+ for_each_cpu(cpu, pd_cpus) {
+ struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
+ unsigned long util = cpu_util_next(cpu, p, dst_cpu);
+ unsigned long cpu_util;
/*
* Performance domain frequency: utilization clamping
@@ -6700,12 +6976,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
- FREQUENCY_UTIL, tsk);
- max_util = max(max_util, min(cpu_util, _cpu_cap));
+ cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, cpu_util);
}
- return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
+ return min(max_util, eenv->cpu_cap);
+}
+
+/*
+ * compute_energy(): Use the Energy Model to estimate the energy that @pd would
+ * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
+ * contribution is ignored.
+ */
+static inline unsigned long
+compute_energy(struct energy_env *eenv, struct perf_domain *pd,
+ struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
+{
+ unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+ unsigned long busy_time = eenv->pd_busy_time;
+
+ if (dst_cpu >= 0)
+ busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
+
+ return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
}
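
A hedged sketch of how the Energy Model call above can be thought of: pick the lowest performance state whose capacity covers max_util, then charge its cost in proportion to busy_time. The toy perf-state table, the field names and the exact cost accounting below are assumptions for illustration only; em_cpu_energy() is the authoritative implementation.

/* Illustrative only: a toy perf-state table, not the EM framework's types. */
struct toy_perf_state {
	unsigned long cap;	/* capacity delivered at this OPP */
	unsigned long cost;	/* relative energy cost at this OPP */
};

static unsigned long toy_pd_energy(const struct toy_perf_state *table, int nr,
				   unsigned long max_util,
				   unsigned long busy_time,
				   unsigned long cpu_cap)
{
	int i;

	/* Lowest OPP able to serve the peak (frequency) demand. */
	for (i = 0; i < nr - 1; i++) {
		if (table[i].cap >= max_util)
			break;
	}

	/* Energy grows with how busy the domain is at that OPP. */
	return table[i].cost * busy_time / cpu_cap;
}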
/*
@@ -6749,12 +7042,13 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
*/
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
- int cpu, best_energy_cpu = prev_cpu, target = -1;
- unsigned long cpu_cap, util, base_energy = 0;
+ struct root_domain *rd = this_rq()->rd;
+ int cpu, best_energy_cpu, target = -1;
struct sched_domain *sd;
struct perf_domain *pd;
+ struct energy_env eenv;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
@@ -6777,20 +7071,39 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!task_util_est(p))
goto unlock;
+ eenv_task_busy_time(&eenv, p, prev_cpu);
+
for (; pd; pd = pd->next) {
- unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ unsigned long cpu_cap, cpu_thermal_cap, util;
+ unsigned long cur_delta, max_spare_cap = 0;
bool compute_prev_delta = false;
- unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
+ unsigned long base_energy;
+
+ cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
+
+ if (cpumask_empty(cpus))
+ continue;
+
+ /* Account thermal pressure for the energy estimation */
+ cpu = cpumask_first(cpus);
+ cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
+ cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+
+ eenv.cpu_cap = cpu_thermal_cap;
+ eenv.pd_cap = 0;
+
+ for_each_cpu(cpu, cpus) {
+ eenv.pd_cap += cpu_thermal_cap;
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
- spare_cap = cpu_cap;
- lsub_positive(&spare_cap, util);
/*
* Skip CPUs that cannot satisfy the capacity request.
@@ -6803,15 +7116,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!fits_capacity(util, cpu_cap))
continue;
+ lsub_positive(&cpu_cap, util);
+
if (cpu == prev_cpu) {
/* Always use prev_cpu as a candidate. */
compute_prev_delta = true;
- } else if (spare_cap > max_spare_cap) {
+ } else if (cpu_cap > max_spare_cap) {
/*
* Find the CPU with the maximum spare capacity
* in the performance domain.
*/
- max_spare_cap = spare_cap;
+ max_spare_cap = cpu_cap;
max_spare_cap_cpu = cpu;
}
}
@@ -6819,25 +7134,29 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (max_spare_cap_cpu < 0 && !compute_prev_delta)
continue;
+ eenv_pd_busy_time(&eenv, cpus, p);
/* Compute the 'base' energy of the pd, without @p */
- base_energy_pd = compute_energy(p, -1, pd);
- base_energy += base_energy_pd;
+ base_energy = compute_energy(&eenv, pd, cpus, p, -1);
/* Evaluate the energy impact of using prev_cpu. */
if (compute_prev_delta) {
- prev_delta = compute_energy(p, prev_cpu, pd);
- if (prev_delta < base_energy_pd)
+ prev_delta = compute_energy(&eenv, pd, cpus, p,
+ prev_cpu);
+ /* CPU utilization has changed */
+ if (prev_delta < base_energy)
goto unlock;
- prev_delta -= base_energy_pd;
+ prev_delta -= base_energy;
best_delta = min(best_delta, prev_delta);
}
/* Evaluate the energy impact of using max_spare_cap_cpu. */
if (max_spare_cap_cpu >= 0) {
- cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
- if (cur_delta < base_energy_pd)
+ cur_delta = compute_energy(&eenv, pd, cpus, p,
+ max_spare_cap_cpu);
+ /* CPU utilization has changed */
+ if (cur_delta < base_energy)
goto unlock;
- cur_delta -= base_energy_pd;
+ cur_delta -= base_energy;
if (cur_delta < best_delta) {
best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
@@ -6846,12 +7165,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
rcu_read_unlock();
- /*
- * Pick the best CPU if prev_cpu cannot be used, or if it saves at
- * least 6% of the energy used by prev_cpu.
- */
- if ((prev_delta == ULONG_MAX) ||
- (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
+ if (best_delta < prev_delta)
target = best_energy_cpu;
return target;
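
Worked example of the policy change above: with base_energy = 14400 and prev_delta = 1600, the removed 6% rule only switched away from prev_cpu when prev_delta - best_delta exceeded (1600 + 14400) >> 4 = 1000, i.e. when best_delta dropped below 600; with the new strict comparison, any best_delta below 1600 is enough to pick best_energy_cpu.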
@@ -6915,6 +7229,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
break;
}
+ /*
+ * Usually only true for WF_EXEC and WF_FORK, as sched_domains
+ * usually do not have SD_BALANCE_WAKE set. That means wakeup
+ * will usually go to the fast path.
+ */
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)
@@ -6933,8 +7252,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
return new_cpu;
}
-static void detach_entity_cfs_rq(struct sched_entity *se);
-
/*
* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
@@ -6942,6 +7259,8 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
*/
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
+ struct sched_entity *se = &p->se;
+
/*
* As blocked tasks retain absolute vruntime the migration needs to
* deal with this by subtracting the old and adding the new
@@ -6949,50 +7268,32 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* the task on the new runqueue.
*/
if (READ_ONCE(p->__state) == TASK_WAKING) {
- struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
- do {
- min_vruntime_copy = cfs_rq->min_vruntime_copy;
- smp_rmb();
- min_vruntime = cfs_rq->min_vruntime;
- } while (min_vruntime != min_vruntime_copy);
-#else
- min_vruntime = cfs_rq->min_vruntime;
-#endif
-
- se->vruntime -= min_vruntime;
+ se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
}
- if (p->on_rq == TASK_ON_RQ_MIGRATING) {
- /*
- * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
- * rq->lock and can modify state directly.
- */
- lockdep_assert_rq_held(task_rq(p));
- detach_entity_cfs_rq(&p->se);
+ if (!task_on_rq_migrating(p)) {
+ remove_entity_load_avg(se);
- } else {
/*
- * We are supposed to update the task to "current" time, then
- * its up to date and ready to go to new CPU/cfs_rq. But we
- * have difficulty in getting what current time is, so simply
- * throw away the out-of-date time. This will result in the
- * wakee task is less decayed, but giving the wakee more load
- * sounds not bad.
+ * Here, the task's PELT values have been updated according to
+ * the current rq's clock. But if that clock hasn't been
+ * updated in a while, a substantial idle time will be missed,
+ * leading to an inflation after wake-up on the new rq.
+ *
+ * Estimate the missing time from the cfs_rq last_update_time
+ * and update sched_avg to improve the PELT continuity after
+ * migration.
*/
- remove_entity_load_avg(&p->se);
+ migrate_se_pelt_lag(se);
}
/* Tell new CPU we are migrated */
- p->se.avg.last_update_time = 0;
+ se->avg.last_update_time = 0;
/* We have migrated, no longer consider this task hot */
- p->se.exec_start = 0;
+ se->exec_start = 0;
update_scan_period(p, new_cpu);
}
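
The u64_u32_load() helper used in the TASK_WAKING branch above is assumed to hide exactly the 32-bit copy/retry dance that the removed lines open-coded (and a plain load on 64-bit). Below is a stand-alone, hypothetical model of the 32-bit read side; the names and types are illustrative, not the kernel's.

#include <stdint.h>

typedef uint64_t u64;

/* Stand-in for the kernel's read barrier, purely for illustration. */
#define smp_rmb_model()	__atomic_thread_fence(__ATOMIC_ACQUIRE)

/*
 * Model of the 32-bit read side: the writer stores the value first and the
 * copy second (with a paired write barrier), so a torn read shows up as a
 * mismatch and is retried.
 */
static u64 u64_u32_load_model(const volatile u64 *var, const volatile u64 *copy)
{
	u64 val, cpy;

	do {
		cpy = *copy;
		smp_rmb_model();
		val = *var;
	} while (val != cpy);

	return val;
}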
@@ -7144,7 +7445,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
find_matching_se(&se, &pse);
- BUG_ON(!pse);
+ WARN_ON_ONCE(!pse);
cse_is_idle = se_is_idle(se);
pse_is_idle = se_is_idle(pse);
@@ -7576,8 +7877,8 @@ enum group_type {
*/
group_fully_busy,
/*
- * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
- * and must be migrated to a more powerful CPU.
+ * One task doesn't fit with CPU's capacity and must be migrated to a
+ * more powerful CPU.
*/
group_misfit_task,
/*
@@ -7803,7 +8104,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (task_running(env->src_rq, p)) {
+ if (task_on_cpu(env->src_rq, p)) {
schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
@@ -7877,8 +8178,6 @@ static struct task_struct *detach_one_task(struct lb_env *env)
return NULL;
}
-static const unsigned int sched_nr_migrate_break = 32;
-
/*
* detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
@@ -7914,20 +8213,24 @@ static int detach_tasks(struct lb_env *env)
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
break;
- p = list_last_entry(tasks, struct task_struct, se.group_node);
-
env->loop++;
- /* We've more or less seen every task there is, call it quits */
- if (env->loop > env->loop_max)
+ /*
+ * We've more or less seen every task there is, call it quits
+ * unless we haven't found any movable task yet.
+ */
+ if (env->loop > env->loop_max &&
+ !(env->flags & LBF_ALL_PINNED))
break;
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
- env->loop_break += sched_nr_migrate_break;
+ env->loop_break += SCHED_NR_MIGRATE_BREAK;
env->flags |= LBF_NEED_BREAK;
break;
}
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
+
if (!can_migrate_task(p, env))
goto next;
@@ -8024,7 +8327,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
- BUG_ON(task_rq(p) != rq);
+ WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
check_preempt_curr(rq, p, 0);
}
@@ -8158,6 +8461,9 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq);
+ if (cfs_rq->nr_running == 0)
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
+
if (cfs_rq == &rq->cfs)
decayed = true;
}
@@ -8491,7 +8797,7 @@ static inline int sg_imbalanced(struct sched_group *group)
/*
* group_has_capacity returns true if the group has spare capacity that could
* be used by some tasks.
- * We consider that a group has spare capacity if the * number of task is
+ * We consider that a group has spare capacity if the number of tasks is
* smaller than the number of CPUs or if the utilization is lower than the
* available capacity for CFS tasks.
* For the latter, we use a threshold to stabilize the state, to take into
@@ -8586,6 +8892,8 @@ group_type group_classify(unsigned int imbalance_pct,
*
* If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
* of @dst_cpu are idle and @sg has lower priority.
+ *
+ * Return: true if @dst_cpu can pull tasks, false otherwise.
*/
static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
struct sg_lb_stats *sgs,
@@ -8658,9 +8966,23 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
+static inline bool
+sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
+{
+ /*
+ * When there is more than 1 task, the group_overloaded case already
+ * takes care of CPUs with reduced capacity.
+ */
+ if (rq->cfs.h_nr_running != 1)
+ return false;
+
+ return check_cpu_capacity(rq, sd);
+}
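
For scale, and assuming check_cpu_capacity() keeps its long-standing definition (capacity_of(cpu) * sd->imbalance_pct < capacity_orig * 100): with capacity_orig = 1024 and imbalance_pct = 117, a CPU running a single CFS task is flagged here once its remaining capacity drops below roughly 875, i.e. once more than about 15% has been eaten by RT/IRQ/thermal pressure, which makes that lone task a candidate for the misfit handling added below.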
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
+ * @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_status: Holds flag indicating the status of the sched_group
@@ -8679,9 +9001,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ unsigned long load = cpu_load(rq);
- sgs->group_load += cpu_load(rq);
- sgs->group_util += cpu_util(i);
+ sgs->group_load += load;
+ sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
@@ -8710,11 +9033,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (local_group)
continue;
- /* Check for a misfit task on the cpu */
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- sgs->group_misfit_task_load < rq->misfit_task_load) {
- sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOAD;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Check for a misfit task on the cpu */
+ if (sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *sg_status |= SG_OVERLOAD;
+ }
+ } else if ((env->idle != CPU_NOT_IDLE) &&
+ sched_reduced_capacity(rq, env->sd)) {
+ /* Check for a task running on a CPU with reduced capacity */
+ if (sgs->group_misfit_task_load < load)
+ sgs->group_misfit_task_load = load;
}
}
@@ -8767,7 +9096,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* CPUs in the group should either be possible to resolve
* internally or be covered by avg_load imbalance (eventually).
*/
- if (sgs->group_type == group_misfit_task &&
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type == group_misfit_task) &&
(!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;
@@ -9046,16 +9376,6 @@ static bool update_pick_idlest(struct sched_group *idlest,
}
/*
- * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
- * This is an approximation as the number of running tasks may not be
- * related to the number of busy CPUs due to sched_setaffinity.
- */
-static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
-{
- return (dst_running < (dst_weight >> 2));
-}
-
-/*
* find_idlest_group() finds and returns the least busy CPU group within the
* domain.
*
@@ -9171,7 +9491,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
break;
case group_has_spare:
+#ifdef CONFIG_NUMA
if (sd->flags & SD_NUMA) {
+ int imb_numa_nr = sd->imb_numa_nr;
#ifdef CONFIG_NUMA_BALANCING
int idlest_cpu;
/*
@@ -9184,16 +9506,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest;
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
/*
- * Otherwise, keep the task on this node to stay close
- * its wakeup source and improve locality. If there is
- * a real need of migration, periodic load balance will
+ * Otherwise, keep the task close to the wakeup source
+ * and improve locality if the number of running tasks
+ * would remain below the threshold where an imbalance is
+ * allowed while accounting for the possibility the
+ * task is pinned to a subset of CPUs. If there is a
+ * real need of migration, periodic load balance will
* take care of it.
*/
- if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+ if (p->nr_cpus_allowed != NR_CPUS) {
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
+
+ cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
+ imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
+ }
+
+ imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
+ if (!adjust_numa_imbalance(imbalance,
+ local_sgs.sum_nr_running + 1,
+ imb_numa_nr)) {
return NULL;
+ }
}
+#endif /* CONFIG_NUMA */
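
Worked example of the pinning-aware allowance above: if @p may only run on 4 CPUs of the local group while sd->imb_numa_nr is 8, the effective imb_numa_nr becomes min(4, 8) = 4. The imbalance tested is |local idle CPUs - idlest idle CPUs|, and when adjust_numa_imbalance() reports it as acceptable (returns 0), find_idlest_group() returns NULL so the task stays close to its wakeup source.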
/*
* Select group with highest number of idle CPUs. We could also
@@ -9209,6 +9546,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
return idlest;
}
+static void update_idle_cpu_scan(struct lb_env *env,
+ unsigned long sum_util)
+{
+ struct sched_domain_shared *sd_share;
+ int llc_weight, pct;
+ u64 x, y, tmp;
+ /*
+ * Update the number of CPUs to scan in LLC domain, which could
+ * be used as a hint in select_idle_cpu(). The update of sd_share
+ * could be expensive because it is within a shared cache line.
+ * So the write of this hint only occurs during periodic load
+ * balancing, rather than CPU_NEWLY_IDLE, because the latter
+ * can fire way more frequently than the former.
+ */
+ if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
+ return;
+
+ llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
+ if (env->sd->span_weight != llc_weight)
+ return;
+
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+ if (!sd_share)
+ return;
+
+ /*
+ * The number of CPUs to search drops as sum_util increases, when
+ * sum_util hits 85% or above, the scan stops.
+ * The reason to choose 85% as the threshold is because this is the
+ * imbalance_pct(117) when a LLC sched group is overloaded.
+ *
+ * let y = SCHED_CAPACITY_SCALE - p * x^2 [1]
+ * and y'= y / SCHED_CAPACITY_SCALE
+ *
+ * x is the ratio of sum_util compared to the CPU capacity:
+ * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
+ * y' is the ratio of CPUs to be scanned in the LLC domain,
+ * and the number of CPUs to scan is calculated by:
+ *
+ * nr_scan = llc_weight * y' [2]
+ *
+ * When x hits the threshold of overloaded, AKA, when
+ * x = 100 / pct, y drops to 0. According to [1],
+ * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
+ *
+ * Scale x by SCHED_CAPACITY_SCALE:
+ * x' = sum_util / llc_weight; [3]
+ *
+ * and finally [1] becomes:
+ * y = SCHED_CAPACITY_SCALE -
+ * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4]
+ *
+ */
+ /* equation [3] */
+ x = sum_util;
+ do_div(x, llc_weight);
+
+ /* equation [4] */
+ pct = env->sd->imbalance_pct;
+ tmp = x * x * pct * pct;
+ do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
+ tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
+ y = SCHED_CAPACITY_SCALE - tmp;
+
+ /* equation [2] */
+ y *= llc_weight;
+ do_div(y, SCHED_CAPACITY_SCALE);
+ if ((int)y != sd_share->nr_idle_scan)
+ WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
+}
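
To see what the formula above yields in practice, here is a minimal stand-alone model of the nr_idle_scan computation; the SCHED_CAPACITY_SCALE value and the worked numbers are the usual defaults, assumed here for illustration.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL

/* Stand-alone model of the SIS_UTIL scan-depth formula above. */
static unsigned long nr_idle_scan_model(unsigned long sum_util,
					unsigned long llc_weight,
					unsigned long pct)
{
	unsigned long long x, y, tmp;

	x = sum_util / llc_weight;				/* equation [3] */

	tmp = x * x * pct * pct;
	tmp /= 10000 * SCHED_CAPACITY_SCALE;
	if (tmp > SCHED_CAPACITY_SCALE)
		tmp = SCHED_CAPACITY_SCALE;
	y = SCHED_CAPACITY_SCALE - tmp;				/* equation [4] */

	return y * llc_weight / SCHED_CAPACITY_SCALE;		/* equation [2] */
}

int main(void)
{
	/* 16-CPU LLC, imbalance_pct = 117 */
	printf("%lu\n", nr_idle_scan_model(0, 16, 117));			/* 16 */
	printf("%lu\n", nr_idle_scan_model(8 * SCHED_CAPACITY_SCALE, 16, 117));	/* ~10 */
	printf("%lu\n", nr_idle_scan_model(14 * SCHED_CAPACITY_SCALE, 16, 117));	/* 0 */
	return 0;
}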
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -9221,6 +9629,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
+ unsigned long sum_util = 0;
int sg_status = 0;
do {
@@ -9253,6 +9662,7 @@ next_group:
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
+ sum_util += sgs->group_util;
sg = sg->next;
} while (sg != env->sd->groups);
@@ -9278,24 +9688,8 @@ next_group:
WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
}
-}
-
-#define NUMA_IMBALANCE_MIN 2
-
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int dst_weight)
-{
- if (!allow_numa_imbalance(dst_running, dst_weight))
- return imbalance;
-
- /*
- * Allow a small imbalance based on a simple pair of communicating
- * tasks that remain local when the destination is lightly loaded.
- */
- if (imbalance <= NUMA_IMBALANCE_MIN)
- return 0;
- return imbalance;
+ update_idle_cpu_scan(env, sum_util);
}
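
For reference, the removed helpers encoded the old policy in two steps: allow_numa_imbalance() permitted an imbalance only while fewer than a quarter of the domain's CPUs were busy (e.g. fewer than 8 tasks on a 32-CPU NUMA domain), and adjust_numa_imbalance() then ignored imbalances of at most NUMA_IMBALANCE_MIN (2) tasks, the classic pair of communicating tasks. The replacement threshold, sd->imb_numa_nr, is set up when the sched domains are built, outside this file.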
/**
@@ -9312,9 +9706,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
busiest = &sds->busiest_stat;
if (busiest->group_type == group_misfit_task) {
- /* Set imbalance to allow misfit tasks to be balanced. */
- env->migration_type = migrate_misfit;
- env->imbalance = 1;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ } else {
+ /*
+ * Set load imbalance to allow moving task from cpu
+ * with reduced capacity.
+ */
+ env->migration_type = migrate_load;
+ env->imbalance = busiest->group_misfit_task_load;
+ }
return;
}
@@ -9382,7 +9785,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff >> 1;
+ env->imbalance = nr_diff;
} else {
/*
@@ -9390,15 +9793,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* idle cpus.
*/
env->migration_type = migrate_task;
- env->imbalance = max_t(long, 0, (local->idle_cpus -
- busiest->idle_cpus) >> 1);
+ env->imbalance = max_t(long, 0,
+ (local->idle_cpus - busiest->idle_cpus));
}
+#ifdef CONFIG_NUMA
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- busiest->sum_nr_running, busiest->group_weight);
+ local->sum_nr_running + 1,
+ env->sd->imb_numa_nr);
}
+#endif
+
+ /* Number of tasks to move to restore balance */
+ env->imbalance >>= 1;
return;
}
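
Worked example of the reshuffled halving above: with 10 idle CPUs locally and 2 on the busiest group, the raw difference is 8. Previously that was halved to 4 before adjust_numa_imbalance() saw it; now the NUMA adjustment is applied to the full 8 (judged against imb_numa_nr), and only the final `env->imbalance >>= 1` brings the number of tasks to move down to 4, so the allowance is evaluated on the real gap rather than on half of it.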
@@ -9416,8 +9825,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
local->group_capacity;
- sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
- sds->total_capacity;
/*
* If the local group is more loaded than the selected
* busiest group don't try to pull any tasks.
@@ -9426,6 +9833,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
env->imbalance = 0;
return;
}
+
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ sds->total_capacity;
}
/*
@@ -9451,7 +9861,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
* has_spare nr_idle balanced N/A N/A balanced balanced
* fully_busy nr_idle nr_idle N/A N/A balanced balanced
- * misfit_task force N/A N/A N/A force force
+ * misfit_task force N/A N/A N/A N/A N/A
* asym_packing force force N/A N/A force force
* imbalanced force force N/A N/A force force
* overloaded force force N/A N/A force avg_load
@@ -9468,12 +9878,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
+ * @env: The load balancing environment.
*
* Also calculates the amount of runnable load which should be moved
* to restore balance.
*
- * @env: The load balancing environment.
- *
* Return: - The busiest group if imbalance exists.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
@@ -9699,7 +10108,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
- util = cpu_util(cpu_of(rq));
+ util = cpu_util_cfs(i);
/*
* Don't try to pull utilization from a CPU with one
@@ -9821,9 +10230,15 @@ static int should_we_balance(struct lb_env *env)
/*
* In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
+ *
+ * However, we bail out if we already have tasks or a wakeup pending,
+ * to optimize wakeup latency.
*/
- if (env->idle == CPU_NEWLY_IDLE)
+ if (env->idle == CPU_NEWLY_IDLE) {
+ if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
+ return 0;
return 1;
+ }
/* Try to find first idle CPU */
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
@@ -9852,14 +10267,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct rq *busiest;
struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
-
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_span(sd->groups),
.idle = idle,
- .loop_break = sched_nr_migrate_break,
+ .loop_break = SCHED_NR_MIGRATE_BREAK,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
@@ -9887,7 +10301,7 @@ redo:
goto out_balanced;
}
- BUG_ON(busiest == env.dst_rq);
+ WARN_ON_ONCE(busiest == env.dst_rq);
schedstat_add(sd->lb_imbalance[idle], env.imbalance);
@@ -9935,7 +10349,9 @@ more_balance:
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
- goto more_balance;
+ /* Stop if we tried all running tasks */
+ if (env.loop < busiest->nr_running)
+ goto more_balance;
}
/*
@@ -9966,7 +10382,7 @@ more_balance:
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
/*
* Go back to "more_balance" rather than "redo" since we
@@ -9998,7 +10414,7 @@ more_balance:
*/
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
goto redo;
}
goto out_all_pinned;
@@ -10183,7 +10599,7 @@ static int active_load_balance_cpu_stop(void *data)
* we need to fix it. Originally reported by
* Bjorn Helgaas on a 128-CPU setup.
*/
- BUG_ON(busiest_rq == target_rq);
+ WARN_ON_ONCE(busiest_rq == target_rq);
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
@@ -10362,7 +10778,7 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
* anywhere yet.
*/
@@ -10371,7 +10787,7 @@ static inline int find_new_ilb(void)
int ilb;
const struct cpumask *hk_mask;
- hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
@@ -10387,7 +10803,7 @@ static inline int find_new_ilb(void)
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
+ * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@@ -10600,7 +11016,7 @@ void nohz_balance_enter_idle(int cpu)
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
+ if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
return;
/*
@@ -10669,8 +11085,7 @@ static bool update_nohz_stats(struct rq *rq)
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
*/
-static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
- enum cpu_idle_type idle)
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
@@ -10785,7 +11200,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (idle != CPU_IDLE)
return false;
- _nohz_idle_balance(this_rq, flags, idle);
+ _nohz_idle_balance(this_rq, flags);
return true;
}
@@ -10805,7 +11220,7 @@ void nohz_run_idle_balance(int cpu)
* (ie NOHZ_STATS_KICK set) and will do the same.
*/
if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
- _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
+ _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
}
static void nohz_newidle_balance(struct rq *this_rq)
@@ -10816,7 +11231,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
* This CPU doesn't want to be disturbed by scheduler
* housekeeping
*/
- if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+ if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
return;
/* Will wake up very soon. No time for doing anything else*/
@@ -11068,7 +11483,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
- if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+ if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
@@ -11274,9 +11689,13 @@ static inline bool vruntime_normalized(struct task_struct *p)
*/
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ return;
- list_add_leaf_cfs_rq(cfs_rq_of(se));
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
se = se->parent;
@@ -11284,14 +11703,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- if (!cfs_rq_throttled(cfs_rq)){
- update_load_avg(cfs_rq, se, UPDATE_TG);
- list_add_leaf_cfs_rq(cfs_rq);
- continue;
- }
+ update_load_avg(cfs_rq, se, UPDATE_TG);
- if (list_add_leaf_cfs_rq(cfs_rq))
+ if (cfs_rq_throttled(cfs_rq))
break;
+
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
}
}
#else
@@ -11302,6 +11720,17 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+#ifdef CONFIG_SMP
+ /*
+ * In case the task sched_avg hasn't been attached:
+ * - A forked task which hasn't been woken up by wake_up_new_task().
+ * - A task which has been woken up by try_to_wake_up() but is
+ * waiting for actually being woken up by sched_ttwu_pending().
+ */
+ if (!se->avg.last_update_time)
+ return;
+#endif
+
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
@@ -11313,14 +11742,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
@@ -11409,49 +11830,32 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifndef CONFIG_64BIT
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+ u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_set_group_fair(struct task_struct *p)
+static void task_change_group_fair(struct task_struct *p)
{
- struct sched_entity *se = &p->se;
-
- set_task_rq(p, task_cpu(p));
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-}
+ /*
+ * We couldn't detach or attach a forked task which
+ * hasn't been woken up by wake_up_new_task().
+ */
+ if (READ_ONCE(p->__state) == TASK_NEW)
+ return;
-static void task_move_group_fair(struct task_struct *p)
-{
detach_task_cfs_rq(p);
- set_task_rq(p, task_cpu(p));
#ifdef CONFIG_SMP
/* Tell se's cfs_rq has been changed -- migrated */
p->se.avg.last_update_time = 0;
#endif
+ set_task_rq(p, task_cpu(p));
attach_task_cfs_rq(p);
}
-static void task_change_group_fair(struct task_struct *p, int type)
-{
- switch (type) {
- case TASK_SET_GROUP:
- task_set_group_fair(p);
- break;
-
- case TASK_MOVE_GROUP:
- task_move_group_fair(p);
- break;
- }
-}
-
void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -11828,6 +12232,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
+ int i;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+ }
+
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON
@@ -11838,101 +12249,3 @@ __init void init_sched_fair_class(void)
#endif /* SMP */
}
-
-/*
- * Helper functions to facilitate extracting info from tracepoints.
- */
-
-const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
-{
-#ifdef CONFIG_SMP
- return cfs_rq ? &cfs_rq->avg : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
-
-char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
-{
- if (!cfs_rq) {
- if (str)
- strlcpy(str, "(null)", len);
- else
- return NULL;
- }
-
- cfs_rq_tg_path(cfs_rq, str, len);
- return str;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
-
-int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
-{
- return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
-
-const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_rt : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
-
-const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_dl : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
-
-const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
- return rq ? &rq->avg_irq : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
-
-int sched_trace_rq_cpu(struct rq *rq)
-{
- return rq ? cpu_of(rq) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
-
-int sched_trace_rq_cpu_capacity(struct rq *rq)
-{
- return rq ?
-#ifdef CONFIG_SMP
- rq->cpu_capacity
-#else
- SCHED_CAPACITY_SCALE
-#endif
- : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
-
-const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
-{
-#ifdef CONFIG_SMP
- return rd ? rd->span : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rd_span);
-
-int sched_trace_rq_nr_running(struct rq *rq)
-{
- return rq ? rq->nr_running : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);