aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched/topology.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-06-28 12:14:19 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-06-28 12:14:19 -0700
commit54a728dc5e4feb0a9278ad62b19f34ad21ed0ee4 (patch)
tree2737c23d4dbc6426d6d9467626a7634cbbb40fcd /kernel/sched/topology.c
parentMerge tag 'perf-core-2021-06-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (diff)
parentsched/doc: Update the CPU capacity asymmetry bits (diff)
downloadlinux-dev-54a728dc5e4feb0a9278ad62b19f34ad21ed0ee4.tar.xz
linux-dev-54a728dc5e4feb0a9278ad62b19f34ad21ed0ee4.zip
Merge tag 'sched-core-2021-06-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler udpates from Ingo Molnar: - Changes to core scheduling facilities: - Add "Core Scheduling" via CONFIG_SCHED_CORE=y, which enables coordinated scheduling across SMT siblings. This is a much requested feature for cloud computing platforms, to allow the flexible utilization of SMT siblings, without exposing untrusted domains to information leaks & side channels, plus to ensure more deterministic computing performance on SMT systems used by heterogenous workloads. There are new prctls to set core scheduling groups, which allows more flexible management of workloads that can share siblings. - Fix task->state access anti-patterns that may result in missed wakeups and rename it to ->__state in the process to catch new abuses. - Load-balancing changes: - Tweak newidle_balance for fair-sched, to improve 'memcache'-like workloads. - "Age" (decay) average idle time, to better track & improve workloads such as 'tbench'. - Fix & improve energy-aware (EAS) balancing logic & metrics. - Fix & improve the uclamp metrics. - Fix task migration (taskset) corner case on !CONFIG_CPUSET. - Fix RT and deadline utilization tracking across policy changes - Introduce a "burstable" CFS controller via cgroups, which allows bursty CPU-bound workloads to borrow a bit against their future quota to improve overall latencies & batching. Can be tweaked via /sys/fs/cgroup/cpu/<X>/cpu.cfs_burst_us. - Rework assymetric topology/capacity detection & handling. - Scheduler statistics & tooling: - Disable delayacct by default, but add a sysctl to enable it at runtime if tooling needs it. Use static keys and other optimizations to make it more palatable. - Use sched_clock() in delayacct, instead of ktime_get_ns(). - Misc cleanups and fixes. * tag 'sched-core-2021-06-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (72 commits) sched/doc: Update the CPU capacity asymmetry bits sched/topology: Rework CPU capacity asymmetry detection sched/core: Introduce SD_ASYM_CPUCAPACITY_FULL sched_domain flag psi: Fix race between psi_trigger_create/destroy sched/fair: Introduce the burstable CFS controller sched/uclamp: Fix uclamp_tg_restrict() sched/rt: Fix Deadline utilization tracking during policy change sched/rt: Fix RT utilization tracking during policy change sched: Change task_struct::state sched,arch: Remove unused TASK_STATE offsets sched,timer: Use __set_current_state() sched: Add get_current_state() sched,perf,kvm: Fix preemption condition sched: Introduce task_is_running() sched: Unbreak wakeups sched/fair: Age the average idle time sched/cpufreq: Consider reduced CPU capacity in energy calculation sched/fair: Take thermal pressure into account while estimating energy thermal/cpufreq_cooling: Update offline CPUs per-cpu thermal_pressure sched/fair: Return early from update_tg_cfs_load() if delta == 0 ...
Diffstat (limited to 'kernel/sched/topology.c')
-rw-r--r--kernel/sched/topology.c213
1 files changed, 133 insertions, 80 deletions
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 55a0a243e871..b77ad49dc14f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -467,7 +467,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
struct root_domain *old_rd = NULL;
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
if (rq->rd) {
old_rd = rq->rd;
@@ -493,7 +493,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
@@ -675,7 +675,7 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
- sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
+ sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}
@@ -1267,6 +1267,116 @@ next:
}
/*
+ * Asymmetric CPU capacity bits
+ */
+struct asym_cap_data {
+ struct list_head link;
+ unsigned long capacity;
+ unsigned long cpus[];
+};
+
+/*
+ * Set of available CPUs grouped by their corresponding capacities
+ * Each list entry contains a CPU mask reflecting CPUs that share the same
+ * capacity.
+ * The lifespan of data is unlimited.
+ */
+static LIST_HEAD(asym_cap_list);
+
+#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+
+/*
+ * Verify whether there is any CPU capacity asymmetry in a given sched domain.
+ * Provides sd_flags reflecting the asymmetry scope.
+ */
+static inline int
+asym_cpu_capacity_classify(const struct cpumask *sd_span,
+ const struct cpumask *cpu_map)
+{
+ struct asym_cap_data *entry;
+ int count = 0, miss = 0;
+
+ /*
+ * Count how many unique CPU capacities this domain spans across
+ * (compare sched_domain CPUs mask with ones representing available
+ * CPUs capacities). Take into account CPUs that might be offline:
+ * skip those.
+ */
+ list_for_each_entry(entry, &asym_cap_list, link) {
+ if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
+ ++count;
+ else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
+ ++miss;
+ }
+
+ WARN_ON_ONCE(!count && !list_empty(&asym_cap_list));
+
+ /* No asymmetry detected */
+ if (count < 2)
+ return 0;
+ /* Some of the available CPU capacity values have not been detected */
+ if (miss)
+ return SD_ASYM_CPUCAPACITY;
+
+ /* Full asymmetry */
+ return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL;
+
+}
+
+static inline void asym_cpu_capacity_update_data(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+ struct asym_cap_data *entry = NULL;
+
+ list_for_each_entry(entry, &asym_cap_list, link) {
+ if (capacity == entry->capacity)
+ goto done;
+ }
+
+ entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
+ if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
+ return;
+ entry->capacity = capacity;
+ list_add(&entry->link, &asym_cap_list);
+done:
+ __cpumask_set_cpu(cpu, cpu_capacity_span(entry));
+}
+
+/*
+ * Build-up/update list of CPUs grouped by their capacities
+ * An update requires explicit request to rebuild sched domains
+ * with state indicating CPU topology changes.
+ */
+static void asym_cpu_capacity_scan(void)
+{
+ struct asym_cap_data *entry, *next;
+ int cpu;
+
+ list_for_each_entry(entry, &asym_cap_list, link)
+ cpumask_clear(cpu_capacity_span(entry));
+
+ for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN))
+ asym_cpu_capacity_update_data(cpu);
+
+ list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
+ if (cpumask_empty(cpu_capacity_span(entry))) {
+ list_del(&entry->link);
+ kfree(entry);
+ }
+ }
+
+ /*
+ * Only one capacity value has been detected i.e. this system is symmetric.
+ * No need to keep this data around.
+ */
+ if (list_is_singular(&asym_cap_list)) {
+ entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
+ list_del(&entry->link);
+ kfree(entry);
+ }
+}
+
+/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
@@ -1399,11 +1509,12 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map,
- struct sched_domain *child, int dflags, int cpu)
+ struct sched_domain *child, int cpu)
{
struct sd_data *sdd = &tl->data;
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
+ struct cpumask *sd_span;
#ifdef CONFIG_NUMA
/*
@@ -1420,9 +1531,6 @@ sd_init(struct sched_domain_topology_level *tl,
"wrong sd_flags in topology description\n"))
sd_flags &= TOPOLOGY_SD_FLAGS;
- /* Apply detected topology flags */
- sd_flags |= dflags;
-
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
@@ -1454,13 +1562,19 @@ sd_init(struct sched_domain_topology_level *tl,
#endif
};
- cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- sd_id = cpumask_first(sched_domain_span(sd));
+ sd_span = sched_domain_span(sd);
+ cpumask_and(sd_span, cpu_map, tl->mask(cpu));
+ sd_id = cpumask_first(sd_span);
+
+ sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
+
+ WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
+ (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
+ "CPU capacity asymmetry not supported on SMT\n");
/*
* Convert topological properties into behaviour.
*/
-
/* Don't attempt to spread across CPUs of different capacities. */
if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
sd->child->flags &= ~SD_PREFER_SIBLING;
@@ -1926,9 +2040,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *child, int dflags, int cpu)
+ struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
if (child) {
sd->level = child->level + 1;
@@ -1991,65 +2105,6 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl,
}
/*
- * Find the sched_domain_topology_level where all CPU capacities are visible
- * for all CPUs.
- */
-static struct sched_domain_topology_level
-*asym_cpu_capacity_level(const struct cpumask *cpu_map)
-{
- int i, j, asym_level = 0;
- bool asym = false;
- struct sched_domain_topology_level *tl, *asym_tl = NULL;
- unsigned long cap;
-
- /* Is there any asymmetry? */
- cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
-
- for_each_cpu(i, cpu_map) {
- if (arch_scale_cpu_capacity(i) != cap) {
- asym = true;
- break;
- }
- }
-
- if (!asym)
- return NULL;
-
- /*
- * Examine topology from all CPU's point of views to detect the lowest
- * sched_domain_topology_level where a highest capacity CPU is visible
- * to everyone.
- */
- for_each_cpu(i, cpu_map) {
- unsigned long max_capacity = arch_scale_cpu_capacity(i);
- int tl_id = 0;
-
- for_each_sd_topology(tl) {
- if (tl_id < asym_level)
- goto next_level;
-
- for_each_cpu_and(j, tl->mask(i), cpu_map) {
- unsigned long capacity;
-
- capacity = arch_scale_cpu_capacity(j);
-
- if (capacity <= max_capacity)
- continue;
-
- max_capacity = capacity;
- asym_level = tl_id;
- asym_tl = tl;
- }
-next_level:
- tl_id++;
- }
- }
-
- return asym_tl;
-}
-
-
-/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
@@ -2061,7 +2116,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM;
- struct sched_domain_topology_level *tl_asym;
bool has_asym = false;
if (WARN_ON(cpumask_empty(cpu_map)))
@@ -2071,24 +2125,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (alloc_state != sa_rootdomain)
goto error;
- tl_asym = asym_cpu_capacity_level(cpu_map);
-
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
- int dflags = 0;
sd = NULL;
for_each_sd_topology(tl) {
- if (tl == tl_asym) {
- dflags |= SD_ASYM_CPUCAPACITY;
- has_asym = true;
- }
if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
goto error;
- sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+
+ has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
@@ -2217,6 +2266,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
arch_update_cpu_topology();
+ asym_cpu_capacity_scan();
ndoms_cur = 1;
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
@@ -2299,6 +2349,9 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
/* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology();
+ /* Trigger rebuilding CPU capacity asymmetry data */
+ if (new_topology)
+ asym_cpu_capacity_scan();
if (!doms_new) {
WARN_ON_ONCE(dattr_new);