-rw-r--r-- | kernel/sched/ext.c | 136
1 file changed, 119 insertions, 17 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 6705c2e67c99..fc7f15eefe54 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -870,6 +870,11 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
 
+#ifdef CONFIG_SMP
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+#endif
+
 static struct static_key_false scx_has_op[SCX_OPI_END] =
         { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
 
@@ -3124,31 +3129,79 @@ found:
                 goto retry;
 }
 
-#ifdef CONFIG_SCHED_MC
 /*
- * Return the cpumask of CPUs usable by task @p in the same LLC domain of @cpu,
- * or NULL if the LLC domain cannot be determined.
+ * Initialize topology-aware scheduling.
+ *
+ * Detect if the system has multiple LLC or multiple NUMA domains and enable
+ * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
+ * selection policy.
  */
-static const struct cpumask *llc_domain(const struct task_struct *p, s32 cpu)
+static void update_selcpu_topology(void)
 {
-        struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, cpu));
-        const struct cpumask *llc_cpus = sd ? sched_domain_span(sd) : NULL;
+        bool enable_llc = false, enable_numa = false;
+        struct sched_domain *sd;
+        const struct cpumask *cpus;
+        s32 cpu = cpumask_first(cpu_online_mask);
 
         /*
-         * Return the LLC domain only if the task is allowed to run on all
-         * CPUs.
+         * We only need to check the NUMA node and LLC domain of the first
+         * available CPU to determine if they cover all CPUs.
+         *
+         * If all CPUs belong to the same NUMA node or share the same LLC
+         * domain, enabling NUMA or LLC optimizations is unnecessary.
+         * Otherwise, these optimizations can be enabled.
          */
-        return p->nr_cpus_allowed == nr_cpu_ids ? llc_cpus : NULL;
-}
-#else /* CONFIG_SCHED_MC */
-static inline const struct cpumask *llc_domain(struct task_struct *p, s32 cpu)
-{
-        return NULL;
+        rcu_read_lock();
+        sd = rcu_dereference(per_cpu(sd_llc, cpu));
+        if (sd) {
+                cpus = sched_domain_span(sd);
+                if (cpumask_weight(cpus) < num_possible_cpus())
+                        enable_llc = true;
+        }
+        sd = highest_flag_domain(cpu, SD_NUMA);
+        if (sd) {
+                cpus = sched_group_span(sd->groups);
+                if (cpumask_weight(cpus) < num_possible_cpus())
+                        enable_numa = true;
+        }
+        rcu_read_unlock();
+
+        pr_debug("sched_ext: LLC idle selection %s\n",
+                 enable_llc ? "enabled" : "disabled");
+        pr_debug("sched_ext: NUMA idle selection %s\n",
+                 enable_numa ? "enabled" : "disabled");
+
+        if (enable_llc)
+                static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+        else
+                static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+        if (enable_numa)
+                static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+        else
+                static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
 }
-#endif /* CONFIG_SCHED_MC */
 
 /*
- * Built-in cpu idle selection policy.
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ *   - always prioritize CPUs from fully idle cores (both logical CPUs are
+ *     idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ *   - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ *     branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ *   - if the above conditions aren't met, pick a CPU that shares the same LLC
+ *     to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node, if enabled:
+ *   - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * Step 3 and 4 are performed only if the system has, respectively, multiple
+ * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa).
  *
  * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
  * we never call ops.select_cpu() for them, see select_task_rq().
@@ -3156,7 +3209,8 @@ static inline const struct cpumask *llc_domain(struct task_struct *p, s32 cpu)
 static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
                               u64 wake_flags, bool *found)
 {
-        const struct cpumask *llc_cpus = llc_domain(p, prev_cpu);
+        const struct cpumask *llc_cpus = NULL;
+        const struct cpumask *numa_cpus = NULL;
         s32 cpu;
 
         *found = false;
@@ -3167,6 +3221,30 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
         }
 
         /*
+         * Determine the scheduling domain only if the task is allowed to run
+         * on all CPUs.
+         *
+         * This is done primarily for efficiency, as it avoids the overhead of
+         * updating a cpumask every time we need to select an idle CPU (which
+         * can be costly in large SMP systems), but it also aligns logically:
+         * if a task's scheduling domain is restricted by user-space (through
+         * CPU affinity), the task will simply use the flat scheduling domain
+         * defined by user-space.
+         */
+        if (p->nr_cpus_allowed >= num_possible_cpus()) {
+                if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+                        numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
+
+                if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+                        struct sched_domain *sd;
+
+                        sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
+                        if (sd)
+                                llc_cpus = sched_domain_span(sd);
+                }
+        }
+
+        /*
          * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
          */
         if (wake_flags & SCX_WAKE_SYNC) {
@@ -3227,6 +3305,15 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
                 }
 
                 /*
+                 * Search for any fully idle core in the same NUMA node.
+                 */
+                if (numa_cpus) {
+                        cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
+                        if (cpu >= 0)
+                                goto cpu_found;
+                }
+
+                /*
                  * Search for any full idle core usable by the task.
                  */
                 cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
@@ -3252,6 +3339,15 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
         }
 
         /*
+         * Search for any idle CPU in the same NUMA node.
+         */
+        if (numa_cpus) {
+                cpu = scx_pick_idle_cpu(numa_cpus, 0);
+                if (cpu >= 0)
+                        goto cpu_found;
+        }
+
+        /*
          * Search for any idle CPU usable by the task.
          */
         cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
@@ -3383,6 +3479,9 @@ static void handle_hotplug(struct rq *rq, bool online)
 
         atomic_long_inc(&scx_hotplug_seq);
 
+        if (scx_enabled())
+                update_selcpu_topology();
+
         if (online && SCX_HAS_OP(cpu_online))
                 SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
         else if (!online && SCX_HAS_OP(cpu_offline))
@@ -5202,6 +5301,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
                         static_branch_enable_cpuslocked(&scx_has_op[i]);
 
         check_hotplug_seq(ops);
+#ifdef CONFIG_SMP
+        update_selcpu_topology();
+#endif
         cpus_read_unlock();
 
         ret = validate_ops(ops);
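
For readers skimming the diff, the cascade that scx_select_cpu_dfl() now walks can be seen in isolation from the kernel. The following is a minimal user-space sketch of that selection order only, under stated assumptions: struct cpuset, pick_idle() and select_cpu_dfl() here are illustrative stand-ins, not the kernel's cpumask API or scx_pick_idle_cpu(), and the WAKE_SYNC and fully-idle-core (SMT) passes from the patch are omitted.

/*
 * Simplified, self-contained model of the default idle-CPU selection
 * order added by this patch: prev_cpu -> same LLC -> same NUMA node ->
 * any allowed CPU. All types and helpers below are assumptions made
 * for illustration; they are not kernel interfaces.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

struct cpuset {
        bool cpu[NR_CPUS];      /* stand-in for struct cpumask */
};

/* Stand-in for scx_pick_idle_cpu(): first idle CPU in @allowed, or -1. */
static int pick_idle(const struct cpuset *allowed, const bool *idle)
{
        for (int c = 0; c < NR_CPUS; c++)
                if (allowed->cpu[c] && idle[c])
                        return c;
        return -1;
}

/* @llc_cpus / @numa_cpus may be NULL, mirroring the single-LLC /
 * single-node case where the corresponding static branch stays off. */
static int select_cpu_dfl(int prev_cpu, const bool *idle,
                          const struct cpuset *allowed,
                          const struct cpuset *llc_cpus,
                          const struct cpuset *numa_cpus)
{
        int cpu;

        /* Reuse the previous CPU if it is still idle (cache warmth). */
        if (allowed->cpu[prev_cpu] && idle[prev_cpu])
                return prev_cpu;

        /* Any idle CPU sharing the LLC, if multiple LLC domains exist. */
        if (llc_cpus && (cpu = pick_idle(llc_cpus, idle)) >= 0)
                return cpu;

        /* Any idle CPU in the same NUMA node, if multiple nodes exist. */
        if (numa_cpus && (cpu = pick_idle(numa_cpus, idle)) >= 0)
                return cpu;

        /* Fall back to any idle CPU the task is allowed to run on. */
        return pick_idle(allowed, idle);
}

int main(void)
{
        bool idle[NR_CPUS]    = { false, false, false, true, false, false, true, false };
        struct cpuset allowed = { .cpu = { true, true, true, true, true, true, true, true } };
        struct cpuset llc     = { .cpu = { true, true, true, true } };             /* CPUs 0-3: prev_cpu's LLC */
        struct cpuset numa    = { .cpu = { true, true, true, true, true, true } }; /* CPUs 0-5: prev_cpu's node */

        printf("selected CPU: %d\n",
               select_cpu_dfl(0, idle, &allowed, &llc, &numa));
        return 0;
}

Built with any C99 compiler, the example prints "selected CPU: 3": the previous CPU is busy, so the first idle CPU inside the (assumed) LLC mask wins before the NUMA-wide and global fallbacks are tried, which is the locality ordering the patch comment describes.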