diff options
author | 2025-03-24 17:23:48 -0700 | |
---|---|---|
committer | 2025-03-24 17:23:48 -0700 | |
commit | bcb044256d3f5d9f5bb61d1eac6492f77883bd60 (patch) | |
tree | 0c0c48beef0742ac47a5e136fe0ecbcea66b6871 /kernel | |
parent | Merge tag 'cgroup-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup (diff) | |
parent | sched_ext: idle: Refactor scx_select_cpu_dfl() (diff) | |
download | wireguard-linux-bcb044256d3f5d9f5bb61d1eac6492f77883bd60.tar.xz wireguard-linux-bcb044256d3f5d9f5bb61d1eac6492f77883bd60.zip |
Merge tag 'sched_ext-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext updates from Tejun Heo:
- Add mechanism to count and report internal events. This significantly
improves visibility on subtle corner conditions.
- The default idle CPU selection logic is revamped and improved in
multiple ways including being made topology aware.
- sched_ext was disabling ttwu_queue for simplicity, which can be
costly when hardware topology is more complex. Implement
SCX_OPS_ALLOWED_QUEUED_WAKEUP so that BPF schedulers can selectively
enable ttwu_queue.
- tools/sched_ext updates to improve compatibility among others.
- Other misc updates and fixes.
- sched_ext/for-6.14-fixes were pulled a few times to receive
prerequisite fixes and resolve conflicts.
* tag 'sched_ext-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (42 commits)
sched_ext: idle: Refactor scx_select_cpu_dfl()
sched_ext: idle: Honor idle flags in the built-in idle selection policy
sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()
sched_ext: Add trace point to track sched_ext core events
sched_ext: Change the event type from u64 to s64
sched_ext: Documentation: add task lifecycle summary
tools/sched_ext: Provide a compatible helper for scx_bpf_events()
selftests/sched_ext: Add NUMA-aware scheduler test
tools/sched_ext: Provide consistent access to scx flags
sched_ext: idle: Fix scx_bpf_pick_any_cpu_node() behavior
sched_ext: idle: Introduce scx_bpf_nr_node_ids()
sched_ext: idle: Introduce node-aware idle cpu kfunc helpers
sched_ext: idle: Per-node idle cpumasks
sched_ext: idle: Introduce SCX_OPS_BUILTIN_IDLE_PER_NODE
sched_ext: idle: Make idle static keys private
sched/topology: Introduce for_each_node_numadist() iterator
mm/numa: Introduce nearest_node_nodemask()
nodemask: numa: reorganize inclusion path
nodemask: add nodes_copy()
tools/sched_ext: Sync with scx repo
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched/build_policy.c | 1 | ||||
-rw-r--r-- | kernel/sched/core.c | 9 | ||||
-rw-r--r-- | kernel/sched/ext.c | 1085 | ||||
-rw-r--r-- | kernel/sched/ext.h | 10 | ||||
-rw-r--r-- | kernel/sched/ext_idle.c | 1171 | ||||
-rw-r--r-- | kernel/sched/ext_idle.h | 35 |
6 files changed, 1547 insertions, 764 deletions
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index fae1f5c921eb..72d97aa8b726 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -61,6 +61,7 @@ #ifdef CONFIG_SCHED_CLASS_EXT # include "ext.c" +# include "ext_idle.c" #endif #include "syscalls.c" diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 042351c7afce..f6840c33a296 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3922,13 +3922,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { - /* - * The BPF scheduler may depend on select_task_rq() being invoked during - * wakeups. In addition, @p may end up executing on a different CPU - * regardless of what happens in the wakeup path making the ttwu_queue - * optimization less meaningful. Skip if on SCX. - */ - if (task_on_scx(p)) + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ + if (!scx_allow_ttwu_queue(p)) return false; /* diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7b9dfee858e7..06561d6717c9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6,6 +6,9 @@ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> * Copyright (c) 2022 David Vernet <dvernet@meta.com> */ +#include <linux/btf_ids.h> +#include "ext_idle.h" + #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { @@ -93,7 +96,7 @@ enum scx_ops_flags { /* * Keep built-in idle tracking even if ops.update_idle() is implemented. */ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, /* * By default, if there are no other task to run on the CPU, ext core @@ -101,7 +104,7 @@ enum scx_ops_flags { * flag is specified, such tasks are passed to ops.enqueue() with * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. */ - SCX_OPS_ENQ_LAST = 1LLU << 1, + SCX_OPS_ENQ_LAST = 1LLU << 1, /* * An exiting task may schedule after PF_EXITING is set. In such cases, @@ -114,13 +117,13 @@ enum scx_ops_flags { * depend on pid lookups and wants to handle these tasks directly, the * following flag can be used. */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, + SCX_OPS_ENQ_EXITING = 1LLU << 2, /* * If set, only tasks with policy set to SCHED_EXT are attached to * sched_ext. If clear, SCHED_NORMAL tasks are also included. */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, /* * A migration disabled task can only execute on its current CPU. By @@ -133,7 +136,29 @@ enum scx_ops_flags { * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr * and thus may disagree with cpumask_weight(p->cpus_ptr). */ - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes + * ops.enqueue() on the ops.select_cpu() selected or the wakee's + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline + * transfers. When this optimization is enabled, ops.select_cpu() is + * skipped in some cases (when racing against the wakee switching out). + * As the BPF scheduler may depend on ops.select_cpu() being invoked + * during wakeups, queued wakeup is disabled by default. + * + * If this ops flag is set, queued wakeup optimization is enabled and + * the BPF scheduler must be able to handle ops.enqueue() invoked on the + * wakee's CPU without preceding ops.select_cpu() even for tasks which + * may be executed on multiple CPUs. + */ + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, + + /* + * If set, enable per-node idle cpumasks. If clear, use a single global + * flat idle cpumask. + */ + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, /* * CPU cgroup support flags @@ -144,7 +169,9 @@ enum scx_ops_flags { SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | SCX_OPS_HAS_CGROUP_WEIGHT, }; @@ -779,6 +806,7 @@ enum scx_deq_flags { enum scx_pick_idle_cpu_flags { SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ }; enum scx_kick_flags { @@ -894,16 +922,11 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all); static struct sched_ext_ops scx_ops; static bool scx_warned_zero_slice; +DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); - -#ifdef CONFIG_SMP -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); -#endif static struct static_key_false scx_has_op[SCX_OPI_END] = { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; @@ -938,21 +961,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* idle tracking */ -#ifdef CONFIG_SMP -#ifdef CONFIG_CPUMASK_OFFSTACK -#define CL_ALIGNED_IF_ONSTACK -#else -#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -#endif - -static struct { - cpumask_var_t cpu; - cpumask_var_t smt; -} idle_masks CL_ALIGNED_IF_ONSTACK; - -#endif /* CONFIG_SMP */ - /* for %SCX_KICK_WAIT */ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; @@ -1473,6 +1481,117 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return p; } +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + s64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. In this case, the task is bounced to the global DSQ. + */ + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + s64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + s64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. + */ + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + + /* + * The total number of tasks enqueued (or pick_task-ed) with a + * default time slice (SCX_SLICE_DFL). + */ + s64 SCX_EV_ENQ_SLICE_DFL; + + /* + * The total duration of bypass modes in nanoseconds. + */ + s64 SCX_EV_BYPASS_DURATION; + + /* + * The number of tasks dispatched in the bypassing mode. + */ + s64 SCX_EV_BYPASS_DISPATCH; + + /* + * The number of times the bypassing mode has been activated. + */ + s64 SCX_EV_BYPASS_ACTIVATE; +}; + +/* + * The event counter is organized by a per-CPU variable to minimize the + * accounting overhead without synchronization. A system-wide view on the + * event counter is constructed when requested by scx_bpf_get_event_stat(). + */ +static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); + +/** + * scx_add_event - Increase an event counter for 'name' by 'cnt' + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This can be used when preemption is not disabled. + */ +#define scx_add_event(name, cnt) do { \ + this_cpu_add(event_stats_cpu.name, cnt); \ + trace_sched_ext_event(#name, cnt); \ +} while(0) + +/** + * __scx_add_event - Increase an event counter for 'name' by 'cnt' + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This should be used only when preemption is disabled. + */ +#define __scx_add_event(name, cnt) do { \ + __this_cpu_add(event_stats_cpu.name, cnt); \ + trace_sched_ext_event(#name, cnt); \ +} while(0) + +/** + * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' + * @dst_e: destination event stats + * @src_e: source event stats + * @kind: a kind of event to be aggregated + */ +#define scx_agg_event(dst_e, src_e, kind) do { \ + (dst_e)->kind += READ_ONCE((src_e)->kind); \ +} while(0) + +/** + * scx_dump_event - Dump an event 'kind' in 'events' to 's' + * @s: output seq_buf + * @events: event stats + * @kind: a kind of event to dump + */ +#define scx_dump_event(s, events, kind) do { \ + dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ +} while (0) + + +static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz); + static enum scx_ops_enable_state scx_ops_enable_state(void) { return atomic_read(&scx_ops_enable_state_var); @@ -2018,21 +2137,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (scx_rq_bypassing(rq)) + if (scx_rq_bypassing(rq)) { + __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); goto global; + } if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; /* see %SCX_OPS_ENQ_EXITING */ if (!static_branch_unlikely(&scx_ops_enq_exiting) && - unlikely(p->flags & PF_EXITING)) + unlikely(p->flags & PF_EXITING)) { + __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1); goto local; + } /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && - is_migration_disabled(p)) + is_migration_disabled(p)) { + __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); goto local; + } if (!SCX_HAS_OP(enqueue)) goto global; @@ -2072,6 +2197,7 @@ local: */ touch_core_sched(rq, p); p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); local_norefill: dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); return; @@ -2079,6 +2205,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); dispatch_enqueue(find_global_dsq(p), p, enq_flags); } @@ -2150,6 +2277,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags do_enqueue_task(rq, p, enq_flags, sticky_cpu); out: rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; + + if ((enq_flags & SCX_ENQ_CPU_SELECTED) && + unlikely(cpu_of(rq) != p->scx.selected_cpu)) + __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1); } static void ops_dequeue(struct task_struct *p, u64 deq_flags) @@ -2337,7 +2468,7 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, * The caller must ensure that @p and @rq are on different CPUs. */ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, - bool trigger_error) + bool enforce) { int cpu = cpu_of(rq); @@ -2356,7 +2487,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, * easily be masked if task_allowed_on_cpu() is done first. */ if (unlikely(is_migration_disabled(p))) { - if (trigger_error) + if (enforce) scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", p->comm, p->pid, task_cpu(p), cpu); return false; @@ -2369,14 +2500,17 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, * picked CPU is outside the allowed mask. */ if (!task_allowed_on_cpu(p, cpu)) { - if (trigger_error) + if (enforce) scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", cpu, p->comm, p->pid); return false; } - if (!scx_rq_online(rq)) + if (!scx_rq_online(rq)) { + if (enforce) + __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; + } return true; } @@ -2446,7 +2580,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } #else /* CONFIG_SMP */ static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } -static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } +static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ @@ -2893,6 +3027,7 @@ no_tasks: if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || scx_rq_bypassing(rq))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; + __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks; } rq->scx.flags &= ~SCX_RQ_IN_BALANCE; @@ -3159,8 +3294,10 @@ static struct task_struct *pick_task_scx(struct rq *rq) */ if (keep_prev) { p = prev; - if (!p->scx.slice) + if (!p->scx.slice) { p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + } } else { p = first_local_task(rq); if (!p) { @@ -3176,6 +3313,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); } } @@ -3220,418 +3358,10 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, #ifdef CONFIG_SMP -static bool test_and_clear_cpu_idle(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - /* - * SMT mask should be cleared whether we can claim @cpu or not. The SMT - * cluster is not wholly idle either way. This also prevents - * scx_pick_idle_cpu() from getting caught in an infinite loop. - */ - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - /* - * If offline, @cpu is not its own sibling and - * scx_pick_idle_cpu() can get caught in an infinite loop as - * @cpu is never cleared from idle_masks.smt. Ensure that @cpu - * is eventually cleared. - * - * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to - * reduce memory writes, which may help alleviate cache - * coherence pressure. - */ - if (cpumask_intersects(smt, idle_masks.smt)) - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - else if (cpumask_test_cpu(cpu, idle_masks.smt)) - __cpumask_clear_cpu(cpu, idle_masks.smt); - } -#endif - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); -} - -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -{ - int cpu; - -retry: - if (sched_smt_active()) { - cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); - if (cpu < nr_cpu_ids) - goto found; - - if (flags & SCX_PICK_IDLE_CORE) - return -EBUSY; - } - - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); - if (cpu >= nr_cpu_ids) - return -EBUSY; - -found: - if (test_and_clear_cpu_idle(cpu)) - return cpu; - else - goto retry; -} - -/* - * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC - * domain is not defined). - */ -static unsigned int llc_weight(s32 cpu) -{ - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) - return 0; - - return sd->span_weight; -} - -/* - * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC - * domain is not defined). - */ -static struct cpumask *llc_span(s32 cpu) -{ - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) - return 0; - - return sched_domain_span(sd); -} - -/* - * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the - * NUMA domain is not defined). - */ -static unsigned int numa_weight(s32 cpu) -{ - struct sched_domain *sd; - struct sched_group *sg; - - sd = rcu_dereference(per_cpu(sd_numa, cpu)); - if (!sd) - return 0; - sg = sd->groups; - if (!sg) - return 0; - - return sg->group_weight; -} - -/* - * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA - * domain is not defined). - */ -static struct cpumask *numa_span(s32 cpu) -{ - struct sched_domain *sd; - struct sched_group *sg; - - sd = rcu_dereference(per_cpu(sd_numa, cpu)); - if (!sd) - return NULL; - sg = sd->groups; - if (!sg) - return NULL; - - return sched_group_span(sg); -} - -/* - * Return true if the LLC domains do not perfectly overlap with the NUMA - * domains, false otherwise. - */ -static bool llc_numa_mismatch(void) -{ - int cpu; - - /* - * We need to scan all online CPUs to verify whether their scheduling - * domains overlap. - * - * While it is rare to encounter architectures with asymmetric NUMA - * topologies, CPU hotplugging or virtualized environments can result - * in asymmetric configurations. - * - * For example: - * - * NUMA 0: - * - LLC 0: cpu0..cpu7 - * - LLC 1: cpu8..cpu15 [offline] - * - * NUMA 1: - * - LLC 0: cpu16..cpu23 - * - LLC 1: cpu24..cpu31 - * - * In this case, if we only check the first online CPU (cpu0), we might - * incorrectly assume that the LLC and NUMA domains are fully - * overlapping, which is incorrect (as NUMA 1 has two distinct LLC - * domains). - */ - for_each_online_cpu(cpu) - if (llc_weight(cpu) != numa_weight(cpu)) - return true; - - return false; -} - -/* - * Initialize topology-aware scheduling. - * - * Detect if the system has multiple LLC or multiple NUMA domains and enable - * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle - * selection policy. - * - * Assumption: the kernel's internal topology representation assumes that each - * CPU belongs to a single LLC domain, and that each LLC domain is entirely - * contained within a single NUMA node. - */ -static void update_selcpu_topology(void) -{ - bool enable_llc = false, enable_numa = false; - unsigned int nr_cpus; - s32 cpu = cpumask_first(cpu_online_mask); - - /* - * Enable LLC domain optimization only when there are multiple LLC - * domains among the online CPUs. If all online CPUs are part of a - * single LLC domain, the idle CPU selection logic can choose any - * online CPU without bias. - * - * Note that it is sufficient to check the LLC domain of the first - * online CPU to determine whether a single LLC domain includes all - * CPUs. - */ - rcu_read_lock(); - nr_cpus = llc_weight(cpu); - if (nr_cpus > 0) { - if (nr_cpus < num_online_cpus()) - enable_llc = true; - pr_debug("sched_ext: LLC=%*pb weight=%u\n", - cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); - } - - /* - * Enable NUMA optimization only when there are multiple NUMA domains - * among the online CPUs and the NUMA domains don't perfectly overlaps - * with the LLC domains. - * - * If all CPUs belong to the same NUMA node and the same LLC domain, - * enabling both NUMA and LLC optimizations is unnecessary, as checking - * for an idle CPU in the same domain twice is redundant. - */ - nr_cpus = numa_weight(cpu); - if (nr_cpus > 0) { - if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) - enable_numa = true; - pr_debug("sched_ext: NUMA=%*pb weight=%u\n", - cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); - } - rcu_read_unlock(); - - pr_debug("sched_ext: LLC idle selection %s\n", - str_enabled_disabled(enable_llc)); - pr_debug("sched_ext: NUMA idle selection %s\n", - str_enabled_disabled(enable_numa)); - - if (enable_llc) - static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); - if (enable_numa) - static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); -} - -/* - * Built-in CPU idle selection policy: - * - * 1. Prioritize full-idle cores: - * - always prioritize CPUs from fully idle cores (both logical CPUs are - * idle) to avoid interference caused by SMT. - * - * 2. Reuse the same CPU: - * - prefer the last used CPU to take advantage of cached data (L1, L2) and - * branch prediction optimizations. - * - * 3. Pick a CPU within the same LLC (Last-Level Cache): - * - if the above conditions aren't met, pick a CPU that shares the same LLC - * to maintain cache locality. - * - * 4. Pick a CPU within the same NUMA node, if enabled: - * - choose a CPU from the same NUMA node to reduce memory access latency. - * - * 5. Pick any idle CPU usable by the task. - * - * Step 3 and 4 are performed only if the system has, respectively, multiple - * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and - * scx_selcpu_topo_numa). - * - * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because - * we never call ops.select_cpu() for them, see select_task_rq(). - */ -static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *found) -{ - const struct cpumask *llc_cpus = NULL; - const struct cpumask *numa_cpus = NULL; - s32 cpu; - - *found = false; - - /* - * This is necessary to protect llc_cpus. - */ - rcu_read_lock(); - - /* - * Determine the scheduling domain only if the task is allowed to run - * on all CPUs. - * - * This is done primarily for efficiency, as it avoids the overhead of - * updating a cpumask every time we need to select an idle CPU (which - * can be costly in large SMP systems), but it also aligns logically: - * if a task's scheduling domain is restricted by user-space (through - * CPU affinity), the task will simply use the flat scheduling domain - * defined by user-space. - */ - if (p->nr_cpus_allowed >= num_possible_cpus()) { - if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) - numa_cpus = numa_span(prev_cpu); - - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) - llc_cpus = llc_span(prev_cpu); - } - - /* - * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. - */ - if (wake_flags & SCX_WAKE_SYNC) { - cpu = smp_processor_id(); - - /* - * If the waker's CPU is cache affine and prev_cpu is idle, - * then avoid a migration. - */ - if (cpus_share_cache(cpu, prev_cpu) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * If the waker's local DSQ is empty, and the system is under - * utilized, try to wake up @p to the local DSQ of the waker. - * - * Checking only for an empty local DSQ is insufficient as it - * could give the wakee an unfair advantage when the system is - * oversaturated. - * - * Checking only for the presence of idle CPUs is also - * insufficient as the local DSQ of the waker could have tasks - * piled up on it even if there is an idle core elsewhere on - * the system. - */ - if (!cpumask_empty(idle_masks.cpu) && - !(current->flags & PF_EXITING) && - cpu_rq(cpu)->scx.local_dsq.nr == 0) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) - goto cpu_found; - } - } - - /* - * If CPU has SMT, any wholly idle CPU is likely a better pick than - * partially idle @prev_cpu. - */ - if (sched_smt_active()) { - /* - * Keep using @prev_cpu if it's part of a fully idle core. - */ - if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any fully idle core in the same LLC domain. - */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any fully idle core in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any full idle core usable by the task. - */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Use @prev_cpu if it's idle. - */ - if (test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any idle CPU in the same LLC domain. - */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU usable by the task. - */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - goto cpu_found; - - rcu_read_unlock(); - return prev_cpu; - -cpu_found: - rcu_read_unlock(); - - *found = true; - return cpu; -} - static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { + bool rq_bypass; + /* * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it * can be a good migration opportunity with low cache and memory @@ -3645,7 +3375,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { + rq_bypass = scx_rq_bypassing(task_rq(p)); + if (SCX_HAS_OP(select_cpu) && !rq_bypass) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3655,20 +3386,27 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, select_cpu, p, prev_cpu, wake_flags); + p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; if (ops_cpu_valid(cpu, "from ops.select_cpu()")) return cpu; else return prev_cpu; } else { - bool found; s32 cpu; - cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); - if (found) { + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + if (cpu >= 0) { p->scx.slice = SCX_SLICE_DFL; p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + } else { + cpu = prev_cpu; } + p->scx.selected_cpu = cpu; + + if (rq_bypass) + __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); return cpu; } } @@ -3696,90 +3434,6 @@ static void set_cpus_allowed_scx(struct task_struct *p, (struct cpumask *)p->cpus_ptr); } -static void reset_idle_masks(void) -{ - /* - * Consider all online cpus idle. Should converge to the actual state - * quickly. - */ - cpumask_copy(idle_masks.cpu, cpu_online_mask); - cpumask_copy(idle_masks.smt, cpu_online_mask); -} - -static void update_builtin_idle(int cpu, bool idle) -{ - assign_cpu(cpu, idle_masks.cpu, idle); - -#ifdef CONFIG_SCHED_SMT - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - if (idle) { - /* - * idle_masks.smt handling is racy but that's fine as - * it's only for optimization and self-correcting. - */ - if (!cpumask_subset(smt, idle_masks.cpu)) - return; - cpumask_or(idle_masks.smt, idle_masks.smt, smt); - } else { - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - } - } -#endif -} - -/* - * Update the idle state of a CPU to @idle. - * - * If @do_notify is true, ops.update_idle() is invoked to notify the scx - * scheduler of an actual idle state transition (idle to busy or vice - * versa). If @do_notify is false, only the idle state in the idle masks is - * refreshed without invoking ops.update_idle(). - * - * This distinction is necessary, because an idle CPU can be "reserved" and - * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as - * busy even if no tasks are dispatched. In this case, the CPU may return - * to idle without a true state transition. Refreshing the idle masks - * without invoking ops.update_idle() ensures accurate idle state tracking - * while avoiding unnecessary updates and maintaining balanced state - * transitions. - */ -void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) -{ - int cpu = cpu_of(rq); - - lockdep_assert_rq_held(rq); - - /* - * Trigger ops.update_idle() only when transitioning from a task to - * the idle thread and vice versa. - * - * Idle transitions are indicated by do_notify being set to true, - * managed by put_prev_task_idle()/set_next_task_idle(). - */ - if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); - - /* - * Update the idle masks: - * - for real idle transitions (do_notify == true) - * - for idle-to-idle transitions (indicated by the previous task - * being the idle thread, managed by pick_task_idle()) - * - * Skip updating idle masks if the previous task is not the idle - * thread, since set_next_task_idle() has already handled it when - * transitioning from a task to the idle thread (calling this - * function with do_notify == true). - * - * In this way we can avoid updating the idle masks twice, - * unnecessarily. - */ - if (static_branch_likely(&scx_builtin_idle_enabled)) - if (do_notify || is_idle_task(rq->curr)) - update_builtin_idle(cpu, idle); -} - static void handle_hotplug(struct rq *rq, bool online) { int cpu = cpu_of(rq); @@ -3787,7 +3441,7 @@ static void handle_hotplug(struct rq *rq, bool online) atomic_long_inc(&scx_hotplug_seq); if (scx_enabled()) - update_selcpu_topology(); + scx_idle_update_selcpu_topology(&scx_ops); if (online && SCX_HAS_OP(cpu_online)) SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); @@ -3819,12 +3473,6 @@ static void rq_offline_scx(struct rq *rq) rq->scx.flags &= ~SCX_RQ_ONLINE; } -#else /* CONFIG_SMP */ - -static bool test_and_clear_cpu_idle(int cpu) { return false; } -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -static void reset_idle_masks(void) {} - #endif /* CONFIG_SMP */ static bool check_rq_for_timeouts(struct rq *rq) @@ -4749,8 +4397,33 @@ static ssize_t scx_attr_ops_show(struct kobject *kobj, } SCX_ATTR(ops); +#define scx_attr_event_show(buf, at, events, kind) ({ \ + sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ +}) + +static ssize_t scx_attr_events_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct scx_event_stats events; + int at = 0; + + scx_bpf_events(&events, sizeof(events)); + at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); + return at; +} +SCX_ATTR(events); + static struct attribute *scx_sched_attrs[] = { &scx_attr_ops.attr, + &scx_attr_events.attr, NULL, }; ATTRIBUTE_GROUPS(scx_sched); @@ -4862,6 +4535,8 @@ static void scx_clear_softlockup(void) static void scx_ops_bypass(bool bypass) { static DEFINE_RAW_SPINLOCK(bypass_lock); + static unsigned long bypass_timestamp; + int cpu; unsigned long flags; @@ -4871,11 +4546,15 @@ static void scx_ops_bypass(bool bypass) WARN_ON_ONCE(scx_ops_bypass_depth <= 0); if (scx_ops_bypass_depth != 1) goto unlock; + bypass_timestamp = ktime_get_ns(); + scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1); } else { scx_ops_bypass_depth--; WARN_ON_ONCE(scx_ops_bypass_depth < 0); if (scx_ops_bypass_depth != 0) goto unlock; + scx_add_event(SCX_EV_BYPASS_DURATION, + ktime_get_ns() - bypass_timestamp); } atomic_inc(&scx_ops_breather_depth); @@ -5095,11 +4774,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_allow_queued_wakeup); static_branch_disable(&scx_ops_enq_last); static_branch_disable(&scx_ops_enq_exiting); static_branch_disable(&scx_ops_enq_migration_disabled); static_branch_disable(&scx_ops_cpu_preempt); - static_branch_disable(&scx_builtin_idle_enabled); + scx_idle_disable(); synchronize_rcu(); if (ei->kind >= SCX_EXIT_ERROR) { @@ -5349,6 +5029,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) .at_jiffies = jiffies, }; struct seq_buf s; + struct scx_event_stats events; unsigned long flags; char *buf; int cpu; @@ -5457,6 +5138,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) rq_unlock(rq, &rf); } + dump_newline(&s); + dump_line(&s, "Event counters"); + dump_line(&s, "--------------"); + + scx_bpf_events(&events, sizeof(events)); + scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); + scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL); + scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); + scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); + scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); + if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), trunc_marker, sizeof(trunc_marker)); @@ -5546,6 +5242,16 @@ static int validate_ops(const struct sched_ext_ops *ops) return -EINVAL; } + /* + * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle + * selection policy to be enabled. + */ + if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && + (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); + return -EINVAL; + } + return 0; } @@ -5564,6 +5270,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_lock(&scx_ops_enable_mutex); + /* + * Clear event counters so a new scx scheduler gets + * fresh event counter values. + */ + for_each_possible_cpu(cpu) { + struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu); + memset(e, 0, sizeof(*e)); + } + if (!scx_ops_helper) { WRITE_ONCE(scx_ops_helper, scx_create_rt_helper("sched_ext_ops_helper")); @@ -5661,9 +5376,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable_cpuslocked(&scx_has_op[i]); check_hotplug_seq(ops); -#ifdef CONFIG_SMP - update_selcpu_topology(); -#endif + scx_idle_update_selcpu_topology(ops); + cpus_read_unlock(); ret = validate_ops(ops); @@ -5702,9 +5416,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) static_branch_enable(&scx_has_op[i]); + if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) + static_branch_enable(&scx_ops_allow_queued_wakeup); if (ops->flags & SCX_OPS_ENQ_LAST) static_branch_enable(&scx_ops_enq_last); - if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable(&scx_ops_enq_exiting); if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) @@ -5712,12 +5427,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); - if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { - reset_idle_masks(); - static_branch_enable(&scx_builtin_idle_enabled); - } else { - static_branch_disable(&scx_builtin_idle_enabled); - } + scx_idle_enable(ops); /* * Lock out forks, cgroup on/offlining and moves before opening the @@ -6356,10 +6066,8 @@ void __init init_sched_ext_class(void) SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -#ifdef CONFIG_SMP - BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); -#endif + scx_idle_init_masks(); + scx_kick_cpus_pnt_seqs = __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, __alignof__(scx_kick_cpus_pnt_seqs[0])); @@ -6367,15 +6075,16 @@ void __init init_sched_ext_class(void) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); + int n = cpu_to_node(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); @@ -6392,65 +6101,6 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. */ -#include <linux/btf_ids.h> - -__bpf_kfunc_start_defs(); - -static bool check_builtin_idle_enabled(void) -{ - if (static_branch_likely(&scx_builtin_idle_enabled)) - return true; - - scx_ops_error("built-in idle tracking is disabled"); - return false; -} - -/** - * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() - * @p: task_struct to select a CPU for - * @prev_cpu: CPU @p was on previously - * @wake_flags: %SCX_WAKE_* flags - * @is_idle: out parameter indicating whether the returned CPU is idle - * - * Can only be called from ops.select_cpu() if the built-in CPU selection is - * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). - * - * Returns the picked CPU with *@is_idle indicating whether the picked CPU is - * currently idle and thus a good candidate for direct dispatching. - */ -__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *is_idle) -{ - if (!ops_cpu_valid(prev_cpu, NULL)) - goto prev_cpu; - - if (!check_builtin_idle_enabled()) - goto prev_cpu; - - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) - goto prev_cpu; - -#ifdef CONFIG_SMP - return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#endif - -prev_cpu: - *is_idle = false; - return prev_cpu; -} - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) - -static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { - .owner = THIS_MODULE, - .set = &scx_kfunc_ids_select_cpu, -}; - static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) { if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) @@ -6972,8 +6622,12 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. + * + * Also skip re-enqueueing tasks that can only run on this + * CPU, as they would just be re-added to the same local + * DSQ without any benefit. */ - if (p->migration_pending) + if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) continue; dispatch_dequeue(rq, p); @@ -7470,6 +7124,16 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) } /** + * scx_bpf_nr_node_ids - Return the number of possible node IDs + * + * All valid node IDs in the system are smaller than the returned value. + */ +__bpf_kfunc u32 scx_bpf_nr_node_ids(void) +{ + return nr_node_ids; +} + +/** * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs * * All valid CPU IDs in the system are smaller than the returned value. @@ -7510,142 +7174,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) } /** - * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking - * per-CPU cpumask. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. - */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) -{ - if (!check_builtin_idle_enabled()) - return cpu_none_mask; - -#ifdef CONFIG_SMP - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, - * per-physical-core cpumask. Can be used to determine if an entire physical - * core is free. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. - */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) -{ - if (!check_builtin_idle_enabled()) - return cpu_none_mask; - -#ifdef CONFIG_SMP - if (sched_smt_active()) - return idle_masks.smt; - else - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to - * either the percpu, or SMT idle-tracking cpumask. - * @idle_mask: &cpumask to use - */ -__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) -{ - /* - * Empty function body because we aren't actually acquiring or releasing - * a reference to a global idle cpumask, which is read-only in the - * caller and is never released. The acquire / release semantics here - * are just used to make the cpumask a trusted pointer in the caller. - */ -} - -/** - * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state - * @cpu: cpu to test and clear idle for - * - * Returns %true if @cpu was idle and its idle state was successfully cleared. - * %false otherwise. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) -{ - if (!check_builtin_idle_enabled()) - return false; - - if (ops_cpu_valid(cpu, NULL)) - return test_and_clear_cpu_idle(cpu); - else - return false; -} - -/** - * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu - * number on success. -%EBUSY if no matching cpu was found. - * - * Idle CPU tracking may race against CPU scheduling state transitions. For - * example, this function may return -%EBUSY as CPUs are transitioning into the - * idle state. If the caller then assumes that there will be dispatch events on - * the CPUs as they were all busy, the scheduler may end up stalling with CPUs - * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and - * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch - * event in the near future. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - if (!check_builtin_idle_enabled()) - return -EBUSY; - - return scx_pick_idle_cpu(cpus_allowed, flags); -} - -/** - * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any - * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu - * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is - * empty. - * - * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not - * set, this function can't tell which CPUs are idle and will always pick any - * CPU. - */ -__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - s32 cpu; - - if (static_branch_likely(&scx_builtin_idle_enabled)) { - cpu = scx_pick_idle_cpu(cpus_allowed, flags); - if (cpu >= 0) - return cpu; - } - - cpu = cpumask_any_distribute(cpus_allowed); - if (cpu < nr_cpu_ids) - return cpu; - else - return -EBUSY; -} - -/** * scx_bpf_task_running - Is task currently running? * @p: task of interest */ @@ -7765,6 +7293,43 @@ __bpf_kfunc u64 scx_bpf_now(void) return clock; } +/* + * scx_bpf_events - Get a system-wide event counter to + * @events: output buffer from a BPF program + * @events__sz: @events len, must end in '__sz'' for the verifier + */ +__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, + size_t events__sz) +{ + struct scx_event_stats e_sys, *e_cpu; + int cpu; + + /* Aggregate per-CPU event counters into the system-wide counters. */ + memset(&e_sys, 0, sizeof(e_sys)); + for_each_possible_cpu(cpu) { + e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); + scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE); + } + + /* + * We cannot entirely trust a BPF-provided size since a BPF program + * might be compiled against a different vmlinux.h, of which + * scx_event_stats would be larger (a newer vmlinux.h) or smaller + * (an older vmlinux.h). Hence, we use the smaller size to avoid + * memory corruption. + */ + events__sz = min(events__sz, sizeof(*events)); + memcpy(events, &e_sys, events__sz); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -7780,6 +7345,7 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) +BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) @@ -7797,6 +7363,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq) BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif BTF_ID_FLAGS(func, scx_bpf_now) +BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { @@ -7820,8 +7387,6 @@ static int __init scx_init(void) * check using scx_kf_allowed(). */ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, - &scx_kfunc_set_select_cpu)) || - (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_enqueue_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_dispatch)) || @@ -7841,6 +7406,12 @@ static int __init scx_init(void) return ret; } + ret = scx_idle_init(); + if (ret) { + pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); + return ret; + } + ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); if (ret) { pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 1079b56b0f7a..1bda96b19a1b 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,6 +8,8 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT +DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); + void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p) return scx_enabled() && p->sched_class == &ext_sched_class; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) +{ + return !scx_enabled() || + static_branch_likely(&scx_ops_allow_queued_wakeup) || + p->sched_class != &ext_sched_class; +} + #ifdef CONFIG_SCHED_CORE bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi); @@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {} static inline void scx_rq_deactivate(struct rq *rq) {} static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; } static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c new file mode 100644 index 000000000000..52c36a70a3d0 --- /dev/null +++ b/kernel/sched/ext_idle.c @@ -0,0 +1,1171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Built-in idle CPU tracking policy. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#include "ext_idle.h" + +/* Enable/disable built-in idle CPU selection policy */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +/* Enable/disable per-node idle cpumasks */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP +/* Enable/disable LLC aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); + +/* Enable/disable NUMA aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); + +/* + * cpumasks to track idle CPUs within each NUMA node. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask + * from is used to track all the idle CPUs in the system. + */ +struct scx_idle_cpus { + cpumask_var_t cpu; + cpumask_var_t smt; +}; + +/* + * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE + * is not enabled). + */ +static struct scx_idle_cpus scx_idle_global_masks; + +/* + * Per-node idle cpumasks. + */ +static struct scx_idle_cpus **scx_idle_node_masks; + +/* + * Return the idle masks associated to a target @node. + * + * NUMA_NO_NODE identifies the global idle cpumask. + */ +static struct scx_idle_cpus *idle_cpumask(int node) +{ + return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node]; +} + +/* + * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if + * per-node idle cpumasks are disabled. + */ +static int scx_cpu_node_if_enabled(int cpu) +{ + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +} + +bool scx_idle_test_and_clear_cpu(int cpu) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + +#ifdef CONFIG_SCHED_SMT + /* + * SMT mask should be cleared whether we can claim @cpu or not. The SMT + * cluster is not wholly idle either way. This also prevents + * scx_pick_idle_cpu() from getting caught in an infinite loop. + */ + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + /* + * If offline, @cpu is not its own sibling and + * scx_pick_idle_cpu() can get caught in an infinite loop as + * @cpu is never cleared from the idle SMT mask. Ensure that + * @cpu is eventually cleared. + * + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to + * reduce memory writes, which may help alleviate cache + * coherence pressure. + */ + if (cpumask_intersects(smt, idle_smts)) + cpumask_andnot(idle_smts, idle_smts, smt); + else if (cpumask_test_cpu(cpu, idle_smts)) + __cpumask_clear_cpu(cpu, idle_smts); + } +#endif + + return cpumask_test_and_clear_cpu(cpu, idle_cpus); +} + +/* + * Pick an idle CPU in a specific NUMA node. + */ +static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + int cpu; + +retry: + if (sched_smt_active()) { + cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed); + if (cpu < nr_cpu_ids) + goto found; + + if (flags & SCX_PICK_IDLE_CORE) + return -EBUSY; + } + + cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (scx_idle_test_and_clear_cpu(cpu)) + return cpu; + else + goto retry; +} + +/* + * Tracks nodes that have not yet been visited when searching for an idle + * CPU across all available nodes. + */ +static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited); + +/* + * Search for an idle CPU across all nodes, excluding @node. + */ +static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + nodemask_t *unvisited; + s32 cpu = -EBUSY; + + preempt_disable(); + unvisited = this_cpu_ptr(&per_cpu_unvisited); + + /* + * Restrict the search to the online nodes (excluding the current + * node that has been visited already). + */ + nodes_copy(*unvisited, node_states[N_ONLINE]); + node_clear(node, *unvisited); + + /* + * Traverse all nodes in order of increasing distance, starting + * from @node. + * + * This loop is O(N^2), with N being the amount of NUMA nodes, + * which might be quite expensive in large NUMA systems. However, + * this complexity comes into play only when a scheduler enables + * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU + * without specifying a target NUMA node, so it shouldn't be a + * bottleneck is most cases. + * + * As a future optimization we may want to cache the list of nodes + * in a per-node array, instead of actually traversing them every + * time. + */ + for_each_node_numadist(node, *unvisited) { + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + break; + } + preempt_enable(); + + return cpu; +} + +/* + * Find an idle CPU in the system, starting from @node. + */ +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + s32 cpu; + + /* + * Always search in the starting node first (this is an + * optimization that can save some cycles even when the search is + * not limited to a single node). + */ + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + /* + * Stop the search if we are using only a single global cpumask + * (NUMA_NO_NODE) or if the search is restricted to the first node + * only. + */ + if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE) + return -EBUSY; + + /* + * Extend the search to the other online nodes. + */ + return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags); +} + +/* + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC + * domain is not defined). + */ +static unsigned int llc_weight(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sd->span_weight; +} + +/* + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC + * domain is not defined). + */ +static struct cpumask *llc_span(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sched_domain_span(sd); +} + +/* + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the + * NUMA domain is not defined). + */ +static unsigned int numa_weight(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return 0; + sg = sd->groups; + if (!sg) + return 0; + + return sg->group_weight; +} + +/* + * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA + * domain is not defined). + */ +static struct cpumask *numa_span(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return NULL; + sg = sd->groups; + if (!sg) + return NULL; + + return sched_group_span(sg); +} + +/* + * Return true if the LLC domains do not perfectly overlap with the NUMA + * domains, false otherwise. + */ +static bool llc_numa_mismatch(void) +{ + int cpu; + + /* + * We need to scan all online CPUs to verify whether their scheduling + * domains overlap. + * + * While it is rare to encounter architectures with asymmetric NUMA + * topologies, CPU hotplugging or virtualized environments can result + * in asymmetric configurations. + * + * For example: + * + * NUMA 0: + * - LLC 0: cpu0..cpu7 + * - LLC 1: cpu8..cpu15 [offline] + * + * NUMA 1: + * - LLC 0: cpu16..cpu23 + * - LLC 1: cpu24..cpu31 + * + * In this case, if we only check the first online CPU (cpu0), we might + * incorrectly assume that the LLC and NUMA domains are fully + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC + * domains). + */ + for_each_online_cpu(cpu) + if (llc_weight(cpu) != numa_weight(cpu)) + return true; + + return false; +} + +/* + * Initialize topology-aware scheduling. + * + * Detect if the system has multiple LLC or multiple NUMA domains and enable + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle + * selection policy. + * + * Assumption: the kernel's internal topology representation assumes that each + * CPU belongs to a single LLC domain, and that each LLC domain is entirely + * contained within a single NUMA node. + */ +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) +{ + bool enable_llc = false, enable_numa = false; + unsigned int nr_cpus; + s32 cpu = cpumask_first(cpu_online_mask); + + /* + * Enable LLC domain optimization only when there are multiple LLC + * domains among the online CPUs. If all online CPUs are part of a + * single LLC domain, the idle CPU selection logic can choose any + * online CPU without bias. + * + * Note that it is sufficient to check the LLC domain of the first + * online CPU to determine whether a single LLC domain includes all + * CPUs. + */ + rcu_read_lock(); + nr_cpus = llc_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus()) + enable_llc = true; + pr_debug("sched_ext: LLC=%*pb weight=%u\n", + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); + } + + /* + * Enable NUMA optimization only when there are multiple NUMA domains + * among the online CPUs and the NUMA domains don't perfectly overlaps + * with the LLC domains. + * + * If all CPUs belong to the same NUMA node and the same LLC domain, + * enabling both NUMA and LLC optimizations is unnecessary, as checking + * for an idle CPU in the same domain twice is redundant. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled ignore the NUMA + * optimization, as we would naturally select idle CPUs within + * specific NUMA nodes querying the corresponding per-node cpumask. + */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + nr_cpus = numa_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) + enable_numa = true; + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", + cpumask_pr_args(numa_span(cpu)), nr_cpus); + } + } + rcu_read_unlock(); + + pr_debug("sched_ext: LLC idle selection %s\n", + str_enabled_disabled(enable_llc)); + pr_debug("sched_ext: NUMA idle selection %s\n", + str_enabled_disabled(enable_numa)); + + if (enable_llc) + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); + if (enable_numa) + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); +} + +/* + * Built-in CPU idle selection policy: + * + * 1. Prioritize full-idle cores: + * - always prioritize CPUs from fully idle cores (both logical CPUs are + * idle) to avoid interference caused by SMT. + * + * 2. Reuse the same CPU: + * - prefer the last used CPU to take advantage of cached data (L1, L2) and + * branch prediction optimizations. + * + * 3. Pick a CPU within the same LLC (Last-Level Cache): + * - if the above conditions aren't met, pick a CPU that shares the same LLC + * to maintain cache locality. + * + * 4. Pick a CPU within the same NUMA node, if enabled: + * - choose a CPU from the same NUMA node to reduce memory access latency. + * + * 5. Pick any idle CPU usable by the task. + * + * Step 3 and 4 are performed only if the system has, respectively, + * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and + * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs. + * + * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always + * begin in @prev_cpu's node and proceed to other nodes in order of + * increasing distance. + * + * Return the picked CPU if idle, or a negative value otherwise. + * + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because + * we never call ops.select_cpu() for them, see select_task_rq(). + */ +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags) +{ + const struct cpumask *llc_cpus = NULL; + const struct cpumask *numa_cpus = NULL; + int node = scx_cpu_node_if_enabled(prev_cpu); + s32 cpu; + + /* + * This is necessary to protect llc_cpus. + */ + rcu_read_lock(); + + /* + * Determine the scheduling domain only if the task is allowed to run + * on all CPUs. + * + * This is done primarily for efficiency, as it avoids the overhead of + * updating a cpumask every time we need to select an idle CPU (which + * can be costly in large SMP systems), but it also aligns logically: + * if a task's scheduling domain is restricted by user-space (through + * CPU affinity), the task will simply use the flat scheduling domain + * defined by user-space. + */ + if (p->nr_cpus_allowed >= num_possible_cpus()) { + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) + numa_cpus = numa_span(prev_cpu); + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) + llc_cpus = llc_span(prev_cpu); + } + + /* + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. + */ + if (wake_flags & SCX_WAKE_SYNC) { + int waker_node; + + /* + * If the waker's CPU is cache affine and prev_cpu is idle, + * then avoid a migration. + */ + cpu = smp_processor_id(); + if (cpus_share_cache(cpu, prev_cpu) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * If the waker's local DSQ is empty, and the system is under + * utilized, try to wake up @p to the local DSQ of the waker. + * + * Checking only for an empty local DSQ is insufficient as it + * could give the wakee an unfair advantage when the system is + * oversaturated. + * + * Checking only for the presence of idle CPUs is also + * insufficient as the local DSQ of the waker could have tasks + * piled up on it even if there is an idle core elsewhere on + * the system. + */ + waker_node = cpu_to_node(cpu); + if (!(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0 && + (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && + !cpumask_empty(idle_cpumask(waker_node)->cpu)) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto out_unlock; + } + } + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (sched_smt_active()) { + /* + * Keep using @prev_cpu if it's part of a fully idle core. + */ + if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any fully idle core in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any fully idle core in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any full-idle core usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always + * begin in prev_cpu's node and proceed to other nodes in + * order of increasing distance. + */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + + /* + * Give up if we're strictly looking for a full-idle SMT + * core. + */ + if (flags & SCX_PICK_IDLE_CORE) { + cpu = prev_cpu; + goto out_unlock; + } + } + + /* + * Use @prev_cpu if it's idle. + */ + if (scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any idle CPU in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin + * in prev_cpu's node and proceed to other nodes in order of + * increasing distance. + */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags); + if (cpu >= 0) + goto out_unlock; + +out_unlock: + rcu_read_unlock(); + + return cpu; +} + +/* + * Initialize global and per-node idle cpumasks. + */ +void scx_idle_init_masks(void) +{ + int node; + + /* Allocate global idle cpumasks */ + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); + + /* Allocate per-node idle cpumasks */ + scx_idle_node_masks = kcalloc(num_possible_nodes(), + sizeof(*scx_idle_node_masks), GFP_KERNEL); + BUG_ON(!scx_idle_node_masks); + + for_each_node(node) { + scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks), + GFP_KERNEL, node); + BUG_ON(!scx_idle_node_masks[node]); + + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node)); + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node)); + } +} + +static void update_builtin_idle(int cpu, bool idle) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + + assign_cpu(cpu, idle_cpus, idle); + +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + if (idle) { + /* + * idle_smt handling is racy but that's fine as it's + * only for optimization and self-correcting. + */ + if (!cpumask_subset(smt, idle_cpus)) + return; + cpumask_or(idle_smts, idle_smts, smt); + } else { + cpumask_andnot(idle_smts, idle_smts, smt); + } + } +#endif +} + +/* + * Update the idle state of a CPU to @idle. + * + * If @do_notify is true, ops.update_idle() is invoked to notify the scx + * scheduler of an actual idle state transition (idle to busy or vice + * versa). If @do_notify is false, only the idle state in the idle masks is + * refreshed without invoking ops.update_idle(). + * + * This distinction is necessary, because an idle CPU can be "reserved" and + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as + * busy even if no tasks are dispatched. In this case, the CPU may return + * to idle without a true state transition. Refreshing the idle masks + * without invoking ops.update_idle() ensures accurate idle state tracking + * while avoiding unnecessary updates and maintaining balanced state + * transitions. + */ +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + */ + if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + + /* + * Update the idle masks: + * - for real idle transitions (do_notify == true) + * - for idle-to-idle transitions (indicated by the previous task + * being the idle thread, managed by pick_task_idle()) + * + * Skip updating idle masks if the previous task is not the idle + * thread, since set_next_task_idle() has already handled it when + * transitioning from a task to the idle thread (calling this + * function with do_notify == true). + * + * In this way we can avoid updating the idle masks twice, + * unnecessarily. + */ + if (static_branch_likely(&scx_builtin_idle_enabled)) + if (do_notify || is_idle_task(rq->curr)) + update_builtin_idle(cpu, idle); +} + +static void reset_idle_masks(struct sched_ext_ops *ops) +{ + int node; + + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. + */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask); + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask); + return; + } + + for_each_node(node) { + const struct cpumask *node_mask = cpumask_of_node(node); + + cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask); + cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); + } +} +#endif /* CONFIG_SMP */ + +void scx_idle_enable(struct sched_ext_ops *ops) +{ + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) + static_branch_enable(&scx_builtin_idle_enabled); + else + static_branch_disable(&scx_builtin_idle_enabled); + + if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) + static_branch_enable(&scx_builtin_idle_per_node); + else + static_branch_disable(&scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP + reset_idle_masks(ops); +#endif +} + +void scx_idle_disable(void) +{ + static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_per_node); +} + +/******************************************************************************** + * Helpers that can be called from the BPF scheduler. + */ + +static int validate_node(int node) +{ + if (!static_branch_likely(&scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is disabled"); + return -EOPNOTSUPP; + } + + /* Return no entry for NUMA_NO_NODE (not a critical scx error) */ + if (node == NUMA_NO_NODE) + return -ENOENT; + + /* Make sure node is in a valid range */ + if (node < 0 || node >= nr_node_ids) { + scx_ops_error("invalid node %d", node); + return -EINVAL; + } + + /* Make sure the node is part of the set of possible nodes */ + if (!node_possible(node)) { + scx_ops_error("unavailable node %d", node); + return -EINVAL; + } + + return node; +} + +__bpf_kfunc_start_defs(); + +static bool check_builtin_idle_enabled(void) +{ + if (static_branch_likely(&scx_builtin_idle_enabled)) + return true; + + scx_ops_error("built-in idle tracking is disabled"); + return false; +} + +/** + * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or + * trigger an error if @cpu is invalid + * @cpu: target CPU + */ +__bpf_kfunc int scx_bpf_cpu_node(s32 cpu) +{ +#ifdef CONFIG_NUMA + if (!ops_cpu_valid(cpu, NULL)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +#else + return 0; +#endif +} + +/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @is_idle: out parameter indicating whether the returned CPU is idle + * + * Can only be called from ops.select_cpu() if the built-in CPU selection is + * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is + * currently idle and thus a good candidate for direct dispatching. + */ +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *is_idle) +{ +#ifdef CONFIG_SMP + s32 cpu; +#endif + if (!ops_cpu_valid(prev_cpu, NULL)) + goto prev_cpu; + + if (!check_builtin_idle_enabled()) + goto prev_cpu; + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + +#ifdef CONFIG_SMP + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + if (cpu >= 0) { + *is_idle = true; + return cpu; + } +#endif + +prev_cpu: + *is_idle = false; + return prev_cpu; +} + +/** + * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the + * idle-tracking per-CPU cpumask of a target NUMA node. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_ops_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking + * per-CPU cpumask. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the + * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be + * used to determine if an entire physical core is free. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_ops_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(node)->smt; + else + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, + * per-physical-core cpumask. Can be used to determine if an entire physical + * core is free. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(NUMA_NO_NODE)->smt; + else + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to + * either the percpu, or SMT idle-tracking cpumask. + * @idle_mask: &cpumask to use + */ +__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) +{ + /* + * Empty function body because we aren't actually acquiring or releasing + * a reference to a global idle cpumask, which is read-only in the + * caller and is never released. The acquire / release semantics here + * are just used to make the cpumask a trusted pointer in the caller. + */ +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!check_builtin_idle_enabled()) + return false; + + if (ops_cpu_valid(cpu, NULL)) + return scx_idle_test_and_clear_cpu(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_* flags + * + * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node. + * + * Returns the picked idle cpu number on success, or -%EBUSY if no matching + * cpu was found. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node). + * + * Always returns an error if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if + * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + node = validate_node(node); + if (node < 0) + return node; + + return scx_pick_idle_cpu(cpus_allowed, node, flags); +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. + * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_idle_cpu_node() instead. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (!check_builtin_idle_enabled()) + return -EBUSY; + + return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); +} + +/** + * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available + * or pick any CPU from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node, regardless of + * the CPU idle state). + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + s32 cpu; + + node = validate_node(node); + if (node < 0) + return node; + + cpu = scx_pick_idle_cpu(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + if (flags & SCX_PICK_IDLE_IN_NODE) + cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed); + else + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_any_cpu_node() instead. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + s32 cpu; + + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_idle) +BTF_ID_FLAGS(func, scx_bpf_cpu_node) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_idle) + +static const struct btf_kfunc_id_set scx_kfunc_set_idle = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_idle, +}; + +BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) + +static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_select_cpu, +}; + +int scx_idle_init(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); + + return ret; +} diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h new file mode 100644 index 000000000000..511cc2221f7a --- /dev/null +++ b/kernel/sched/ext_idle.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#ifndef _KERNEL_SCHED_EXT_IDLE_H +#define _KERNEL_SCHED_EXT_IDLE_H + +struct sched_ext_ops; + +#ifdef CONFIG_SMP +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); +void scx_idle_init_masks(void); +bool scx_idle_test_and_clear_cpu(int cpu); +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags); +#else /* !CONFIG_SMP */ +static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} +static inline void scx_idle_init_masks(void) {} +static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } +static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + return -EBUSY; +} +#endif /* CONFIG_SMP */ + +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags); +void scx_idle_enable(struct sched_ext_ops *ops); +void scx_idle_disable(void); +int scx_idle_init(void); + +#endif /* _KERNEL_SCHED_EXT_IDLE_H */ |