path: root/kernel/sched/sched.h
diff options
authorLinus Torvalds <torvalds@linux-foundation.org>2022-01-11 17:14:59 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2022-01-11 17:14:59 -0800
commit6ae71436cda740148640046d58190a5bbc3ac86d (patch)
treed30635c5c06ac114ca9b07bf96c7ea0bb3904f2f /kernel/sched/sched.h
parentMerge tag 'Wcast-function-type-5.17-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux (diff)
parentsched/fair: Replace CFS internal cpu_util() with cpu_util_cfs() (diff)
Merge tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Borislav Petkov: "Mostly minor things this time; some highlights: - core-sched: Add 'Forced Idle' accounting; this allows to track how much CPU time is 'lost' due to core scheduling constraints. - psi: Fix for MEM_FULL; a task running reclaim would be counted as a runnable task and prevent MEM_FULL from being reported. - cpuacct: Long standing fixes for some cgroup accounting issues. - rt: Bandwidth timer could, under unusual circumstances, be failed to armed, leading to indefinite throttling." [ Description above by Peter Zijlstra ] * tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs() sched/fair: Cleanup task_util and capacity type sched/rt: Try to restart rt period timer when rt runtime exceeded sched/fair: Document the slow path and fast path in select_task_rq_fair sched/fair: Fix per-CPU kthread and wakee stacking for asym CPU capacity sched/fair: Fix detection of per-CPU kthreads waking a task sched/cpuacct: Make user/system times in cpuacct.stat more precise sched/cpuacct: Fix user/system in shown cpuacct.usage* cpuacct: Convert BUG_ON() to WARN_ON_ONCE() cputime, cpuacct: Include guest time in user time in cpuacct.stat psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim sched/core: Forced idle accounting psi: Add a missing SPDX license header psi: Remove repeated verbose comment
Diffstat (limited to '')
1 files changed, 70 insertions, 6 deletions
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0e66749486e7..de53be905739 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1111,8 +1111,10 @@ struct rq {
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
- unsigned char core_forceidle;
+ unsigned int core_forceidle_count;
unsigned int core_forceidle_seq;
+ unsigned int core_forceidle_occupation;
+ u64 core_forceidle_start;
@@ -1253,7 +1255,7 @@ static inline bool sched_core_enqueued(struct task_struct *p)
extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
-extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
extern void sched_core_get(void);
extern void sched_core_put(void);
@@ -1854,6 +1856,32 @@ static inline void flush_smp_call_function_from_idle(void) { }
#include "stats.h"
#include "autogroup.h"
+extern void __sched_core_account_forceidle(struct rq *rq);
+static inline void sched_core_account_forceidle(struct rq *rq)
+ if (schedstat_enabled())
+ __sched_core_account_forceidle(rq);
+extern void __sched_core_tick(struct rq *rq);
+static inline void sched_core_tick(struct rq *rq)
+ if (sched_core_enabled(rq) && schedstat_enabled())
+ __sched_core_tick(rq);
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+static inline void sched_core_tick(struct rq *rq) {}
@@ -2938,16 +2966,52 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
return READ_ONCE(rq->avg_dl.util_avg);
-static inline unsigned long cpu_util_cfs(struct rq *rq)
+ * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for.
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Estimated) utilization for the specified CPU.
+ */
+static inline unsigned long cpu_util_cfs(int cpu)
- unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
+ struct cfs_rq *cfs_rq;
+ unsigned long util;
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ util = READ_ONCE(cfs_rq->avg.util_avg);
if (sched_feat(UTIL_EST)) {
util = max_t(unsigned long, util,
- READ_ONCE(rq->cfs.avg.util_est.enqueued));
+ READ_ONCE(cfs_rq->avg.util_est.enqueued));
- return util;
+ return min(util, capacity_orig_of(cpu));
static inline unsigned long cpu_util_rt(struct rq *rq)