aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched/psi.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-01-11 17:14:59 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2022-01-11 17:14:59 -0800
commit6ae71436cda740148640046d58190a5bbc3ac86d (patch)
treed30635c5c06ac114ca9b07bf96c7ea0bb3904f2f /kernel/sched/psi.c
parentMerge tag 'Wcast-function-type-5.17-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux (diff)
parentsched/fair: Replace CFS internal cpu_util() with cpu_util_cfs() (diff)
downloadlinux-dev-6ae71436cda740148640046d58190a5bbc3ac86d.tar.xz
linux-dev-6ae71436cda740148640046d58190a5bbc3ac86d.zip
Merge tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Borislav Petkov: "Mostly minor things this time; some highlights: - core-sched: Add 'Forced Idle' accounting; this allows to track how much CPU time is 'lost' due to core scheduling constraints. - psi: Fix for MEM_FULL; a task running reclaim would be counted as a runnable task and prevent MEM_FULL from being reported. - cpuacct: Long standing fixes for some cgroup accounting issues. - rt: Bandwidth timer could, under unusual circumstances, be failed to armed, leading to indefinite throttling." [ Description above by Peter Zijlstra ] * tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs() sched/fair: Cleanup task_util and capacity type sched/rt: Try to restart rt period timer when rt runtime exceeded sched/fair: Document the slow path and fast path in select_task_rq_fair sched/fair: Fix per-CPU kthread and wakee stacking for asym CPU capacity sched/fair: Fix detection of per-CPU kthreads waking a task sched/cpuacct: Make user/system times in cpuacct.stat more precise sched/cpuacct: Fix user/system in shown cpuacct.usage* cpuacct: Convert BUG_ON() to WARN_ON_ONCE() cputime, cpuacct: Include guest time in user time in cpuacct.stat psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim sched/core: Forced idle accounting psi: Add a missing SPDX license header psi: Remove repeated verbose comment
Diffstat (limited to '')
-rw-r--r--kernel/sched/psi.c47
1 files changed, 29 insertions, 18 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1652f2bb54b7..a679613a7cb7 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Pressure stall information for CPU, memory and IO
*
@@ -34,13 +35,19 @@
* delayed on that resource such that nobody is advancing and the CPU
* goes idle. This leaves both workload and CPU unproductive.
*
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
* SOME = nr_delayed_tasks != 0
- * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
*
* The percentage of wallclock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,
@@ -81,13 +88,13 @@
*
* threads = min(nr_nonidle_tasks, nr_cpus)
* SOME = min(nr_delayed_tasks / threads, 1)
- * FULL = (threads - min(nr_running_tasks, threads)) / threads
+ * FULL = (threads - min(nr_productive_tasks, threads)) / threads
*
* For the 257 number crunchers on 256 CPUs, this yields:
*
* threads = min(257, 256)
* SOME = min(1 / 256, 1) = 0.4%
- * FULL = (256 - min(257, 256)) / 256 = 0%
+ * FULL = (256 - min(256, 256)) / 256 = 0%
*
* For the 1 out of 4 memory-delayed tasks, this yields:
*
@@ -112,7 +119,7 @@
* For each runqueue, we track:
*
* tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
* tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
*
* and then periodically aggregate:
@@ -233,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
case PSI_MEM_SOME:
return unlikely(tasks[NR_MEMSTALL]);
case PSI_MEM_FULL:
- return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+ return unlikely(tasks[NR_MEMSTALL] &&
+ tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
case PSI_CPU_FULL:
@@ -710,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
- groupc->tasks[3], clear, set);
+ groupc->tasks[3], groupc->tasks[4],
+ clear, set);
psi_bug = 1;
}
}
@@ -833,7 +842,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
/*
* When switching between tasks that have an identical
* runtime state, the cgroup that contains both tasks
- * runtime state, the cgroup that contains both tasks
* we reach the first common ancestor. Iterate @next's
* ancestors only until we encounter @prev's ONCPU.
*/
@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
int clear = TSK_ONCPU, set = 0;
/*
- * When we're going to sleep, psi_dequeue() lets us handle
- * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
- * with TSK_ONCPU and save walking common ancestors twice.
+ * When we're going to sleep, psi_dequeue() lets us
+ * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+ * TSK_IOWAIT here, where we can combine it with
+ * TSK_ONCPU and save walking common ancestors twice.
*/
if (sleep) {
clear |= TSK_RUNNING;
+ if (prev->in_memstall)
+ clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
}
@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 1;
- psi_task_change(current, 0, TSK_MEMSTALL);
+ psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
rq_unlock_irq(rq, &rf);
}
@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 0;
- psi_task_change(current, TSK_MEMSTALL, 0);
+ psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
rq_unlock_irq(rq, &rf);
}