Merge tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Borislav Petkov: "Mostly minor things this time; some highlights: - core-sched: Add 'Forced Idle' accounting; this allows to track how much CPU time is 'lost' due to core scheduling constraints. - psi: Fix for MEM_FULL; a task running reclaim would be counted as a runnable task and prevent MEM_FULL from being reported. - cpuacct: Long standing fixes for some cgroup accounting issues. - rt: Bandwidth timer could, under unusual circumstances, be failed to armed, leading to indefinite throttling." [ Description above by Peter Zijlstra ] * tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs() sched/fair: Cleanup task_util and capacity type sched/rt: Try to restart rt period timer when rt runtime exceeded sched/fair: Document the slow path and fast path in select_task_rq_fair sched/fair: Fix per-CPU kthread and wakee stacking for asym CPU capacity sched/fair: Fix detection of per-CPU kthreads waking a task sched/cpuacct: Make user/system times in cpuacct.stat more precise sched/cpuacct: Fix user/system in shown cpuacct.usage* cpuacct: Convert BUG_ON() to WARN_ON_ONCE() cputime, cpuacct: Include guest time in user time in cpuacct.stat psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim sched/core: Forced idle accounting psi: Add a missing SPDX license header psi: Remove repeated verbose comment
author: Linus Torvalds <torvalds@linux-foundation.org> 2022-01-11 17:14:59 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2022-01-11 17:14:59 -0800
commit: 6ae71436cda740148640046d58190a5bbc3ac86d (patch)
tree: d30635c5c06ac114ca9b07bf96c7ea0bb3904f2f /kernel/sched/psi.c
parent: Merge tag 'Wcast-function-type-5.17-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux (diff)
parent: sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs() (diff)
download: linux-dev-6ae71436cda740148640046d58190a5bbc3ac86d.tar.xz
linux-dev-6ae71436cda740148640046d58190a5bbc3ac86d.zip
1 files changed, 29 insertions, 18 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1652f2bb54b7..a679613a7cb7 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Pressure stall information for CPU, memory and IO
  *
@@ -34,13 +35,19 @@
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
  *	SOME = nr_delayed_tasks != 0
- *	FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ *	FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
  *
  * The percentage of wallclock time spent in those compound stall
  * states gives pressure numbers between 0 and 100 for each resource,
@@ -81,13 +88,13 @@
  *
  *	threads = min(nr_nonidle_tasks, nr_cpus)
  *	   SOME = min(nr_delayed_tasks / threads, 1)
- *	   FULL = (threads - min(nr_running_tasks, threads)) / threads
+ *	   FULL = (threads - min(nr_productive_tasks, threads)) / threads
  *
  * For the 257 number crunchers on 256 CPUs, this yields:
  *
  *	threads = min(257, 256)
  *	   SOME = min(1 / 256, 1)             = 0.4%
- *	   FULL = (256 - min(257, 256)) / 256 = 0%
+ *	   FULL = (256 - min(256, 256)) / 256 = 0%
  *
  * For the 1 out of 4 memory-delayed tasks, this yields:
  *
@@ -112,7 +119,7 @@
  * For each runqueue, we track:
  *
  *	   tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- *	   tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ *	   tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
  *	tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
  *
  * and then periodically aggregate:
@@ -233,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 	case PSI_MEM_SOME:
 		return unlikely(tasks[NR_MEMSTALL]);
 	case PSI_MEM_FULL:
-		return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+		return unlikely(tasks[NR_MEMSTALL] &&
+			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
 	case PSI_CPU_SOME:
 		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
 	case PSI_CPU_FULL:
@@ -710,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		if (groupc->tasks[t]) {
 			groupc->tasks[t]--;
 		} else if (!psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
 					cpu, t, groupc->tasks[0],
 					groupc->tasks[1], groupc->tasks[2],
-					groupc->tasks[3], clear, set);
+					groupc->tasks[3], groupc->tasks[4],
+					clear, set);
 			psi_bug = 1;
 		}
 	}
@@ -833,7 +842,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		/*
 		 * When switching between tasks that have an identical
 		 * runtime state, the cgroup that contains both tasks
-		 * runtime state, the cgroup that contains both tasks
 		 * we reach the first common ancestor. Iterate @next's
 		 * ancestors only until we encounter @prev's ONCPU.
 		 */
@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		int clear = TSK_ONCPU, set = 0;
 
 		/*
-		 * When we're going to sleep, psi_dequeue() lets us handle
-		 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
-		 * with TSK_ONCPU and save walking common ancestors twice.
+		 * When we're going to sleep, psi_dequeue() lets us
+		 * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+		 * TSK_IOWAIT here, where we can combine it with
+		 * TSK_ONCPU and save walking common ancestors twice.
 		 */
 		if (sleep) {
 			clear |= TSK_RUNNING;
+			if (prev->in_memstall)
+				clear |= TSK_MEMSTALL_RUNNING;
 			if (prev->in_iowait)
 				set |= TSK_IOWAIT;
 		}
@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
 	rq = this_rq_lock_irq(&rf);
 
 	current->in_memstall = 1;
-	psi_task_change(current, 0, TSK_MEMSTALL);
+	psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
 	rq_unlock_irq(rq, &rf);
 }
@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
 	rq = this_rq_lock_irq(&rf);
 
 	current->in_memstall = 0;
-	psi_task_change(current, TSK_MEMSTALL, 0);
+	psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
 
 	rq_unlock_irq(rq, &rf);
 }
author	Linus Torvalds <torvalds@linux-foundation.org>	2022-01-11 17:14:59 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2022-01-11 17:14:59 -0800
commit	6ae71436cda740148640046d58190a5bbc3ac86d (patch)
tree	d30635c5c06ac114ca9b07bf96c7ea0bb3904f2f /kernel/sched/psi.c
parent	Merge tag 'Wcast-function-type-5.17-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux (diff)
parent	sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs() (diff)
download	linux-dev-6ae71436cda740148640046d58190a5bbc3ac86d.tar.xz linux-dev-6ae71436cda740148640046d58190a5bbc3ac86d.zip