psi: Fix cpu.pressure for cpu.max and competing cgroups

For simplicity, cpu pressure is defined as having more than one runnable task on a given CPU. This works on the system-level, but it has limitations in a cgrouped reality: When cpu.max is in use, it doesn't capture the time in which a task is not executing on the CPU due to throttling. Likewise, it doesn't capture the time in which a competing cgroup is occupying the CPU - meaning it only reflects cgroup-internal competitive pressure, not outside pressure. Enable tracking of currently executing tasks, and then change the definition of cpu pressure in a cgroup from NR_RUNNING > 1 to NR_RUNNING > ON_CPU which will capture the effects of cpu.max as well as competition from outside the cgroup. After this patch, a cgroup running `stress -c 1` with a cpu.max setting of 5000 10000 shows ~50% continuous CPU pressure. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lkml.kernel.org/r/20200316191333.115523-2-hannes@cmpxchg.org
author: Johannes Weiner <hannes@cmpxchg.org> 2020-03-16 15:13:31 -0400
committer: Peter Zijlstra <peterz@infradead.org> 2020-03-20 13:06:18 +0100
commit: b05e75d611380881e73edc58a20fd8c6bb71720b (patch)
tree: 3d641b57b42e934d7518f22a13f3a74cd76f6ff7 /kernel/sched/stats.h
parent: sched/core: Distribute tasks within affinity masks (diff)
download: linux-dev-b05e75d611380881e73edc58a20fd8c6bb71720b.tar.xz
linux-dev-b05e75d611380881e73edc58a20fd8c6bb71720b.zip
1 files changed, 28 insertions, 0 deletions
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index ba683fe81a6e..6ff0ac1a803f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -93,6 +93,14 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 		if (p->flags & PF_MEMSTALL)
 			clear |= TSK_MEMSTALL;
 	} else {
+		/*
+		 * When a task sleeps, schedule() dequeues it before
+		 * switching to the next one. Merge the clearing of
+		 * TSK_RUNNING and TSK_ONCPU to save an unnecessary
+		 * psi_task_change() call in psi_sched_switch().
+		 */
+		clear |= TSK_ONCPU;
+
 		if (p->in_iowait)
 			set |= TSK_IOWAIT;
 	}
@@ -126,6 +134,23 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
 	}
 }
 
+static inline void psi_sched_switch(struct task_struct *prev,
+				    struct task_struct *next,
+				    bool sleep)
+{
+	if (static_branch_likely(&psi_disabled))
+		return;
+
+	/*
+	 * Clear the TSK_ONCPU state if the task was preempted. If
+	 * it's a voluntary sleep, dequeue will have taken care of it.
+	 */
+	if (!sleep)
+		psi_task_change(prev, TSK_ONCPU, 0);
+
+	psi_task_change(next, 0, TSK_ONCPU);
+}
+
 static inline void psi_task_tick(struct rq *rq)
 {
 	if (static_branch_likely(&psi_disabled))
@@ -138,6 +163,9 @@ static inline void psi_task_tick(struct rq *rq)
 static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
 static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
 static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_sched_switch(struct task_struct *prev,
+				    struct task_struct *next,
+				    bool sleep) {}
 static inline void psi_task_tick(struct rq *rq) {}
 #endif /* CONFIG_PSI */
author	Johannes Weiner <hannes@cmpxchg.org>	2020-03-16 15:13:31 -0400
committer	Peter Zijlstra <peterz@infradead.org>	2020-03-20 13:06:18 +0100
commit	b05e75d611380881e73edc58a20fd8c6bb71720b (patch)
tree	3d641b57b42e934d7518f22a13f3a74cd76f6ff7 /kernel/sched/stats.h
parent	sched/core: Distribute tasks within affinity masks (diff)
download	linux-dev-b05e75d611380881e73edc58a20fd8c6bb71720b.tar.xz linux-dev-b05e75d611380881e73edc58a20fd8c6bb71720b.zip