aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
authorPeter Zijlstra <peterz@infradead.org>2019-11-08 11:11:52 +0100
committerPeter Zijlstra <peterz@infradead.org>2019-11-08 22:34:14 +0100
commit6e2df0581f569038719cf2bc2b3baa3fcc83cab4 (patch)
tree91a337f916b868f9a73864949698dd27762d9a8e /kernel/sched/core.c
parentsched/core: Fix compilation error when cgroup not selected (diff)
downloadlinux-dev-6e2df0581f569038719cf2bc2b3baa3fcc83cab4.tar.xz
linux-dev-6e2df0581f569038719cf2bc2b3baa3fcc83cab4.zip
sched: Fix pick_next_task() vs 'change' pattern race
Commit 67692435c411 ("sched: Rework pick_next_task() slow-path") inadvertly introduced a race because it changed a previously unexplored dependency between dropping the rq->lock and sched_class::put_prev_task(). The comments about dropping rq->lock, in for example newidle_balance(), only mentions the task being current and ->on_cpu being set. But when we look at the 'change' pattern (in for example sched_setnuma()): queued = task_on_rq_queued(p); /* p->on_rq == TASK_ON_RQ_QUEUED */ running = task_current(rq, p); /* rq->curr == p */ if (queued) dequeue_task(...); if (running) put_prev_task(...); /* change task properties */ if (queued) enqueue_task(...); if (running) set_next_task(...); It becomes obvious that if we do this after put_prev_task() has already been called on @p, things go sideways. This is exactly what the commit in question allows to happen when it does: prev->sched_class->put_prev_task(rq, prev, rf); if (!rq->nr_running) newidle_balance(rq, rf); The newidle_balance() call will drop rq->lock after we've called put_prev_task() and that allows the above 'change' pattern to interleave and mess up the state. Furthermore, it turns out we lost the RT-pull when we put the last DL task. Fix both problems by extracting the balancing from put_prev_task() and doing a multi-class balance() pass before put_prev_task(). Fixes: 67692435c411 ("sched: Rework pick_next_task() slow-path") Reported-by: Quentin Perret <qperret@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Quentin Perret <qperret@google.com> Tested-by: Valentin Schneider <valentin.schneider@arm.com>
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c21
1 files changed, 15 insertions, 6 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index afd4d8028771..0f2eb3629070 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3929,13 +3929,22 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
restart:
+#ifdef CONFIG_SMP
/*
- * Ensure that we put DL/RT tasks before the pick loop, such that they
- * can PULL higher prio tasks when we lower the RQ 'priority'.
+ * We must do the balancing pass before put_next_task(), such
+ * that when we release the rq->lock the task is in the same
+ * state as before we took rq->lock.
+ *
+ * We can terminate the balance pass as soon as we know there is
+ * a runnable task of @class priority or higher.
*/
- prev->sched_class->put_prev_task(rq, prev, rf);
- if (!rq->nr_running)
- newidle_balance(rq, rf);
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
+ if (class->balance(rq, prev, rf))
+ break;
+ }
+#endif
+
+ put_prev_task(rq, prev);
for_each_class(class) {
p = class->pick_next_task(rq, NULL, NULL);
@@ -6201,7 +6210,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq)
for_each_class(class) {
next = class->pick_next_task(rq, NULL, NULL);
if (next) {
- next->sched_class->put_prev_task(rq, next, NULL);
+ next->sched_class->put_prev_task(rq, next);
return next;
}
}