aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched/sched.h
diff options
context:
space:
mode:
authorJosh Don <joshdon@google.com>2021-07-29 19:00:18 -0700
committerPeter Zijlstra <peterz@infradead.org>2021-08-20 12:32:58 +0200
commit304000390f88d049c85e9a0958ac5567f38816ee (patch)
tree5e57c67a55fc278417dc427246608da758134d25 /kernel/sched/sched.h
parentsched/topology: Skip updating masks for non-online nodes (diff)
downloadlinux-dev-304000390f88d049c85e9a0958ac5567f38816ee.tar.xz
linux-dev-304000390f88d049c85e9a0958ac5567f38816ee.zip
sched: Cgroup SCHED_IDLE support
This extends SCHED_IDLE to cgroups. Interface: cgroup/cpu.idle. 0: default behavior 1: SCHED_IDLE Extending SCHED_IDLE to cgroups means that we incorporate the existing aspects of SCHED_IDLE; a SCHED_IDLE cgroup will count all of its descendant threads towards the idle_h_nr_running count of all of its ancestor cgroups. Thus, sched_idle_rq() will work properly. Additionally, SCHED_IDLE cgroups are configured with minimum weight. There are two key differences between the per-task and per-cgroup SCHED_IDLE interface: - The cgroup interface allows tasks within a SCHED_IDLE hierarchy to maintain their relative weights. The entity that is "idle" is the cgroup, not the tasks themselves. - Since the idle entity is the cgroup, our SCHED_IDLE wakeup preemption decision is not made by comparing the current task with the woken task, but rather by comparing their matching sched_entity. A typical use-case for this is a user that creates an idle and a non-idle subtree. The non-idle subtree will dominate competition vs the idle subtree, but the idle subtree will still be high priority vs other users on the system. The latter is accomplished via comparing matching sched_entity in the waken preemption path (this could also be improved by making the sched_idle_rq() decision dependent on the perspective of a specific task). For now, we maintain the existing SCHED_IDLE semantics. Future patches may make improvements that extend how we treat SCHED_IDLE entities. The per-task_group idle field is an integer that currently only holds either a 0 or a 1. This is explicitly typed as an integer to allow for further extensions to this API. For example, a negative value may indicate a highly latency-sensitive cgroup that should be preferred for preemption/placement/etc. Signed-off-by: Josh Don <joshdon@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org> Link: https://lore.kernel.org/r/20210730020019.1487127-2-joshdon@google.com
Diffstat (limited to '')
-rw-r--r--kernel/sched/sched.h8
1 files changed, 8 insertions, 0 deletions
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 75a5c120e7d9..5fa02902c143 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -396,6 +396,9 @@ struct task_group {
struct cfs_rq **cfs_rq;
unsigned long shares;
+ /* A positive value indicates that this is a SCHED_IDLE group. */
+ int idle;
+
#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
@@ -505,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern int sched_group_set_idle(struct task_group *tg, long idle);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
@@ -601,6 +606,9 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+ /* Locally cached copy of our task_group's idle value */
+ int idle;
+
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;