From 74dc3384fc7983b78cc46ebb1824968a3db85eb1 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sun, 6 Aug 2017 14:41:41 +1000 Subject: sched/debug: Use task_pid_nr_ns in /proc/$pid/sched It appears as though the addition of the PID namespace did not update the output code for /proc/*/sched, which resulted in it providing PIDs that were not self-consistent with the /proc mount. This additionally made it trivial to detect whether a process was inside &init_pid_ns from userspace, making container detection trivial: https://github.com/jessfraz/amicontained This leads to situations such as: % unshare -pmf % mount -t proc proc /proc % head -n1 /proc/1/sched head (10047, #threads: 1) Fix this by just using task_pid_nr_ns for the output of /proc/*/sched. All of the other uses of task_pid_nr in kernel/sched/debug.c are from a sysctl context and thus don't need to be namespaced. Signed-off-by: Aleksa Sarai Signed-off-by: Peter Zijlstra (Intel) Acked-by: Eric W. Biederman Cc: Jess Frazelle Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cyphar@cyphar.com Link: http://lkml.kernel.org/r/20170806044141.5093-1-asarai@suse.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4fa66de52bd6..ac345115877b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -872,11 +872,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) #endif } -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) { unsigned long nr_switches; - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------" -- cgit v1.3-8-gc7d7 From e8c164954b926f06f109a42fb8595ed01275b141 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Mon, 7 Aug 2017 16:44:22 +0800 Subject: sched/debug: Show task state in /proc/sched_debug Currently we print the runnable task in /proc/sched_debug, but there is no task state information. We don't know which task is in the runqueue and which task is sleeping. 
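As a rough userspace model of the state-letter mapping this patch adopts (not kernel code; the letter string here is assumed to mirror TASK_STATE_TO_CHAR_STR of this era, and ffs() stands in for the kernel's __ffs() + 1):

#include <stdio.h>
#include <strings.h>	/* ffs() */

/*
 * "RSDTtXZxKWPNn" is assumed to match TASK_STATE_TO_CHAR_STR; index 0 is
 * the running case, otherwise the lowest set state bit selects a letter.
 */
static char state_to_char(unsigned int state)
{
	static const char stat_nam[] = "RSDTtXZxKWPNn";
	int idx = state ? ffs((int)state) : 0;	/* ffs() is 1-based, like __ffs() + 1 */

	return idx < (int)sizeof(stat_nam) - 1 ? stat_nam[idx] : '?';
}

int main(void)
{
	/* prints "R S D": running, interruptible (bit 0), uninterruptible (bit 1) */
	printf("%c %c %c\n", state_to_char(0), state_to_char(1), state_to_char(2));
	return 0;
}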
Add task state in the runnable task list, like this: runnable tasks: S task PID tree-key switches prio wait-time sum-exec sum-sleep ----------------------------------------------------------------------------------------------------------- S watchdog/239 1452 -11.917445 2811 0 0.000000 8.949306 0.000000 7 0 / S migration/239 1453 20686.367740 8 0 0.000000 16215.720897 0.000000 7 0 / S ksoftirqd/239 1454 115383.841071 12 120 0.000000 0.200683 0.000000 7 0 / >R test 21287 4872.190970 407 120 0.000000 4874.911790 0.000000 7 0 /autogroup-150 R test 21288 4868.385454 401 120 0.000000 3672.341489 0.000000 7 0 /autogroup-150 R test 21289 4868.326776 384 120 0.000000 3424.934159 0.000000 7 0 /autogroup-150 Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1502095463-160172-2-git-send-email-xiexiuqi@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ac345115877b..d8d2ea215b85 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -421,13 +421,19 @@ static char *task_group_path(struct task_group *tg) } #endif +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { - if (rq->curr == p) - SEQ_printf(m, "R"); - else - SEQ_printf(m, " "); + unsigned long state; + + if (rq->curr == p) { + SEQ_printf(m, ">R"); + } else { + state = p->state ? __ffs(p->state) + 1 : 0; + SEQ_printf(m, " %c", state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); + } SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), @@ -456,9 +462,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key switches prio" + " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n" - "------------------------------------------------------" + "-------------------------------------------------------" "----------------------------------------------------\n"); rcu_read_lock(); -- cgit v1.3-8-gc7d7 From 20435d84e5f2041c64c792399ab6f2948a2c2252 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Mon, 7 Aug 2017 16:44:23 +0800 Subject: sched/debug: Intruduce task_state_to_char() helper function Now that we have more than one place to get the task state, intruduce the task_state_to_char() helper function to save some code. No functionality changed. Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1502095463-160172-3-git-send-email-xiexiuqi@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 +++++++++++++ kernel/sched/core.c | 15 ++++----------- kernel/sched/debug.c | 10 +++------- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..c28b182c9833 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1229,6 +1229,19 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } +static inline char task_state_to_char(struct task_struct *task) +{ + const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + unsigned long state = task->state; + + state = state ? 
__ffs(state) + 1 : 0; + + /* Make sure the string lines up properly with the number of task states: */ + BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); + + return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'; +} + /** * is_global_init - check if a task structure is init. Since init * is free to have sub-threads we need to check tgid. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d91c10b1814..f9f9948e2470 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5103,24 +5103,17 @@ out_unlock: return retval; } -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - void sched_show_task(struct task_struct *p) { unsigned long free = 0; int ppid; - unsigned long state = p->state; - - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); if (!try_get_task_stack(p)) return; - if (state) - state = __ffs(state) + 1; - printk(KERN_INFO "%-15.15s %c", p->comm, - state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); - if (state == TASK_RUNNING) + + printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + + if (p->state == TASK_RUNNING) printk(KERN_CONT " running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index d8d2ea215b85..cfd84f79e075 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -426,14 +426,10 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { - unsigned long state; - - if (rq->curr == p) { + if (rq->curr == p) SEQ_printf(m, ">R"); - } else { - state = p->state ? __ffs(p->state) + 1 : 0; - SEQ_printf(m, " %c", state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); - } + else + SEQ_printf(m, " %c", task_state_to_char(p)); SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), -- cgit v1.3-8-gc7d7 From bbdacdfed2f5fa50a2cc9f500a36e05990a0837d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Aug 2017 17:10:26 +0200 Subject: sched/debug: Optimize sched_domain sysctl generation Currently we unconditionally destroy all sysctl bits and regenerate them after we've rebuild the domains (even if that rebuild is a no-op). And since we unconditionally (re)build the sysctl for all possible CPUs, onlining all CPUs gets us O(n^2) time. Instead change this to only rebuild the bits for CPUs we've actually installed new domains on. 
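The shape of that change, as a standalone sketch rather than the kernel code (MAX_CPUS, the dirty[] array and the function names are invented for illustration): mark a CPU dirty when its domain changes, and let the registration pass rebuild only the dirty entries.

#include <stdbool.h>
#include <stdio.h>

#define MAX_CPUS 8

static bool dirty[MAX_CPUS];

/* analogous to dirty_sched_domain_sysctl(): remember which CPU changed */
static void mark_domain_dirty(int cpu)
{
	dirty[cpu] = true;
}

/* stand-in for sd_alloc_ctl_cpu_table(): rebuild one CPU's entries */
static void rebuild_cpu_entry(int cpu)
{
	printf("rebuilding sysctl entries for cpu%d\n", cpu);
}

/* analogous to register_sched_domain_sysctl(): O(changed) work, not O(possible) */
static void register_domain_sysctl(void)
{
	for (int cpu = 0; cpu < MAX_CPUS; cpu++) {
		if (!dirty[cpu])
			continue;
		rebuild_cpu_entry(cpu);
		dirty[cpu] = false;
	}
}

int main(void)
{
	mark_domain_dirty(2);
	mark_domain_dirty(5);
	register_domain_sysctl();	/* only cpu2 and cpu5 are touched */
	return 0;
}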
Reported-by: Ofer Levi(SW) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 68 +++++++++++++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 4 +++ kernel/sched/topology.c | 1 + 3 files changed, 59 insertions(+), 14 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index cfd84f79e075..4a23bbc3111b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) return table; } +static cpumask_var_t sd_sysctl_cpus; static struct ctl_table_header *sd_sysctl_header; + void register_sched_domain_sysctl(void) { - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + static struct ctl_table *cpu_entries; + static struct ctl_table **cpu_idx; char buf[32]; + int i; - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; + if (!cpu_entries) { + cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); + if (!cpu_entries) + return; - if (entry == NULL) - return; + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = cpu_entries; + } - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; + if (!cpu_idx) { + struct ctl_table *e = cpu_entries; + + cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); + if (!cpu_idx) + return; + + /* deal with sparse possible map */ + for_each_possible_cpu(i) { + cpu_idx[i] = e; + e++; + } + } + + if (!cpumask_available(sd_sysctl_cpus)) { + if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) + return; + + /* init to possible to not have holes in @cpu_entries */ + cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); + } + + for_each_cpu(i, sd_sysctl_cpus) { + struct ctl_table *e = cpu_idx[i]; + + if (e->child) + sd_free_ctl_entry(&e->child); + + if (!e->procname) { + snprintf(buf, 32, "cpu%d", i); + e->procname = kstrdup(buf, GFP_KERNEL); + } + e->mode = 0555; + e->child = sd_alloc_ctl_cpu_table(i); + + __cpumask_clear_cpu(i, sd_sysctl_cpus); } WARN_ON(sd_sysctl_header); sd_sysctl_header = register_sysctl_table(sd_ctl_root); } +void dirty_sched_domain_sysctl(int cpu) +{ + if (cpumask_available(sd_sysctl_cpus)) + __cpumask_set_cpu(cpu, sd_sysctl_cpus); +} + /* may be called multiple times per register */ void unregister_sched_domain_sysctl(void) { unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); } #endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SMP */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eeef1a3086d1..25e5cb1107f3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sched_group *sg); #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) void register_sched_domain_sysctl(void); +void dirty_sched_domain_sysctl(int cpu); void unregister_sched_domain_sysctl(void); #else static inline void register_sched_domain_sysctl(void) { } +static inline void dirty_sched_domain_sysctl(int cpu) +{ +} static inline void unregister_sched_domain_sysctl(void) { } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 727daa2a0abe..6f7b43982f73 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -459,6 +459,7 @@ cpu_attach_domain(struct sched_domain 
*sd, struct root_domain *rd, int cpu) rq_attach_root(rq, rd); tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); + dirty_sched_domain_sysctl(cpu); destroy_sched_domains(tmp); update_top_cache_domain(cpu); -- cgit v1.3-8-gc7d7 From bfb068892d30dcf0a32b89302fe293347adeaaaa Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:14:55 -0700 Subject: sched/fair: replace cfs_rq->rb_leftmost ... with the generic rbtree flavor instead. No changes in semantics whatsoever. Link: http://lkml.kernel.org/r/20170719014603.19029-8-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/debug.c | 2 +- kernel/sched/fair.c | 39 +++++++++++++-------------------------- kernel/sched/sched.h | 3 +-- 3 files changed, 15 insertions(+), 29 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4a23bbc3111b..8e536d963652 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -530,7 +530,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_lock_irqsave(&rq->lock, flags); - if (cfs_rq->rb_leftmost) + if (rb_first_cached(&cfs_rq->tasks_timeline)) MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8bc0a883d190..a5d83ed8dd82 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -513,6 +513,7 @@ static inline int entity_before(struct sched_entity *a, static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; + struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); u64 vruntime = cfs_rq->min_vruntime; @@ -523,10 +524,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } - if (cfs_rq->rb_leftmost) { - struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, - struct sched_entity, - run_node); + if (leftmost) { /* non-empty tree */ + struct sched_entity *se; + se = rb_entry(leftmost, struct sched_entity, run_node); if (!curr) vruntime = se->vruntime; @@ -547,10 +547,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - int leftmost = 1; + bool leftmost = true; /* * Find the right place in the rbtree: @@ -566,36 +566,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - /* - * Maintain a cache of leftmost tree entries (it is frequently - * used): - */ - if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + rb_insert_color_cached(&se->run_node, + &cfs_rq->tasks_timeline, leftmost); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->rb_leftmost == &se->run_node) { - struct rb_node *next_node; - - next_node = rb_next(&se->run_node); - cfs_rq->rb_leftmost = next_node; - } - - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { - 
struct rb_node *left = cfs_rq->rb_leftmost; + struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); if (!left) return NULL; @@ -616,7 +603,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); if (!last) return NULL; @@ -9312,7 +9299,7 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { - cfs_rq->tasks_timeline = RB_ROOT; + cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6ed7962dc896..c30c57563dbc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -426,8 +426,7 @@ struct cfs_rq { u64 min_vruntime_copy; #endif - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; + struct rb_root_cached tasks_timeline; /* * 'curr' points to currently running entity on this cfs_rq. -- cgit v1.3-8-gc7d7 From 9469eb01db891b55367ee7539f1b9f7f6fd2819d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Sep 2017 17:03:53 +0200 Subject: sched/debug: Add debugfs knob for "sched_debug" I'm forever late for editing my kernel cmdline, add a runtime knob to disable the "sched_debug" thing. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170907150614.142924283@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 5 +++++ kernel/sched/sched.h | 2 ++ kernel/sched/topology.c | 4 +--- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4a23bbc3111b..b19d06ea6e10 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -181,11 +181,16 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; +__read_mostly bool sched_debug_enabled; + static __init int sched_init_debug(void) { debugfs_create_file("sched_features", 0644, NULL, NULL, &sched_feat_fops); + debugfs_create_bool("sched_debug", 0644, NULL, + &sched_debug_enabled); + return 0; } late_initcall(sched_init_debug); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ab1c7f5409a0..7ea2a0339771 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1954,6 +1954,8 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); #ifdef CONFIG_SCHED_DEBUG +extern bool sched_debug_enabled; + extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6f7b43982f73..2ab2aa68c796 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -14,11 +14,9 @@ cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG -static __read_mostly int sched_debug_enabled; - static int __init sched_debug_setup(char *str) { - sched_debug_enabled = 1; + sched_debug_enabled = true; return 0; } -- cgit v1.3-8-gc7d7 From 65d5dc47fe8530e17e318722fb4df676536c2bfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:14:08 +0200 Subject: sched/debug: Remove unused variable Signed-off-by: 
Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 01217fb5a5de..2f93e4a2d9f6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg) } #endif -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { -- cgit v1.3-8-gc7d7 From 2a2f5d4e44ed160a5ed822c94e04f918f9fbb487 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 May 2017 16:51:41 +0200 Subject: sched/fair: Rewrite cfs_rq->removed_*avg Since on wakeup migration we don't hold the rq->lock for the old CPU we cannot update its state. Instead we add the removed 'load' to an atomic variable and have the next update on that CPU collect and process it. Currently we have 2 atomic variables; which already have the issue that they can be read out-of-sync. Also, two atomic ops on a single cacheline is already more expensive than an uncontended lock. Since we want to add more, convert the thing over to an explicit cacheline with a lock in. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 8 ++++---- kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++---------------- kernel/sched/sched.h | 13 +++++++++---- 3 files changed, 48 insertions(+), 24 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2f93e4a2d9f6..2f22342c48ff 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -564,10 +564,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); - SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", - atomic_long_read(&cfs_rq->removed_load_avg)); - SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", - atomic_long_read(&cfs_rq->removed_util_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", + cfs_rq->removed.load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", + cfs_rq->removed.util_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d8c02b31498d..fe4a66b10480 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3501,36 +3501,47 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { + unsigned long removed_load = 0, removed_util = 0; struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed_load = 0, removed_util = 0; + int decayed = 0; - if (atomic_long_read(&cfs_rq->removed_load_avg)) { - s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + if (cfs_rq->removed.nr) { + unsigned long r; + + raw_spin_lock(&cfs_rq->removed.lock); + swap(cfs_rq->removed.util_avg, removed_util); + swap(cfs_rq->removed.load_avg, removed_load); + cfs_rq->removed.nr = 0; + raw_spin_unlock(&cfs_rq->removed.lock); + + /* + * The LOAD_AVG_MAX for _sum is a slight over-estimate, + * which is safe due to sub_positive() clipping at 0. 
+ */ + r = removed_load; sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); - removed_load = 1; - set_tg_cfs_propagate(cfs_rq); - } - if (atomic_long_read(&cfs_rq->removed_util_avg)) { - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); + r = removed_util; sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); - removed_util = 1; + set_tg_cfs_propagate(cfs_rq); + + decayed = 1; } - decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); + decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (decayed || removed_util) + if (decayed) cfs_rq_util_change(cfs_rq); - return decayed || removed_load; + return decayed; } /** @@ -3645,6 +3656,7 @@ void sync_entity_load_avg(struct sched_entity *se) void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); + unsigned long flags; /* * tasks cannot exit without having gone through wake_up_new_task() -> @@ -3654,11 +3666,19 @@ void remove_entity_load_avg(struct sched_entity *se) * Similarly for groups, they will have passed through * post_init_entity_util_avg() before unregister_sched_fair_group() * calls this. + * + * XXX in case entity_is_task(se) && task_of(se)->on_rq == MIGRATING + * we could actually get the right time, since we're called with + * rq->lock held, see detach_task(). */ sync_entity_load_avg(se); - atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); - atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); + + raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); + ++cfs_rq->removed.nr; + cfs_rq->removed.util_avg += se->avg.util_avg; + cfs_rq->removed.load_avg += se->avg.load_avg; + raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) @@ -9449,8 +9469,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->propagate_avg = 0; #endif - atomic_long_set(&cfs_rq->removed_load_avg, 0); - atomic_long_set(&cfs_rq->removed_util_avg, 0); + raw_spin_lock_init(&cfs_rq->removed.lock); #endif } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a5d97460ee4e..2fd350a12bb7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -445,14 +445,19 @@ struct cfs_rq { struct sched_avg avg; u64 runnable_load_sum; unsigned long runnable_load_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; unsigned long propagate_avg; #endif - atomic_long_t removed_load_avg, removed_util_avg; -#ifndef CONFIG_64BIT - u64 load_last_update_time_copy; -#endif + struct { + raw_spinlock_t lock ____cacheline_aligned; + int nr; + unsigned long load_avg; + unsigned long util_avg; + } removed; #ifdef CONFIG_FAIR_GROUP_SCHED /* -- cgit v1.3-8-gc7d7 From 0e2d2aaaae52c247c047d14999b93486bdbd3431 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 May 2017 17:30:46 +0200 Subject: sched/fair: Rewrite PELT migration propagation When an entity migrates in (or out) of a runqueue, we need to add (or remove) its contribution from the entire PELT hierarchy, because even non-runnable entities are included in the load average sums. 
In order to do this we have some propagation logic that updates the PELT tree, however the way it 'propagates' the runnable (or load) change is (more or less): tg->weight * grq->avg.load_avg ge->avg.load_avg = ------------------------------ tg->load_avg But that is the expression for ge->weight, and per the definition of load_avg: ge->avg.load_avg := ge->weight * ge->avg.runnable_avg That destroys the runnable_avg (by setting it to 1) we wanted to propagate. Instead directly propagate runnable_sum. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 + kernel/sched/fair.c | 186 +++++++++++++++++++++++++++++---------------------- kernel/sched/sched.h | 9 +-- 3 files changed, 112 insertions(+), 85 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2f22342c48ff..2e039a81864c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -568,6 +568,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", cfs_rq->removed.util_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", + cfs_rq->removed.runnable_sum); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe4a66b10480..086a5d979720 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3319,11 +3319,77 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } -/* Take into account change of utilization of a child task group */ + +/* + * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to + * propagate its contribution. The key to this propagation is the invariant + * that for each group: + * + * ge->avg == grq->avg (1) + * + * _IFF_ we look at the pure running and runnable sums. Because they + * represent the very same entity, just at different points in the hierarchy. + * + * + * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and + * simply copies the running sum over. + * + * However, update_tg_cfs_runnable() is more complex. So we have: + * + * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) + * + * And since, like util, the runnable part should be directly transferable, + * the following would _appear_ to be the straight forward approach: + * + * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3) + * + * And per (1) we have: + * + * ge->avg.running_avg == grq->avg.running_avg + * + * Which gives: + * + * ge->load.weight * grq->avg.load_avg + * ge->avg.load_avg = ----------------------------------- (4) + * grq->load.weight + * + * Except that is wrong! + * + * Because while for entities historical weight is not important and we + * really only care about our future and therefore can consider a pure + * runnable sum, runqueues can NOT do this. + * + * We specifically want runqueues to have a load_avg that includes + * historical weights. Those represent the blocked load, the load we expect + * to (shortly) return to us. This only works by keeping the weights as + * integral part of the sum. We therefore cannot decompose as per (3). + * + * OK, so what then? 
+ * + * + * Another way to look at things is: + * + * grq->avg.load_avg = \Sum se->avg.load_avg + * + * Therefore, per (2): + * + * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg + * + * And the very thing we're propagating is a change in that sum (someone + * joined/left). So we can easily know the runnable change, which would be, per + * (2) the already tracked se->load_avg divided by the corresponding + * se->weight. + * + * Basically (4) but in differential form: + * + * d(runnable_avg) += se->avg.load_avg / se->load.weight + * (5) + * ge->avg.load_avg += ge->load.weight * d(runnable_avg) + */ + static inline void -update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - struct cfs_rq *gcfs_rq = group_cfs_rq(se); long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; /* Nothing to update */ @@ -3339,102 +3405,59 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; } -/* Take into account change of load of a child task group */ static inline void -update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - struct cfs_rq *gcfs_rq = group_cfs_rq(se); - long delta, load = gcfs_rq->avg.load_avg; + long runnable_sum = gcfs_rq->prop_runnable_sum; + long load_avg; + s64 load_sum; - /* - * If the load of group cfs_rq is null, the load of the - * sched_entity will also be null so we can skip the formula - */ - if (load) { - long tg_load; - - /* Get tg's load and ensure tg_load > 0 */ - tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; - - /* Ensure tg_load >= load and updated with current load*/ - tg_load -= gcfs_rq->tg_load_avg_contrib; - tg_load += load; - - /* - * We need to compute a correction term in the case that the - * task group is consuming more CPU than a task of equal - * weight. A task with a weight equals to tg->shares will have - * a load less or equal to scale_load_down(tg->shares). - * Similarly, the sched_entities that represent the task group - * at parent level, can't have a load higher than - * scale_load_down(tg->shares). And the Sum of sched_entities' - * load must be <= scale_load_down(tg->shares). - */ - if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { - /* scale gcfs_rq's load into tg's shares*/ - load *= scale_load_down(gcfs_rq->tg->shares); - load /= tg_load; - } - } + if (!runnable_sum) + return; - delta = load - se->avg.load_avg; + gcfs_rq->prop_runnable_sum = 0; - /* Nothing to update */ - if (!delta) - return; + load_sum = (s64)se_weight(se) * runnable_sum; + load_avg = div_s64(load_sum, LOAD_AVG_MAX); - /* Set new sched_entity's load */ - se->avg.load_avg = load; - se->avg.load_sum = LOAD_AVG_MAX; + add_positive(&se->avg.load_sum, runnable_sum); + add_positive(&se->avg.load_avg, load_avg); - /* Update parent cfs_rq load */ - add_positive(&cfs_rq->avg.load_avg, delta); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + add_positive(&cfs_rq->avg.load_avg, load_avg); + add_positive(&cfs_rq->avg.load_sum, load_sum); - /* - * If the sched_entity is already enqueued, we also have to update the - * runnable load avg. 
- */ if (se->on_rq) { - /* Update parent cfs_rq runnable_load_avg */ - add_positive(&cfs_rq->runnable_load_avg, delta); - cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + add_positive(&cfs_rq->runnable_load_avg, load_avg); + add_positive(&cfs_rq->runnable_load_sum, load_sum); } } -static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) { - cfs_rq->propagate_avg = 1; -} - -static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = group_cfs_rq(se); - - if (!cfs_rq->propagate_avg) - return 0; - - cfs_rq->propagate_avg = 0; - return 1; + cfs_rq->propagate = 1; + cfs_rq->prop_runnable_sum += runnable_sum; } /* Update task and its cfs_rq load average */ static inline int propagate_entity_load_avg(struct sched_entity *se) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *gcfs_rq; if (entity_is_task(se)) return 0; - if (!test_and_clear_tg_cfs_propagate(se)) + gcfs_rq = group_cfs_rq(se); + if (!gcfs_rq->propagate) return 0; + gcfs_rq->propagate = 0; + cfs_rq = cfs_rq_of(se); - set_tg_cfs_propagate(cfs_rq); + add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum); - update_tg_cfs_util(cfs_rq, se); - update_tg_cfs_load(cfs_rq, se); + update_tg_cfs_util(cfs_rq, se, gcfs_rq); + update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); return 1; } @@ -3458,7 +3481,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) * If there is a pending propagation, we have to update the load and * the utilization of the sched_entity: */ - if (gcfs_rq->propagate_avg) + if (gcfs_rq->propagate) return false; /* @@ -3478,7 +3501,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 0; } -static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3501,7 +3524,7 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - unsigned long removed_load = 0, removed_util = 0; + unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; struct sched_avg *sa = &cfs_rq->avg; int decayed = 0; @@ -3511,6 +3534,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); swap(cfs_rq->removed.load_avg, removed_load); + swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); cfs_rq->removed.nr = 0; raw_spin_unlock(&cfs_rq->removed.lock); @@ -3526,7 +3550,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); - set_tg_cfs_propagate(cfs_rq); + add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); decayed = 1; } @@ -3558,7 +3582,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; - set_tg_cfs_propagate(cfs_rq); + + add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); cfs_rq_util_change(cfs_rq); } @@ -3576,7 +3601,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); - set_tg_cfs_propagate(cfs_rq); + + add_tg_cfs_propagate(cfs_rq, 
-se->avg.load_sum); cfs_rq_util_change(cfs_rq); } @@ -3678,6 +3704,7 @@ void remove_entity_load_avg(struct sched_entity *se) ++cfs_rq->removed.nr; cfs_rq->removed.util_avg += se->avg.util_avg; cfs_rq->removed.load_avg += se->avg.load_avg; + cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } @@ -9466,9 +9493,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->propagate_avg = 0; -#endif raw_spin_lock_init(&cfs_rq->removed.lock); #endif } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2fd350a12bb7..5bcb86eb026b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -447,19 +447,20 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifndef CONFIG_64BIT u64 load_last_update_time_copy; -#endif -#ifdef CONFIG_FAIR_GROUP_SCHED - unsigned long tg_load_avg_contrib; - unsigned long propagate_avg; #endif struct { raw_spinlock_t lock ____cacheline_aligned; int nr; unsigned long load_avg; unsigned long util_avg; + unsigned long runnable_sum; } removed; #ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; + long propagate; + long prop_runnable_sum; + /* * h_load = weight * f(tg) * -- cgit v1.3-8-gc7d7 From 1ea6c46a23f1213d1972bfae220db5c165e27bba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 15:59:54 +0200 Subject: sched/fair: Propagate an effective runnable_load_avg The load balancer uses runnable_load_avg as load indicator. For !cgroup this is: runnable_load_avg = \Sum se->avg.load_avg ; where se->on_rq That is, a direct sum of all runnable tasks on that runqueue. As opposed to load_avg, which is a sum of all tasks on the runqueue, which includes a blocked component. However, in the cgroup case, this comes apart since the group entities are always runnable, even if most of their constituent entities are blocked. Therefore introduce a runnable_weight which for task entities is the same as the regular weight, but for group entities is a fraction of the entity weight and represents the runnable part of the group runqueue. Then propagate this load through the PELT hierarchy to arrive at an effective runnable load avgerage -- which we should not confuse with the canonical runnable load average. 
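As a loose illustration of the group-entity case (a toy with invented numbers, not the patch's code; the clamping mirrors the "runnable > load" guard in update_cfs_group() below): the group's runnable_weight is its shares scaled by the runnable share of its runqueue load.

#include <stdio.h>

static unsigned long group_runnable_weight(unsigned long shares,
					   unsigned long runnable_load_avg,
					   unsigned long load_avg)
{
	/* guard against sporadic runnable > load due to numerical instability */
	unsigned long divider = load_avg > runnable_load_avg ? load_avg : runnable_load_avg;

	return divider ? shares * runnable_load_avg / divider : 0;
}

int main(void)
{
	/* invented numbers: shares of 1024, only a quarter of the load is runnable */
	printf("%lu\n", group_runnable_weight(1024, 256, 1024));	/* prints 256 */
	return 0;
}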
Suggested-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 + kernel/sched/debug.c | 8 ++- kernel/sched/fair.c | 172 +++++++++++++++++++++++++++++++++----------------- kernel/sched/sched.h | 3 +- 4 files changed, 124 insertions(+), 62 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a7df4e558c..bdd6ad6fcce1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -331,9 +331,11 @@ struct load_weight { struct sched_avg { u64 last_update_time; u64 load_sum; + u64 runnable_load_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; + unsigned long runnable_load_avg; unsigned long util_avg; }; @@ -376,6 +378,7 @@ struct sched_statistics { struct sched_entity { /* For load-balancing: */ struct load_weight load; + unsigned long runnable_weight; struct rb_node run_node; struct list_head group_node; unsigned int on_rq; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2e039a81864c..1ca0130ed4f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P_SCHEDSTAT(se->statistics.wait_count); } P(se->load.weight); + P(se->runnable_weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); + P(se->avg.runnable_load_avg); #endif #undef PN_SCHEDSTAT @@ -558,10 +560,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", - cfs_rq->runnable_load_avg); + cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", @@ -1006,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); + P(se.runnable_weight); #ifdef CONFIG_SMP P(se.avg.load_sum); + P(se.avg.runnable_load_sum); P(se.avg.util_sum); P(se.avg.load_avg); + P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 086a5d979720..10d2000fca2d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -730,8 +730,9 @@ void init_entity_runnable_average(struct sched_entity *se) * nothing has been attached to the task group yet. */ if (entity_is_task(se)) - sa->load_avg = scale_load_down(se->load.weight); - sa->load_sum = LOAD_AVG_MAX; + sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); + sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX; + /* * At this point, util_avg won't be used in select_task_rq_fair anyway */ @@ -2731,25 +2732,35 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_SMP /* - * XXX we want to get rid of this helper and use the full load resolution. + * XXX we want to get rid of these helpers and use the full load resolution. 
*/ static inline long se_weight(struct sched_entity *se) { return scale_load_down(se->load.weight); } +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} + static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->runnable_load_avg += se->avg.load_avg; - cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; + cfs_rq->runnable_weight += se->runnable_weight; + + cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; + cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; } static inline void dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); + cfs_rq->runnable_weight -= se->runnable_weight; + + sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); + sub_positive(&cfs_rq->avg.runnable_load_sum, + se_runnable(se) * se->avg.runnable_load_sum); } static inline void @@ -2777,7 +2788,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) + unsigned long weight, unsigned long runnable) { if (se->on_rq) { /* commit outstanding execution time */ @@ -2788,11 +2799,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } dequeue_load_avg(cfs_rq, se); + se->runnable_weight = runnable; update_load_set(&se->load, weight); #ifdef CONFIG_SMP - se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, - LOAD_AVG_MAX - 1024 + se->avg.period_contrib); + do { + u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); + se->avg.runnable_load_avg = + div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); + } while (0); #endif enqueue_load_avg(cfs_rq, se); @@ -2809,7 +2826,7 @@ void reweight_task(struct task_struct *p, int prio) struct load_weight *load = &se->load; unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight); + reweight_entity(cfs_rq, se, weight, weight); load->inv_weight = sched_prio_to_wmult[prio]; } @@ -2917,31 +2934,45 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq) static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct sched_entity *se) +/* + * Recomputes the group entity based on the current state of its group + * runqueue. + */ +static void update_cfs_group(struct sched_entity *se) { - struct cfs_rq *cfs_rq = group_cfs_rq(se); - long shares; + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long shares, runnable; - if (!cfs_rq) + if (!gcfs_rq) return; - if (throttled_hierarchy(cfs_rq)) + if (throttled_hierarchy(gcfs_rq)) return; #ifndef CONFIG_SMP - shares = READ_ONCE(cfs_rq->tg->shares); + runnable = shares = READ_ONCE(gcfs_rq->tg->shares); if (likely(se->load.weight == shares)) return; #else - shares = calc_cfs_shares(cfs_rq); + shares = calc_cfs_shares(gcfs_rq); + /* + * The hierarchical runnable load metric is the proportional part + * of this group's runnable_load_avg / load_avg. + * + * Note: we need to deal with very sporadic 'runnable > load' cases + * due to numerical instability. 
+ */ + runnable = shares * gcfs_rq->avg.runnable_load_avg; + if (runnable) + runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg); #endif - reweight_entity(cfs_rq_of(se), se, shares); + reweight_entity(cfs_rq_of(se), se, shares, runnable); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct sched_entity *se) +static inline void update_cfs_group(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3050,7 +3081,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) */ static __always_inline u32 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long load, unsigned long runnable, int running) { unsigned long scale_freq, scale_cpu; u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ @@ -3067,10 +3098,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ if (periods) { sa->load_sum = decay_load(sa->load_sum, periods); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods); - } + sa->runnable_load_sum = + decay_load(sa->runnable_load_sum, periods); sa->util_sum = decay_load((u64)(sa->util_sum), periods); /* @@ -3083,11 +3112,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, sa->period_contrib = delta; contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } + if (load) + sa->load_sum += load * contrib; + if (runnable) + sa->runnable_load_sum += runnable * contrib; if (running) sa->util_sum += contrib * scale_cpu; @@ -3124,7 +3152,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ static __always_inline int ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -3157,8 +3185,8 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * this happens during idle_balance() which calls * update_blocked_averages() */ - if (!weight) - running = 0; + if (!load) + runnable = running = 0; /* * Now we know we crossed measurement unit boundaries. The *_avg @@ -3167,45 +3195,60 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. */ - if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) return 0; return 1; } static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq) +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) { u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; /* * Step 2: update *_avg. 
*/ - sa->load_avg = div_u64(weight * sa->load_sum, divider); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, divider); - } + sa->load_avg = div_u64(load * sa->load_sum, divider); + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); sa->util_avg = sa->util_sum / divider; } /* * sched_entity: * + * task: + * se_runnable() == se_weight() + * + * group: [ see update_cfs_group() ] + * se_weight() = tg->weight * grq->load_avg / tg->load_avg + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * * load_sum := runnable_sum * load_avg = se_weight(se) * runnable_avg * + * runnable_load_sum := runnable_sum + * runnable_load_avg = se_runnable(se) * runnable_avg + * + * XXX collapse load_sum and runnable_load_sum + * * cfq_rs: * * load_sum = \Sum se_weight(se) * se->avg.load_sum * load_avg = \Sum se->avg.load_avg + * + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum + * runnable_load_avg = \Sum se->avg.runable_load_avg */ static int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) { - ___update_load_avg(&se->avg, se_weight(se), NULL); + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -3215,10 +3258,13 @@ __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) static int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, - cfs_rq->curr == se, NULL)) { + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + cfs_rq->curr == se)) { - ___update_load_avg(&se->avg, se_weight(se), NULL); + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -3230,8 +3276,10 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, cpu, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->curr != NULL, cfs_rq)) { - ___update_load_avg(&cfs_rq->avg, 1, cfs_rq); + scale_load_down(cfs_rq->runnable_weight), + cfs_rq->curr != NULL)) { + + ___update_load_avg(&cfs_rq->avg, 1, 1); return 1; } @@ -3409,8 +3457,8 @@ static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long runnable_sum = gcfs_rq->prop_runnable_sum; - long load_avg; - s64 load_sum; + long runnable_load_avg, load_avg; + s64 runnable_load_sum, load_sum; if (!runnable_sum) return; @@ -3426,9 +3474,15 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf add_positive(&cfs_rq->avg.load_avg, load_avg); add_positive(&cfs_rq->avg.load_sum, load_sum); + runnable_load_sum = (s64)se_runnable(se) * runnable_sum; + runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); + + add_positive(&se->avg.runnable_load_sum, runnable_sum); + add_positive(&se->avg.runnable_load_avg, runnable_load_avg); + if (se->on_rq) { - add_positive(&cfs_rq->runnable_load_avg, load_avg); - add_positive(&cfs_rq->runnable_load_sum, load_sum); + add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); + add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); } } @@ -3710,7 +3764,7 @@ void remove_entity_load_avg(struct sched_entity *se) static inline unsigned long cfs_rq_runnable_load_avg(struct 
cfs_rq *cfs_rq) { - return cfs_rq->runnable_load_avg; + return cfs_rq->avg.runnable_load_avg; } static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) @@ -3882,8 +3936,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + update_cfs_group(se); enqueue_runnable_load_avg(cfs_rq, se); - update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) @@ -3989,7 +4043,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(se); + update_cfs_group(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -4172,7 +4226,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. */ update_load_avg(cfs_rq, curr, UPDATE_TG); - update_cfs_shares(curr); + update_cfs_group(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -5090,7 +5144,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } if (!se) @@ -5149,7 +5203,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } if (!se) @@ -7174,7 +7228,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->avg.util_sum) return false; - if (cfs_rq->runnable_load_sum) + if (cfs_rq->avg.runnable_load_sum) return false; return true; @@ -9692,7 +9746,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) update_rq_clock(rq); for_each_sched_entity(se) { update_load_avg(cfs_rq_of(se), se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } rq_unlock_irqrestore(rq, &rf); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5bcb86eb026b..e83d1b8be611 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -418,6 +418,7 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; + unsigned long runnable_weight; unsigned int nr_running, h_nr_running; u64 exec_clock; @@ -443,8 +444,6 @@ struct cfs_rq { * CFS load tracking */ struct sched_avg avg; - u64 runnable_load_sum; - unsigned long runnable_load_avg; #ifndef CONFIG_64BIT u64 load_last_update_time_copy; #endif -- cgit v1.3-8-gc7d7
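A final standalone sketch (an illustration of the pattern with invented names, not code from the series): the "removed" rework above replaces the two removed_*_avg atomics with a small lock-protected accumulator that the next update drains in one go, clipping at zero in the spirit of sub_positive().

#include <pthread.h>
#include <stdio.h>

/* invented toy with the same shape as the new cfs_rq->removed block */
struct removed_acc {
	pthread_mutex_t lock;
	int nr;
	unsigned long load_avg;
	unsigned long util_avg;
};

/* remove side: just accumulate under the lock (cf. remove_entity_load_avg()) */
static void note_removed(struct removed_acc *acc, unsigned long load, unsigned long util)
{
	pthread_mutex_lock(&acc->lock);
	acc->nr++;
	acc->load_avg += load;
	acc->util_avg += util;
	pthread_mutex_unlock(&acc->lock);
}

/* update side: drain everything in one go (cf. update_cfs_rq_load_avg()) */
static void apply_removed(struct removed_acc *acc, unsigned long *load_avg, unsigned long *util_avg)
{
	unsigned long removed_load = 0, removed_util = 0;

	pthread_mutex_lock(&acc->lock);
	if (acc->nr) {
		removed_load = acc->load_avg;
		removed_util = acc->util_avg;
		acc->load_avg = acc->util_avg = 0;
		acc->nr = 0;
	}
	pthread_mutex_unlock(&acc->lock);

	/* clip at zero rather than going negative */
	*load_avg = *load_avg > removed_load ? *load_avg - removed_load : 0;
	*util_avg = *util_avg > removed_util ? *util_avg - removed_util : 0;
}

int main(void)
{
	struct removed_acc acc = { PTHREAD_MUTEX_INITIALIZER, 0, 0, 0 };
	unsigned long load = 1000, util = 400;

	note_removed(&acc, 300, 100);
	note_removed(&acc, 200, 50);
	apply_removed(&acc, &load, &util);
	printf("load=%lu util=%lu\n", load, util);	/* prints load=500 util=250 */
	return 0;
}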