From 3b03706fa621ce31a3e9ef6307020fde4e6aae16 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 18 Mar 2021 13:38:50 +0100 Subject: sched: Fix various typos Fix ~42 single-word typos in scheduler code comments. We have accumulated a few fun ones over the years. :-) Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: linux-kernel@vger.kernel.org --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 486f403a778b..4b49cc2af5c4 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -815,7 +815,7 @@ void sysrq_sched_debug_show(void) } /* - * This itererator needs some explanation. + * This iterator needs some explanation. * It returns 1 for the header position. * This means 2 is CPU 0. * In a hotplugged system some CPUs, including CPU 0, may be missing so we have -- cgit v1.2.3-59-g8ed1b From 8a99b6833c884fa0e7919030d93fecedc69fc625 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Mar 2021 11:43:21 +0100 Subject: sched: Move SCHED_DEBUG sysctl to debugfs Stop polluting sysctl with undocumented knobs that really are debug only, move them all to /debug/sched/ along with the existing /debug/sched_* files that already exist. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Greg Kroah-Hartman Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210412102001.287610138@infradead.org --- include/linux/sched/sysctl.h | 8 ++-- kernel/sched/core.c | 4 +- kernel/sched/debug.c | 74 ++++++++++++++++++++++++++++++++-- kernel/sched/fair.c | 9 +---- kernel/sched/sched.h | 2 + kernel/sysctl.c | 96 -------------------------------------------- 6 files changed, 80 insertions(+), 113 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 3c31ba88aca5..0a3f34638cf5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -26,10 +26,11 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, enum { sysctl_hung_task_timeout_secs = 0 }; #endif +extern unsigned int sysctl_sched_child_runs_first; + extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_child_runs_first; enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, @@ -37,7 +38,7 @@ enum sched_tunable_scaling { SCHED_TUNABLESCALING_LINEAR, SCHED_TUNABLESCALING_END, }; -extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; +extern unsigned int sysctl_sched_tunable_scaling; extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; @@ -47,9 +48,6 @@ extern unsigned int sysctl_numa_balancing_scan_size; #ifdef CONFIG_SCHED_DEBUG extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; - -int sched_proc_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos); #endif /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d031da20df3..bac30db29d3f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5504,9 +5504,11 @@ static const struct file_operations sched_dynamic_fops = { .release = single_release, }; +extern struct dentry *debugfs_sched; + static __init int sched_init_debug_dynamic(void) { - debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops); + debugfs_create_file("sched_preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); return 0; } late_initcall(sched_init_debug_dynamic); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4b49cc2af5c4..2093b9086e34 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -169,15 +169,81 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; +#ifdef CONFIG_SMP + +static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[16]; + + if (cnt > 15) + cnt = 15; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling)) + return -EINVAL; + + if (sched_update_scaling()) + return -EINVAL; + + *ppos += cnt; + return cnt; +} + +static int sched_scaling_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", sysctl_sched_tunable_scaling); + return 0; +} + +static int sched_scaling_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_scaling_show, NULL); +} + +static const struct file_operations sched_scaling_fops = { + .open = sched_scaling_open, + .write = sched_scaling_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* SMP */ + __read_mostly bool sched_debug_enabled; +struct dentry *debugfs_sched; + static __init int sched_init_debug(void) { - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); + struct dentry __maybe_unused *numa; - debugfs_create_bool("sched_debug", 0644, NULL, - &sched_debug_enabled); + debugfs_sched = debugfs_create_dir("sched", NULL); + + debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops); + debugfs_create_bool("debug_enabled", 0644, debugfs_sched, &sched_debug_enabled); + + debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); + debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); + debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); + +#ifdef CONFIG_SMP + debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); + debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); + debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); +#endif + +#ifdef CONFIG_NUMA_BALANCING + numa = debugfs_create_dir("numa_balancing", debugfs_sched); + + debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay); + debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min); + debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); + debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); +#endif return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b8ae02f1994..b3ea14c08a9b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -49,7 +49,7 @@ static unsigned int normalized_sysctl_sched_latency = 6000000ULL; * * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ -enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: @@ -634,15 +634,10 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ -int sched_proc_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +int sched_update_scaling(void) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); - if (ret || !write) - return ret; - sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7e7e936b4938..123ff3bb90bc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1568,6 +1568,8 @@ static inline void unregister_sched_domain_sysctl(void) } #endif +extern int sched_update_scaling(void); + extern void flush_smp_call_function_from_idle(void); #else /* !CONFIG_SMP: */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 17f1cc9988ca..4bff44d47154 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -184,17 +184,6 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; int sysctl_legacy_va_layout; #endif -#ifdef CONFIG_SCHED_DEBUG -static int min_sched_granularity_ns = 100000; /* 100 usecs */ -static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ -static int min_wakeup_granularity_ns; /* 0 usecs */ -static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ -#ifdef CONFIG_SMP -static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; -static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif /* CONFIG_SMP */ -#endif /* CONFIG_SCHED_DEBUG */ - #ifdef CONFIG_COMPACTION static int min_extfrag_threshold; static int max_extfrag_threshold = 1000; @@ -1659,91 +1648,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_SCHED_DEBUG - { - .procname = "sched_min_granularity_ns", - .data = &sysctl_sched_min_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .procname = "sched_latency_ns", - .data = &sysctl_sched_latency, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .procname = "sched_wakeup_granularity_ns", - .data = &sysctl_sched_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, -#ifdef CONFIG_SMP - { - .procname = "sched_tunable_scaling", - .data = &sysctl_sched_tunable_scaling, - .maxlen = sizeof(enum sched_tunable_scaling), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_tunable_scaling, - .extra2 = &max_sched_tunable_scaling, - }, - { - .procname = "sched_migration_cost_ns", - .data = &sysctl_sched_migration_cost, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_nr_migrate", - .data = &sysctl_sched_nr_migrate, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif /* CONFIG_SMP */ -#ifdef CONFIG_NUMA_BALANCING - { - .procname = "numa_balancing_scan_delay_ms", - .data = &sysctl_numa_balancing_scan_delay, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_min_ms", - .data = &sysctl_numa_balancing_scan_period_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_max_ms", - .data = &sysctl_numa_balancing_scan_period_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_size_mb", - .data = &sysctl_numa_balancing_scan_size, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, -#endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", -- cgit v1.2.3-59-g8ed1b From 1011dcce99f8026d48fdd7b9cc259e32a8b472be Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Mar 2021 12:21:38 +0100 Subject: sched,preempt: Move preempt_dynamic to debug.c Move the #ifdef SCHED_DEBUG bits to kernel/sched/debug.c in order to collect all the debugfs bits. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210412102001.353833279@infradead.org --- kernel/sched/core.c | 77 ++-------------------------------------------------- kernel/sched/debug.c | 67 ++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 11 ++++++-- 3 files changed, 78 insertions(+), 77 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bac30db29d3f..e6c714b9b211 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5371,9 +5371,9 @@ enum { preempt_dynamic_full, }; -static int preempt_dynamic_mode = preempt_dynamic_full; +int preempt_dynamic_mode = preempt_dynamic_full; -static int sched_dynamic_mode(const char *str) +int sched_dynamic_mode(const char *str) { if (!strcmp(str, "none")) return preempt_dynamic_none; @@ -5387,7 +5387,7 @@ static int sched_dynamic_mode(const char *str) return -EINVAL; } -static void sched_dynamic_update(int mode) +void sched_dynamic_update(int mode) { /* * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in @@ -5444,79 +5444,8 @@ static int __init setup_preempt_mode(char *str) } __setup("preempt=", setup_preempt_mode); -#ifdef CONFIG_SCHED_DEBUG - -static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[16]; - int mode; - - if (cnt > 15) - cnt = 15; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - mode = sched_dynamic_mode(strstrip(buf)); - if (mode < 0) - return mode; - - sched_dynamic_update(mode); - - *ppos += cnt; - - return cnt; -} - -static int sched_dynamic_show(struct seq_file *m, void *v) -{ - static const char * preempt_modes[] = { - "none", "voluntary", "full" - }; - int i; - - for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { - if (preempt_dynamic_mode == i) - seq_puts(m, "("); - seq_puts(m, preempt_modes[i]); - if (preempt_dynamic_mode == i) - seq_puts(m, ")"); - - seq_puts(m, " "); - } - - seq_puts(m, "\n"); - return 0; -} - -static int sched_dynamic_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_dynamic_show, NULL); -} - -static const struct file_operations sched_dynamic_fops = { - .open = sched_dynamic_open, - .write = sched_dynamic_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -extern struct dentry *debugfs_sched; - -static __init int sched_init_debug_dynamic(void) -{ - debugfs_create_file("sched_preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); - return 0; -} -late_initcall(sched_init_debug_dynamic); - -#endif /* CONFIG_SCHED_DEBUG */ #endif /* CONFIG_PREEMPT_DYNAMIC */ - /* * This is the entry point to schedule() from kernel preemption * off of irq context. diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2093b9086e34..bdd344f92c41 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -213,9 +213,71 @@ static const struct file_operations sched_scaling_fops = { #endif /* SMP */ +#ifdef CONFIG_PREEMPT_DYNAMIC + +static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[16]; + int mode; + + if (cnt > 15) + cnt = 15; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + mode = sched_dynamic_mode(strstrip(buf)); + if (mode < 0) + return mode; + + sched_dynamic_update(mode); + + *ppos += cnt; + + return cnt; +} + +static int sched_dynamic_show(struct seq_file *m, void *v) +{ + static const char * preempt_modes[] = { + "none", "voluntary", "full" + }; + int i; + + for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { + if (preempt_dynamic_mode == i) + seq_puts(m, "("); + seq_puts(m, preempt_modes[i]); + if (preempt_dynamic_mode == i) + seq_puts(m, ")"); + + seq_puts(m, " "); + } + + seq_puts(m, "\n"); + return 0; +} + +static int sched_dynamic_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_dynamic_show, NULL); +} + +static const struct file_operations sched_dynamic_fops = { + .open = sched_dynamic_open, + .write = sched_dynamic_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_PREEMPT_DYNAMIC */ + __read_mostly bool sched_debug_enabled; -struct dentry *debugfs_sched; +static struct dentry *debugfs_sched; static __init int sched_init_debug(void) { @@ -225,6 +287,9 @@ static __init int sched_init_debug(void) debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops); debugfs_create_bool("debug_enabled", 0644, debugfs_sched, &sched_debug_enabled); +#ifdef CONFIG_PREEMPT_DYNAMIC + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); +#endif debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 123ff3bb90bc..c312389113ce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2734,5 +2734,12 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) } #endif -void swake_up_all_locked(struct swait_queue_head *q); -void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern void swake_up_all_locked(struct swait_queue_head *q); +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#ifdef CONFIG_PREEMPT_DYNAMIC +extern int preempt_dynamic_mode; +extern int sched_dynamic_mode(const char *str); +extern void sched_dynamic_update(int mode); +#endif + -- cgit v1.2.3-59-g8ed1b From 3b87f136f8fccddf7da016ab7d04bb3cf9b180f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Mar 2021 11:31:20 +0100 Subject: sched,debug: Convert sysctl sched_domains to debugfs Stop polluting sysctl, move to debugfs for SCHED_DEBUG stuff. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/YHgB/s4KCBQ1ifdm@hirez.programming.kicks-ass.net --- kernel/sched/debug.c | 254 +++++++++++------------------------------------- kernel/sched/sched.h | 10 +- kernel/sched/topology.c | 6 +- 3 files changed, 59 insertions(+), 211 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index bdd344f92c41..b25de7b07792 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -299,6 +299,10 @@ static __init int sched_init_debug(void) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + + mutex_lock(&sched_domains_mutex); + update_sched_domain_debugfs(); + mutex_unlock(&sched_domains_mutex); #endif #ifdef CONFIG_NUMA_BALANCING @@ -316,229 +320,88 @@ late_initcall(sched_init_debug); #ifdef CONFIG_SMP -#ifdef CONFIG_SYSCTL - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; -} +static cpumask_var_t sd_sysctl_cpus; +static struct dentry *sd_dentry; -static int sd_ctl_doflags(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int sd_flags_show(struct seq_file *m, void *v) { - unsigned long flags = *(unsigned long *)table->data; - size_t data_size = 0; - size_t len = 0; - char *tmp, *buf; + unsigned long flags = *(unsigned int *)m->private; int idx; - if (write) - return 0; - - for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { - char *name = sd_flag_debug[idx].name; - - /* Name plus whitespace */ - data_size += strlen(name) + 1; - } - - if (*ppos > data_size) { - *lenp = 0; - return 0; - } - - buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL); - if (!buf) - return -ENOMEM; - for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { - char *name = sd_flag_debug[idx].name; - - len += snprintf(buf + len, strlen(name) + 2, "%s ", name); - } - - tmp = buf + *ppos; - len -= *ppos; - - if (len > *lenp) - len = *lenp; - if (len) - memcpy(buffer, tmp, len); - if (len < *lenp) { - ((char *)buffer)[len] = '\n'; - len++; + seq_puts(m, sd_flag_debug[idx].name); + seq_puts(m, " "); } - - *lenp = len; - *ppos += len; - - kfree(buf); + seq_puts(m, "\n"); return 0; } -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(9); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags); - set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); - /* &table[8] is terminator */ - - return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +static int sd_flags_open(struct inode *inode, struct file *file) { - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; + return single_open(file, sd_flags_show, inode->i_private); } -static cpumask_var_t sd_sysctl_cpus; -static struct ctl_table_header *sd_sysctl_header; +static const struct file_operations sd_flags_fops = { + .open = sd_flags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; -void register_sched_domain_sysctl(void) +static void register_sd(struct sched_domain *sd, struct dentry *parent) { - static struct ctl_table *cpu_entries; - static struct ctl_table **cpu_idx; - static bool init_done = false; - char buf[32]; - int i; +#define SDM(type, mode, member) \ + debugfs_create_##type(#member, mode, parent, &sd->member) - if (!cpu_entries) { - cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); - if (!cpu_entries) - return; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = cpu_entries; - } + SDM(ulong, 0644, min_interval); + SDM(ulong, 0644, max_interval); + SDM(u64, 0644, max_newidle_lb_cost); + SDM(u32, 0644, busy_factor); + SDM(u32, 0644, imbalance_pct); + SDM(u32, 0644, cache_nice_tries); + SDM(str, 0444, name); - if (!cpu_idx) { - struct ctl_table *e = cpu_entries; +#undef SDM - cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); - if (!cpu_idx) - return; + debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); +} - /* deal with sparse possible map */ - for_each_possible_cpu(i) { - cpu_idx[i] = e; - e++; - } - } +void update_sched_domain_debugfs(void) +{ + int cpu, i; if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; - } - - if (!init_done) { - init_done = true; - /* init to possible to not have holes in @cpu_entries */ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); } - for_each_cpu(i, sd_sysctl_cpus) { - struct ctl_table *e = cpu_idx[i]; + if (!sd_dentry) + sd_dentry = debugfs_create_dir("domains", debugfs_sched); + + for_each_cpu(cpu, sd_sysctl_cpus) { + struct sched_domain *sd; + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%d", cpu); + debugfs_remove(debugfs_lookup(buf, sd_dentry)); + d_cpu = debugfs_create_dir(buf, sd_dentry); - if (e->child) - sd_free_ctl_entry(&e->child); + i = 0; + for_each_domain(cpu, sd) { + struct dentry *d_sd; - if (!e->procname) { - snprintf(buf, 32, "cpu%d", i); - e->procname = kstrdup(buf, GFP_KERNEL); + snprintf(buf, sizeof(buf), "domain%d", i); + d_sd = debugfs_create_dir(buf, d_cpu); + + register_sd(sd, d_sd); + i++; } - e->mode = 0555; - e->child = sd_alloc_ctl_cpu_table(i); - __cpumask_clear_cpu(i, sd_sysctl_cpus); + __cpumask_clear_cpu(cpu, sd_sysctl_cpus); } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); } void dirty_sched_domain_sysctl(int cpu) @@ -547,13 +410,6 @@ void dirty_sched_domain_sysctl(int cpu) __cpumask_set_cpu(cpu, sd_sysctl_cpus); } -/* may be called multiple times per register */ -void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; -} -#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c312389113ce..55232db745ac 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1552,20 +1552,16 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -void register_sched_domain_sysctl(void); +#ifdef CONFIG_SCHED_DEBUG +void update_sched_domain_debugfs(void); void dirty_sched_domain_sysctl(int cpu); -void unregister_sched_domain_sysctl(void); #else -static inline void register_sched_domain_sysctl(void) +static inline void update_sched_domain_debugfs(void) { } static inline void dirty_sched_domain_sysctl(int cpu) { } -static inline void unregister_sched_domain_sysctl(void) -{ -} #endif extern int sched_update_scaling(void); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d1aec244c027..c343aedd9caa 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2223,7 +2223,6 @@ int sched_init_domains(const struct cpumask *cpu_map) doms_cur = &fallback_doms; cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); err = build_sched_domains(doms_cur[0], NULL); - register_sched_domain_sysctl(); return err; } @@ -2298,9 +2297,6 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], lockdep_assert_held(&sched_domains_mutex); - /* Always unregister in case we don't destroy any domains: */ - unregister_sched_domain_sysctl(); - /* Let the architecture update CPU core mappings: */ new_topology = arch_update_cpu_topology(); @@ -2389,7 +2385,7 @@ match3: dattr_cur = dattr_new; ndoms_cur = ndoms_new; - register_sched_domain_sysctl(); + update_sched_domain_debugfs(); } /* -- cgit v1.2.3-59-g8ed1b From d27e9ae2f244805bbdc730d85fba28685d2471e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Mar 2021 15:18:19 +0100 Subject: sched: Move /proc/sched_debug to debugfs Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Greg Kroah-Hartman Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210412102001.548833671@infradead.org --- kernel/sched/debug.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b25de7b07792..bf199d6ca906 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -277,6 +277,20 @@ static const struct file_operations sched_dynamic_fops = { __read_mostly bool sched_debug_enabled; +static const struct seq_operations sched_debug_sops; + +static int sched_debug_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &sched_debug_sops); +} + +static const struct file_operations sched_debug_fops = { + .open = sched_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static struct dentry *debugfs_sched; static __init int sched_init_debug(void) @@ -314,6 +328,8 @@ static __init int sched_init_debug(void) debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); #endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + return 0; } late_initcall(sched_init_debug); @@ -847,15 +863,6 @@ static const struct seq_operations sched_debug_sops = { .show = sched_debug_show, }; -static int __init init_sched_debug_procfs(void) -{ - if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops)) - return -ENOMEM; - return 0; -} - -__initcall(init_sched_debug_procfs); - #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F)) #define __P(F) __PS(#F, F) #define P(F) __PS(#F, p->F) -- cgit v1.2.3-59-g8ed1b From 9406415f46f6127fd31bb66f0260f7a61a8d2786 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Apr 2021 18:23:17 +0200 Subject: sched/debug: Rename the sched_debug parameter to sched_verbose CONFIG_SCHED_DEBUG is the build-time Kconfig knob, the boot param sched_debug and the /debug/sched/debug_enabled knobs control the sched_debug_enabled variable, but what they really do is make SCHED_DEBUG more verbose, so rename the lot. Signed-off-by: Peter Zijlstra (Intel) --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/scheduler/sched-domains.rst | 10 +++++----- kernel/sched/debug.c | 4 ++-- kernel/sched/sched.h | 2 +- kernel/sched/topology.c | 12 ++++++------ 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 04545725f187..9e4c026f4837 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4725,7 +4725,7 @@ sbni= [NET] Granch SBNI12 leased line adapter - sched_debug [KNL] Enables verbose scheduler debug messages. + sched_verbose [KNL] Enables verbose scheduler debug messages. schedstats= [KNL,X86] Enable or disable scheduled statistics. Allowed values are enable and disable. This feature diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index 8582fa5e9170..14ea2f21d20e 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -74,8 +74,8 @@ for a given topology level by creating a sched_domain_topology_level array and calling set_sched_topology() with this array as the parameter. The sched-domains debugging infrastructure can be enabled by enabling -CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to -tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug -knob. This enables an error checking parse of the sched domains which should -catch most possible errors (described above). It also prints out the domain -structure in a visual format. +CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you +forgot to tweak your cmdline, you can also flip the +/sys/kernel/debug/sched/verbose knob. This enables an error checking parse of +the sched domains which should catch most possible errors (described above). It +also prints out the domain structure in a visual format. diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index bf199d6ca906..461342f598ef 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -275,7 +275,7 @@ static const struct file_operations sched_dynamic_fops = { #endif /* CONFIG_PREEMPT_DYNAMIC */ -__read_mostly bool sched_debug_enabled; +__read_mostly bool sched_debug_verbose; static const struct seq_operations sched_debug_sops; @@ -300,7 +300,7 @@ static __init int sched_init_debug(void) debugfs_sched = debugfs_create_dir("sched", NULL); debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops); - debugfs_create_bool("debug_enabled", 0644, debugfs_sched, &sched_debug_enabled); + debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose); #ifdef CONFIG_PREEMPT_DYNAMIC debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55232db745ac..bde72482cbb8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2363,7 +2363,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); #ifdef CONFIG_SCHED_DEBUG -extern bool sched_debug_enabled; +extern bool sched_debug_verbose; extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c343aedd9caa..55a0a243e871 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -14,15 +14,15 @@ static cpumask_var_t sched_domains_tmpmask2; static int __init sched_debug_setup(char *str) { - sched_debug_enabled = true; + sched_debug_verbose = true; return 0; } -early_param("sched_debug", sched_debug_setup); +early_param("sched_verbose", sched_debug_setup); static inline bool sched_debug(void) { - return sched_debug_enabled; + return sched_debug_verbose; } #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, @@ -131,7 +131,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_debug_enabled) + if (!sched_debug_verbose) return; if (!sd) { @@ -152,7 +152,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ -# define sched_debug_enabled 0 +# define sched_debug_verbose 0 # define sched_domain_debug(sd, cpu) do { } while (0) static inline bool sched_debug(void) { @@ -2141,7 +2141,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); - if (rq && sched_debug_enabled) { + if (rq && sched_debug_verbose) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); } -- cgit v1.2.3-59-g8ed1b From c006fac556e401a62054d065da168099ea5a5b10 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 16 Apr 2021 14:29:36 -0700 Subject: sched: Warn on long periods of pending need_resched CPU scheduler marks need_resched flag to signal a schedule() on a particular CPU. But, schedule() may not happen immediately in cases where the current task is executing in the kernel mode (no preemption state) for extended periods of time. This patch adds a warn_on if need_resched is pending for more than the time specified in sysctl resched_latency_warn_ms. If it goes off, it is likely that there is a missing cond_resched() somewhere. Monitoring is done via the tick and the accuracy is hence limited to jiffy scale. This also means that we won't trigger the warning if the tick is disabled. This feature (LATENCY_WARN) is default disabled. Signed-off-by: Paul Turner Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210416212936.390566-1-joshdon@google.com --- include/linux/sched/sysctl.h | 3 ++ kernel/sched/core.c | 70 +++++++++++++++++++++++++++++++++++++++++++- kernel/sched/debug.c | 13 ++++++++ kernel/sched/features.h | 2 ++ kernel/sched/sched.h | 10 +++++++ 5 files changed, 97 insertions(+), 1 deletion(-) (limited to 'kernel/sched/debug.c') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 0a3f34638cf5..db2c0f34aaaf 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -48,6 +48,9 @@ extern unsigned int sysctl_numa_balancing_scan_size; #ifdef CONFIG_SCHED_DEBUG extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; + +extern int sysctl_resched_latency_warn_ms; +extern int sysctl_resched_latency_warn_once; #endif /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e6c714b9b211..fcb35ae15619 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -58,7 +58,17 @@ const_debug unsigned int sysctl_sched_features = #include "features.h" 0; #undef SCHED_FEAT -#endif + +/* + * Print a warning if need_resched is set for the given duration (if + * LATENCY_WARN is enabled). + * + * If sysctl_resched_latency_warn_once is set, only one warning will be shown + * per boot. + */ +__read_mostly int sysctl_resched_latency_warn_ms = 100; +__read_mostly int sysctl_resched_latency_warn_once = 1; +#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. @@ -4527,6 +4537,55 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_SCHED_DEBUG +static u64 cpu_resched_latency(struct rq *rq) +{ + int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); + u64 resched_latency, now = rq_clock(rq); + static bool warned_once; + + if (sysctl_resched_latency_warn_once && warned_once) + return 0; + + if (!need_resched() || !latency_warn_ms) + return 0; + + if (system_state == SYSTEM_BOOTING) + return 0; + + if (!rq->last_seen_need_resched_ns) { + rq->last_seen_need_resched_ns = now; + rq->ticks_without_resched = 0; + return 0; + } + + rq->ticks_without_resched++; + resched_latency = now - rq->last_seen_need_resched_ns; + if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC) + return 0; + + warned_once = true; + + return resched_latency; +} + +static int __init setup_resched_latency_warn_ms(char *str) +{ + long val; + + if ((kstrtol(str, 0, &val))) { + pr_warn("Unable to set resched_latency_warn_ms\n"); + return 1; + } + + sysctl_resched_latency_warn_ms = val; + return 1; +} +__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); +#else +static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } +#endif /* CONFIG_SCHED_DEBUG */ + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -4538,6 +4597,7 @@ void scheduler_tick(void) struct task_struct *curr = rq->curr; struct rq_flags rf; unsigned long thermal_pressure; + u64 resched_latency; arch_scale_freq_tick(); sched_clock_tick(); @@ -4548,10 +4608,15 @@ void scheduler_tick(void) thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); + if (sched_feat(LATENCY_WARN)) + resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); rq_unlock(rq, &rf); + if (sched_feat(LATENCY_WARN) && resched_latency) + resched_latency_warn(cpu, resched_latency); + perf_event_task_tick(); #ifdef CONFIG_SMP @@ -5046,6 +5111,9 @@ static void __sched notrace __schedule(bool preempt) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); +#ifdef CONFIG_SCHED_DEBUG + rq->last_seen_need_resched_ns = 0; +#endif if (likely(prev != next)) { rq->nr_switches++; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 461342f598ef..7251fc477a11 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -309,6 +309,9 @@ static __init int sched_init_debug(void) debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); + #ifdef CONFIG_SMP debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); @@ -1027,3 +1030,13 @@ void proc_sched_set_task(struct task_struct *p) memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif } + +void resched_latency_warn(int cpu, u64 latency) +{ + static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1); + + WARN(__ratelimit(&latency_check_ratelimit), + "sched: CPU %d need_resched set for > %llu ns (%d ticks) " + "without schedule\n", + cpu, latency, cpu_rq(cpu)->ticks_without_resched); +} diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 011c5ec7b7b5..7f8dace0964c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -91,5 +91,7 @@ SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(UTIL_EST_FASTUP, true) +SCHED_FEAT(LATENCY_WARN, false) + SCHED_FEAT(ALT_PERIOD, true) SCHED_FEAT(BASE_SLICE, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bde72482cbb8..a189bec13729 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -971,6 +972,11 @@ struct rq { atomic_t nr_iowait; +#ifdef CONFIG_SCHED_DEBUG + u64 last_seen_need_resched_ns; + int ticks_without_resched; +#endif + #ifdef CONFIG_MEMBARRIER int membarrier_state; #endif @@ -2371,6 +2377,8 @@ extern void print_dl_stats(struct seq_file *m, int cpu); extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); + +extern void resched_latency_warn(int cpu, u64 latency); #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); @@ -2378,6 +2386,8 @@ extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, unsigned long tpf, unsigned long gsf, unsigned long gpf); #endif /* CONFIG_NUMA_BALANCING */ +#else +static inline void resched_latency_warn(int cpu, u64 latency) {} #endif /* CONFIG_SCHED_DEBUG */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); -- cgit v1.2.3-59-g8ed1b From ad789f84c9a145f8a18744c0387cec22ec51651e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 15 Apr 2021 15:54:26 -0400 Subject: sched/debug: Fix cgroup_path[] serialization The handling of sysrq key can be activated by echoing the key to /proc/sysrq-trigger or via the magic key sequence typed into a terminal that is connected to the system in some way (serial, USB or other mean). In the former case, the handling is done in a user context. In the latter case, it is likely to be in an interrupt context. Currently in print_cpu() of kernel/sched/debug.c, sched_debug_lock is taken with interrupt disabled for the whole duration of the calls to print_*_stats() and print_rq() which could last for the quite some time if the information dump happens on the serial console. If the system has many cpus and the sched_debug_lock is somehow busy (e.g. parallel sysrq-t), the system may hit a hard lockup panic depending on the actually serial console implementation of the system. The purpose of sched_debug_lock is to serialize the use of the global cgroup_path[] buffer in print_cpu(). The rests of the printk calls don't need serialization from sched_debug_lock. Calling printk() with interrupt disabled can still be problematic if multiple instances are running. Allocating a stack buffer of PATH_MAX bytes is not feasible because of the limited size of the kernel stack. The solution implemented in this patch is to allow only one caller at a time to use the full size group_path[], while other simultaneous callers will have to use shorter stack buffers with the possibility of path name truncation. A "..." suffix will be printed if truncation may have happened. The cgroup path name is provided for informational purpose only, so occasional path name truncation should not be a big problem. Fixes: efe25c2c7b3a ("sched: Reinstate group names in /proc/sched_debug") Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210415195426.6677-1-longman@redhat.com --- kernel/sched/debug.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7251fc477a11..9c882f20803e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -8,8 +8,6 @@ */ #include "sched.h" -static DEFINE_SPINLOCK(sched_debug_lock); - /* * This allows printing both to /proc/sched_debug and * to the console @@ -476,16 +474,37 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #endif #ifdef CONFIG_CGROUP_SCHED +static DEFINE_SPINLOCK(sched_debug_lock); static char group_path[PATH_MAX]; -static char *task_group_path(struct task_group *tg) +static void task_group_path(struct task_group *tg, char *path, int plen) { - if (autogroup_path(tg, group_path, PATH_MAX)) - return group_path; + if (autogroup_path(tg, path, plen)) + return; - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + cgroup_path(tg->css.cgroup, path, plen); +} - return group_path; +/* + * Only 1 SEQ_printf_task_group_path() caller can use the full length + * group_path[] for cgroup path. Other simultaneous callers will have + * to use a shorter stack buffer. A "..." suffix is appended at the end + * of the stack buffer so that it will show up in case the output length + * matches the given buffer size to indicate possible path name truncation. + */ +#define SEQ_printf_task_group_path(m, tg, fmt...) \ +{ \ + if (spin_trylock(&sched_debug_lock)) { \ + task_group_path(tg, group_path, sizeof(group_path)); \ + SEQ_printf(m, fmt, group_path); \ + spin_unlock(&sched_debug_lock); \ + } else { \ + char buf[128]; \ + char *bufend = buf + sizeof(buf) - 3; \ + task_group_path(tg, buf, bufend - buf); \ + strcpy(bufend - 1, "..."); \ + SEQ_printf(m, fmt, buf); \ + } \ } #endif @@ -512,7 +531,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif #ifdef CONFIG_CGROUP_SCHED - SEQ_printf(m, " %s", task_group_path(task_group(p))); + SEQ_printf_task_group_path(m, task_group(p), " %s") #endif SEQ_printf(m, "\n"); @@ -549,7 +568,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, "\n"); - SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); + SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu); #else SEQ_printf(m, "\n"); SEQ_printf(m, "cfs_rq[%d]:\n", cpu); @@ -620,7 +639,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #ifdef CONFIG_RT_GROUP_SCHED SEQ_printf(m, "\n"); - SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); + SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu); #else SEQ_printf(m, "\n"); SEQ_printf(m, "rt_rq[%d]:\n", cpu); @@ -672,7 +691,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; #ifdef CONFIG_X86 { @@ -723,13 +741,11 @@ do { \ } #undef P - spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); print_dl_stats(m, cpu); print_rq(m, rq, cpu); - spin_unlock_irqrestore(&sched_debug_lock, flags); SEQ_printf(m, "\n"); } -- cgit v1.2.3-59-g8ed1b