From 5e83eafbfd3b351537c0d74467fc43e8a88f4ae4 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:10 +0100 Subject: sched/fair: Remove the rq->cpu_load[] update code With LB_BIAS disabled, there is no need to update the rq->cpu_load[idx] any more. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-2-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched/nohz.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index b36f4cf38111..1abe91ff6e4a 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -6,14 +6,6 @@ * This is the interface between the scheduler and nohz/dynticks: */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void cpu_load_update_nohz_start(void); -extern void cpu_load_update_nohz_stop(void); -#else -static inline void cpu_load_update_nohz_start(void) { } -static inline void cpu_load_update_nohz_stop(void) { } -#endif - #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern int get_nohz_timer_target(void); -- cgit v1.2.3-59-g8ed1b From 0e1fef63d92d61ed561e504c3a078a827a0f9bfe Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:14 +0100 Subject: sched/core: Remove sd->*_idx The sched domain per rq load index files also disappear from the /proc/sys/kernel/sched_domain/cpuX/domainY directories. 
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-6-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 5 ----- kernel/sched/debug.c | 25 ++++++++++--------------- kernel/sched/topology.c | 10 ---------- 3 files changed, 10 insertions(+), 30 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index cfc0a89a7159..53afbe07354a 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -84,11 +84,6 @@ struct sched_domain { unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int imbalance_pct; /* No balance until over watermark */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int busy_idx; - unsigned int idle_idx; - unsigned int newidle_idx; - unsigned int wake_idx; - unsigned int forkexec_idx; int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a0b0d6e21e5b..7ffde8ce82fd 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -251,25 +251,20 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table = sd_alloc_ctl_entry(9); if (table == NULL) return NULL; - set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, sizeof(int), 0644, proc_dointvec_minmax); - 
set_table_entry(&table[3], "idle_idx", &sd->idle_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); - /* &table[13] is terminator */ + set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[8] is terminator */ return table; } diff --git 
a/kernel/sched/topology.c b/kernel/sched/topology.c index f53f89df837d..63184cf0d0d7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl, .imbalance_pct = 125, .cache_nice_tries = 0, - .busy_idx = 0, - .idle_idx = 0, - .newidle_idx = 0, - .wake_idx = 0, - .forkexec_idx = 0, .flags = 1*SD_LOAD_BALANCE | 1*SD_BALANCE_NEWIDLE @@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl, } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; sd->cache_nice_tries = 1; - sd->busy_idx = 2; #ifdef CONFIG_NUMA } else if (sd->flags & SD_NUMA) { sd->cache_nice_tries = 2; - sd->busy_idx = 3; - sd->idle_idx = 2; sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; @@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { sd->cache_nice_tries = 1; - sd->busy_idx = 2; - sd->idle_idx = 1; } /* -- cgit v1.2.3-59-g8ed1b From 8ec59c0f5f4966f89f4e3e3cab81710c7fa959d0 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 17 Jun 2019 17:00:17 +0200 Subject: sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity() The 'struct sched_domain *sd' parameter to arch_scale_cpu_capacity() is unused since commit: 765d0af19f5f ("sched/topology: Remove the ::smt_gain field from 'struct sched_domain'") Remove it. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Viresh Kumar Reviewed-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: gregkh@linuxfoundation.org Cc: linux@armlinux.org.uk Cc: quentin.perret@arm.com Cc: rafael@kernel.org Link: https://lkml.kernel.org/r/1560783617-5827-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- arch/arm/kernel/topology.c | 2 +- drivers/base/arch_topology.c | 6 +++--- include/linux/arch_topology.h | 2 +- include/linux/energy_model.h | 2 +- include/linux/sched/topology.h | 14 +++----------- kernel/power/energy_model.c | 2 +- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 6 +++--- kernel/sched/pelt.c | 2 +- kernel/sched/pelt.h | 2 +- kernel/sched/sched.h | 2 +- kernel/sched/topology.c | 8 ++++---- 13 files changed, 22 insertions(+), 30 deletions(-) (limited to 'include/linux/sched') diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 60e375ce1ab2..d17cb1e6d679 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -169,7 +169,7 @@ static void update_cpu_capacity(unsigned int cpu) topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity); pr_info("CPU%u: update cpu_capacity %lu\n", - cpu, topology_get_cpu_scale(NULL, cpu)); + cpu, topology_get_cpu_scale(cpu)); } #else diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 1739d7e1952a..9b09e31ae82f 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -43,7 +43,7 @@ static ssize_t cpu_capacity_show(struct device *dev, { struct cpu *cpu = container_of(dev, struct cpu, dev); - return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id)); + return sprintf(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id)); } static void update_topology_flags_workfn(struct work_struct *work); @@ -116,7 +116,7 @@ void topology_normalize_cpu_scale(void) / capacity_scale; 
topology_set_cpu_scale(cpu, capacity); pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", - cpu, topology_get_cpu_scale(NULL, cpu)); + cpu, topology_get_cpu_scale(cpu)); } } @@ -185,7 +185,7 @@ init_cpu_capacity_callback(struct notifier_block *nb, cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); for_each_cpu(cpu, policy->related_cpus) { - raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) * + raw_capacity[cpu] = topology_get_cpu_scale(cpu) * policy->cpuinfo.max_freq / 1000UL; capacity_scale = max(raw_capacity[cpu], capacity_scale); } diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d9bdc1a7f4e7..1cfe05ea1d89 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -18,7 +18,7 @@ DECLARE_PER_CPU(unsigned long, cpu_scale); struct sched_domain; static inline -unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu) +unsigned long topology_get_cpu_scale(int cpu) { return per_cpu(cpu_scale, cpu); } diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index aa027f7bcb3e..73f8c3cb9588 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -89,7 +89,7 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd, * like schedutil. 
*/ cpu = cpumask_first(to_cpumask(pd->cpus)); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(cpu); cs = &pd->table[pd->nr_cap_states - 1]; freq = map_util_freq(max_util, cs->frequency, scale_cpu); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 53afbe07354a..e445d3767cdd 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -196,14 +196,6 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl); # define SD_INIT_NAME(type) #endif -#ifndef arch_scale_cpu_capacity -static __always_inline -unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} -#endif - #else /* CONFIG_SMP */ struct sched_domain_attr; @@ -219,16 +211,16 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +#endif /* !CONFIG_SMP */ + #ifndef arch_scale_cpu_capacity static __always_inline -unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) +unsigned long arch_scale_cpu_capacity(int cpu) { return SCHED_CAPACITY_SCALE; } #endif -#endif /* !CONFIG_SMP */ - static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 7d66ee68aaaf..0a9326f5f421 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, * All CPUs of a domain must have the same micro-architecture * since they all share the same table. 
*/ - cap = arch_scale_cpu_capacity(NULL, cpu); + cap = arch_scale_cpu_capacity(cpu); if (prev_cap && prev_cap != cap) { pr_err("CPUs of %*pbl must have the same capacity\n", cpumask_pr_args(span)); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 962cf343f798..7c4ce69067c4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -276,7 +276,7 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); unsigned long util = cpu_util_cfs(rq); - unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c1ef30861068..8b5bb2ac16e2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq) &curr->dl); } else { unsigned long scale_freq = arch_scale_freq_capacity(cpu); - unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); scaled_delta_exec = cap_scale(delta_exec, scale_freq); scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c11dcdedcbc..4f8754157763 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -764,7 +764,7 @@ void post_init_entity_util_avg(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; if (cap > 0) { @@ -7646,7 +7646,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) { struct rq *rq = 
cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(sd, cpu); + unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -7671,7 +7671,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = scale_rt_capacity(sd, cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); if (!capacity) capacity = 1; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index befce29bd882..42ea66b07b1d 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -366,7 +366,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) * reflect the real amount of computation */ running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); - running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq))); /* * We know the time that has been used by interrupt since last update diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 7489d5f56960..afff644da065 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) * Scale the elapsed time to reflect the real amount of * computation */ - delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq))); delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); rq->clock_pelt += delta; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b08dee29ef5e..e58ab597ec88 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2248,7 +2248,7 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) { - unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long max = 
arch_scale_cpu_capacity(cpu); return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 63184cf0d0d7..f751ce0b783e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1874,10 +1874,10 @@ static struct sched_domain_topology_level unsigned long cap; /* Is there any asymmetry? */ - cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); + cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); for_each_cpu(i, cpu_map) { - if (arch_scale_cpu_capacity(NULL, i) != cap) { + if (arch_scale_cpu_capacity(i) != cap) { asym = true; break; } @@ -1892,7 +1892,7 @@ static struct sched_domain_topology_level * to everyone. */ for_each_cpu(i, cpu_map) { - unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); + unsigned long max_capacity = arch_scale_cpu_capacity(i); int tl_id = 0; for_each_sd_topology(tl) { @@ -1902,7 +1902,7 @@ static struct sched_domain_topology_level for_each_cpu_and(j, tl->mask(i), cpu_map) { unsigned long capacity; - capacity = arch_scale_cpu_capacity(NULL, j); + capacity = arch_scale_cpu_capacity(j); if (capacity <= max_capacity) continue; -- cgit v1.2.3-59-g8ed1b From 69842cba9ace84849bb9b8edcdf2cefccd97901c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:02 +0100 Subject: sched/uclamp: Add CPU's clamp buckets refcounting Utilization clamping allows to clamp the CPU's utilization within a [util_min, util_max] range, depending on the set of RUNNABLE tasks on that CPU. Each task references two "clamp buckets" defining its minimum and maximum (util_{min,max}) utilization "clamp values". A CPU's clamp bucket is active if there is at least one RUNNABLE tasks enqueued on that CPU and refcounting that bucket. When a task is {en,de}queued {on,from} a rq, the set of active clamp buckets on that CPU can change. If the set of active clamp buckets changes for a CPU a new "aggregated" clamp value is computed for that CPU. 
This is because each clamp bucket enforces a different utilization clamp value. Clamp values are always MAX aggregated for both util_min and util_max. This ensures that no task can affect the performance of other co-scheduled tasks which are more boosted (i.e. with higher util_min clamp) or less capped (i.e. with higher util_max clamp). A task has: task_struct::uclamp[clamp_id]::bucket_id to track the "bucket index" of the CPU's clamp bucket it refcounts while enqueued, for each clamp index (clamp_id). A runqueue has: rq::uclamp[clamp_id]::bucket[bucket_id].tasks to track how many RUNNABLE tasks on that CPU refcount each clamp bucket (bucket_id) of a clamp index (clamp_id). It also has a: rq::uclamp[clamp_id]::bucket[bucket_id].value to track the clamp value of each clamp bucket (bucket_id) of a clamp index (clamp_id). The rq::uclamp[clamp_id]::bucket[] array is scanned every time a new MAX aggregated clamp value has to be found for a clamp_id. This operation is required only when the last task of a clamp bucket tracking the current MAX aggregated clamp value is dequeued. In this case, the CPU is either entering IDLE or going to schedule a less boosted or more clamped task. The expected number of different clamp values configured at build time is small enough to fit the full unordered array into a single cache line, for configurations of up to 7 buckets. Add to struct rq the basic data structures required to refcount the number of RUNNABLE tasks for each clamp bucket. Add also the max aggregation required to update the rq's clamp value at each enqueue/dequeue event. Use a simple linear mapping of clamp values into clamp buckets. Pre-compute and cache bucket_id to avoid integer divisions at enqueue/dequeue time. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J .
Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- include/linux/log2.h | 34 +++++++++ include/linux/sched.h | 39 ++++++++++ include/linux/sched/topology.h | 6 -- init/Kconfig | 53 +++++++++++++ kernel/sched/core.c | 166 +++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 51 +++++++++++++ 6 files changed, 343 insertions(+), 6 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/log2.h b/include/linux/log2.h index 1aec01365ed4..83a4a3ca3e8a 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -220,4 +220,38 @@ int __order_base_2(unsigned long n) ilog2((n) - 1) + 1) : \ __order_base_2(n) \ ) + +static inline __attribute__((const)) +int __bits_per(unsigned long n) +{ + if (n < 2) + return 1; + if (is_power_of_2(n)) + return order_base_2(n) + 1; + return order_base_2(n); +} + +/** + * bits_per - calculate the number of bits required for the argument + * @n: parameter + * + * This is constant-capable and can be used for compile time + * initializations, e.g bitfields. + * + * The first few values calculated by this routine: + * bf(0) = 1 + * bf(1) = 1 + * bf(2) = 2 + * bf(3) = 2 + * bf(4) = 3 + * ... and so on. + */ +#define bits_per(n) \ +( \ + __builtin_constant_p(n) ? ( \ + ((n) == 0 || (n) == 1) \ + ? 1 : ilog2(n) + 1 \ + ) : \ + __bits_per(n) \ +) #endif /* _LINUX_LOG2_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 044c023875e8..80235bcd05f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -283,6 +283,18 @@ struct vtime { u64 gtime; }; +/* + * Utilization clamp constraints. 
+ * @UCLAMP_MIN: Minimum utilization + * @UCLAMP_MAX: Maximum utilization + * @UCLAMP_CNT: Utilization clamp constraints count + */ +enum uclamp_id { + UCLAMP_MIN = 0, + UCLAMP_MAX, + UCLAMP_CNT +}; + struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ @@ -314,6 +326,10 @@ struct sched_info { # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) +/* Increase resolution of cpu_capacity calculations */ +# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT +# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) + struct load_weight { unsigned long weight; u32 inv_weight; @@ -562,6 +578,25 @@ struct sched_dl_entity { struct hrtimer inactive_timer; }; +#ifdef CONFIG_UCLAMP_TASK +/* Number of utilization clamp buckets (shorter alias) */ +#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT + +/* + * Utilization clamp for a scheduling entity + * @value: clamp value "assigned" to a se + * @bucket_id: bucket index corresponding to the "assigned" value + * + * The bucket_id is the index of the clamp bucket matching the clamp value + * which is pre-computed and stored to avoid expensive integer divisions from + * the fast path. 
+ */ +struct uclamp_se { + unsigned int value : bits_per(SCHED_CAPACITY_SCALE); + unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); +}; +#endif /* CONFIG_UCLAMP_TASK */ + union rcu_special { struct { u8 blocked; @@ -642,6 +677,10 @@ struct task_struct { #endif struct sched_dl_entity dl; +#ifdef CONFIG_UCLAMP_TASK + struct uclamp_se uclamp[UCLAMP_CNT]; +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index e445d3767cdd..7863bb62d2ab 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -6,12 +6,6 @@ #include -/* - * Increase resolution of cpu_capacity calculations - */ -#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT -#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) - /* * sched-domains (multiprocessor balancing) declarations: */ diff --git a/init/Kconfig b/init/Kconfig index 0e2344389501..c88289c18d59 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -677,6 +677,59 @@ config HAVE_UNSTABLE_SCHED_CLOCK config GENERIC_SCHED_CLOCK bool +menu "Scheduler features" + +config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. + + With this option, the user can specify the min and max CPU + utilization allowed for RUNNABLE tasks. The max utilization defines + the maximum frequency a task should use while the min utilization + defines the minimum frequency it should use. + + Both min and max utilization clamp values are hints to the scheduler, + aiming at improving its frequency selection policy, but they do not + enforce or grant any specific bandwidth for tasks. + + If in doubt, say N. 
+ +config UCLAMP_BUCKETS_COUNT + int "Number of supported utilization clamp buckets" + range 5 20 + default 5 + depends on UCLAMP_TASK + help + Defines the number of clamp buckets to use. The range of each bucket + will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the + number of clamp buckets the finer their granularity and the higher + the precision of clamping aggregation and tracking at run-time. + + For example, with the minimum configuration value we will have 5 + clamp buckets tracking 20% utilization each. A 25% boosted tasks will + be refcounted in the [20..39]% bucket and will set the bucket clamp + effective value to 25%. + If a second 30% boosted task should be co-scheduled on the same CPU, + that task will be refcounted in the same bucket of the first task and + it will boost the bucket clamp effective value to 30%. + The clamp effective value of a bucket is reset to its nominal value + (20% in the example above) when there are no more tasks refcounted in + that bucket. + + An additional boost/capping margin can be added to some tasks. In the + example above the 25% task will be boosted to 30% until it exits the + CPU. If that should be considered not acceptable on certain systems, + it's always possible to reduce the margin by increasing the number of + clamp buckets to trade off used memory for run-time tracking + precision. + + If in doubt, use the default value. 
+ +endmenu + # # For architectures that want to enable the support for NUMA-affine scheduler # balancing logic: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e5e02d23e693..d8c1e67afd82 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -772,6 +772,168 @@ static void set_load_weight(struct task_struct *p, bool update_load) } } +#ifdef CONFIG_UCLAMP_TASK + +/* Integer rounded range for each bucket */ +#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) + +#define for_each_clamp_id(clamp_id) \ + for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) + +static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) +{ + return clamp_value / UCLAMP_BUCKET_DELTA; +} + +static inline unsigned int uclamp_none(int clamp_id) +{ + if (clamp_id == UCLAMP_MIN) + return 0; + return SCHED_CAPACITY_SCALE; +} + +static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value) +{ + uc_se->value = value; + uc_se->bucket_id = uclamp_bucket_id(value); +} + +static inline +unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id) +{ + struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; + int bucket_id = UCLAMP_BUCKETS - 1; + + /* + * Since both min and max clamps are max aggregated, find the + * top most bucket with tasks in. + */ + for ( ; bucket_id >= 0; bucket_id--) { + if (!bucket[bucket_id].tasks) + continue; + return bucket[bucket_id].value; + } + + /* No tasks -- default clamp values */ + return uclamp_none(clamp_id); +} + +/* + * When a task is enqueued on a rq, the clamp bucket currently defined by the + * task's uclamp::bucket_id is refcounted on that rq. This also immediately + * updates the rq's clamp value if required. 
+ */ +static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, + unsigned int clamp_id) +{ + struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; + struct uclamp_se *uc_se = &p->uclamp[clamp_id]; + struct uclamp_bucket *bucket; + + lockdep_assert_held(&rq->lock); + + bucket = &uc_rq->bucket[uc_se->bucket_id]; + bucket->tasks++; + + if (uc_se->value > READ_ONCE(uc_rq->value)) + WRITE_ONCE(uc_rq->value, bucket->value); +} + +/* + * When a task is dequeued from a rq, the clamp bucket refcounted by the task + * is released. If this is the last task reference counting the rq's max + * active clamp value, then the rq's clamp value is updated. + * + * Both refcounted tasks and rq's cached clamp values are expected to be + * always valid. If it's detected they are not, as defensive programming, + * enforce the expected state and warn. + */ +static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, + unsigned int clamp_id) +{ + struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; + struct uclamp_se *uc_se = &p->uclamp[clamp_id]; + struct uclamp_bucket *bucket; + unsigned int rq_clamp; + + lockdep_assert_held(&rq->lock); + + bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); + if (likely(bucket->tasks)) + bucket->tasks--; + + if (likely(bucket->tasks)) + return; + + rq_clamp = READ_ONCE(uc_rq->value); + /* + * Defensive programming: this should never happen. If it happens, + * e.g. due to future modification, warn and fixup the expected value. 
+ */ + SCHED_WARN_ON(bucket->value > rq_clamp); + if (bucket->value >= rq_clamp) + WRITE_ONCE(uc_rq->value, uclamp_rq_max_value(rq, clamp_id)); +} + +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) +{ + unsigned int clamp_id; + + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + + for_each_clamp_id(clamp_id) + uclamp_rq_inc_id(rq, p, clamp_id); +} + +static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) +{ + unsigned int clamp_id; + + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + + for_each_clamp_id(clamp_id) + uclamp_rq_dec_id(rq, p, clamp_id); +} + +static void __init init_uclamp(void) +{ + unsigned int clamp_id; + int cpu; + + for_each_possible_cpu(cpu) { + struct uclamp_bucket *bucket; + struct uclamp_rq *uc_rq; + unsigned int bucket_id; + + memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); + + for_each_clamp_id(clamp_id) { + uc_rq = &cpu_rq(cpu)->uclamp[clamp_id]; + + bucket_id = 1; + while (bucket_id < UCLAMP_BUCKETS) { + bucket = &uc_rq->bucket[bucket_id]; + bucket->value = bucket_id * UCLAMP_BUCKET_DELTA; + ++bucket_id; + } + } + } + + for_each_clamp_id(clamp_id) { + uclamp_se_set(&init_task.uclamp[clamp_id], + uclamp_none(clamp_id)); + } +} + +#else /* CONFIG_UCLAMP_TASK */ +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } +static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } +static inline void init_uclamp(void) { } +#endif /* CONFIG_UCLAMP_TASK */ + static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) @@ -782,6 +944,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) psi_enqueue(p, flags & ENQUEUE_WAKEUP); } + uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); } @@ -795,6 +958,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) psi_dequeue(p, flags & DEQUEUE_SLEEP); } + uclamp_rq_dec(rq, p); 
p->sched_class->dequeue_task(rq, p, flags); } @@ -6093,6 +6257,8 @@ void __init sched_init(void) psi_init(); + init_uclamp(); + scheduler_running = 1; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e58ab597ec88..cecc6baaba93 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -791,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work); #endif #endif /* CONFIG_SMP */ +#ifdef CONFIG_UCLAMP_TASK +/* + * struct uclamp_bucket - Utilization clamp bucket + * @value: utilization clamp value for tasks on this clamp bucket + * @tasks: number of RUNNABLE tasks on this clamp bucket + * + * Keep track of how many tasks are RUNNABLE for a given utilization + * clamp value. + */ +struct uclamp_bucket { + unsigned long value : bits_per(SCHED_CAPACITY_SCALE); + unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); +}; + +/* + * struct uclamp_rq - rq's utilization clamp + * @value: currently active clamp values for a rq + * @bucket: utilization clamp buckets affecting a rq + * + * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. + * A clamp value is affecting a rq when there is at least one task RUNNABLE + * (or actually running) with that value. + * + * There are up to UCLAMP_CNT possible different clamp values, currently there + * are only two: minimum utilization and maximum utilization. + * + * All utilization clamping values are MAX aggregated, since: + * - for util_min: we want to run the CPU at least at the max of the minimum + * utilization required by its currently RUNNABLE tasks. + * - for util_max: we want to allow the CPU to run up to the max of the + * maximum utilization allowed by its currently RUNNABLE tasks. + * + * Since on each system we expect only a limited number of different + * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track + * the metrics required to compute all the per-rq utilization clamp values. 
+ */ +struct uclamp_rq { + unsigned int value; + struct uclamp_bucket bucket[UCLAMP_BUCKETS]; +}; +#endif /* CONFIG_UCLAMP_TASK */ + /* * This is the main, per-CPU runqueue data structure. * @@ -825,6 +867,11 @@ struct rq { unsigned long nr_load_updates; u64 nr_switches; +#ifdef CONFIG_UCLAMP_TASK + /* Utilization clamp values based on CPU's RUNNABLE tasks */ + struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; +#endif + struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; @@ -1639,6 +1686,10 @@ extern const u32 sched_prio_to_wmult[40]; struct sched_class { const struct sched_class *next; +#ifdef CONFIG_UCLAMP_TASK + int uclamp_enabled; +#endif + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); -- cgit v1.2.3-59-g8ed1b From e8f14172c6b11e9a86c65532497087f8eb0f91b1 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:05 +0100 Subject: sched/uclamp: Add system default clamps Tasks without a user-defined clamp value are considered not clamped and by default their utilization can have any value in the [0..SCHED_CAPACITY_SCALE] range. Tasks with a user-defined clamp value are allowed to request any value in that range, and the required clamp is unconditionally enforced. However, a "System Management Software" could be interested in limiting the range of clamp values allowed for all tasks. Add a privileged interface to define a system default configuration via: /proc/sys/kernel/sched_util_clamp_{min,max} which works as an unconditional clamp range restriction for all tasks. With the default configuration, the full SCHED_CAPACITY_SCALE range of values is allowed for each clamp index. Otherwise, the task-specific clamp is capped by the corresponding system default value. Do that by tracking, for each task, the "effective" clamp value and bucket the task has been refcounted in at enqueue time.
This allows lazy aggregation of "requested" and "system default" values at enqueue time and simplifies refcounting updates at dequeue time. The cached bucket ids are used to avoid (relatively) more expensive integer divisions every time a task is enqueued. An active flag is used to report when the "effective" value is valid and thus the task is actually refcounted in the corresponding rq's bucket. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-5-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++ include/linux/sched/sysctl.h | 11 +++++ kernel/sched/core.c | 99 +++++++++++++++++++++++++++++++++++++++++++- kernel/sysctl.c | 16 +++++++ 4 files changed, 135 insertions(+), 1 deletion(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index 80235bcd05f2..5485f411e8e1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -586,14 +586,21 @@ struct sched_dl_entity { * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value + * @active: the se is currently refcounted in a rq's bucket * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. + * + * The active bit is set whenever a task has got an "effective" value assigned, + * which can be different from the clamp value "requested" from user-space.
+ * This allows to know a task is refcounted in the rq's bucket corresponding + * to the "effective" bucket_id. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); + unsigned int active : 1; }; #endif /* CONFIG_UCLAMP_TASK */ @@ -678,6 +685,9 @@ struct task_struct { struct sched_dl_entity dl; #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* Effective clamp values used for a scheduling entity */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 99ce6d728df7..d4f6215ee03f 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -56,6 +56,11 @@ int sched_proc_update_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +#ifdef CONFIG_UCLAMP_TASK +extern unsigned int sysctl_sched_uclamp_util_min; +extern unsigned int sysctl_sched_uclamp_util_max; +#endif + #ifdef CONFIG_CFS_BANDWIDTH extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif @@ -75,6 +80,12 @@ extern int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_UCLAMP_TASK +extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + extern int sysctl_numa_balancing(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2dde735635ec..b74de86b68c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -773,6 +773,14 @@ static void set_load_weight(struct task_struct *p, bool update_load) } #ifdef CONFIG_UCLAMP_TASK +/* Max allowed minimum utilization */ +unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; + +/* Max allowed maximum utilization 
*/ +unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; + +/* All clamps are required to be less than or equal to these values */ +static struct uclamp_se uclamp_default[UCLAMP_CNT]; /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -851,6 +859,25 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +/* + * The effective clamp bucket index of a task depends on, by increasing + * priority: + * - the task specific clamp value, when explicitly requested from userspace + * - the system default clamp value, defined by the sysadmin + */ +static inline struct uclamp_se +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) +{ + struct uclamp_se uc_req = p->uclamp_req[clamp_id]; + struct uclamp_se uc_max = uclamp_default[clamp_id]; + + /* System default restrictions always apply */ + if (unlikely(uc_req.value > uc_max.value)) + return uc_max; + + return uc_req; +} + /* * When a task is enqueued on a rq, the clamp bucket currently defined by the * task's uclamp::bucket_id is refcounted on that rq.
This also immediately @@ -870,8 +897,12 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, lockdep_assert_held(&rq->lock); + /* Update task effective clamp */ + p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); + bucket = &uc_rq->bucket[uc_se->bucket_id]; bucket->tasks++; + uc_se->active = true; uclamp_idle_reset(rq, clamp_id, uc_se->value); @@ -910,6 +941,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* * Keep "local max aggregation" simple and accept to (possibly) @@ -958,8 +990,65 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_min, old_max; + static DEFINE_MUTEX(mutex); + int result; + + mutex_lock(&mutex); + old_min = sysctl_sched_uclamp_util_min; + old_max = sysctl_sched_uclamp_util_max; + + result = proc_dointvec(table, write, buffer, lenp, ppos); + if (result) + goto undo; + if (!write) + goto done; + + if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + result = -EINVAL; + goto undo; + } + + if (old_min != sysctl_sched_uclamp_util_min) { + uclamp_se_set(&uclamp_default[UCLAMP_MIN], + sysctl_sched_uclamp_util_min); + } + if (old_max != sysctl_sched_uclamp_util_max) { + uclamp_se_set(&uclamp_default[UCLAMP_MAX], + sysctl_sched_uclamp_util_max); + } + + /* + * Updating all the RUNNABLE tasks is expensive, keep it simple and do + * just a lazy update at each next enqueue time.
+ */ + goto done; + +undo: + sysctl_sched_uclamp_util_min = old_min; + sysctl_sched_uclamp_util_max = old_max; +done: + mutex_unlock(&mutex); + + return result; +} + +static void uclamp_fork(struct task_struct *p) +{ + unsigned int clamp_id; + + for_each_clamp_id(clamp_id) + p->uclamp[clamp_id].active = false; +} + static void __init init_uclamp(void) { + struct uclamp_se uc_max = {}; unsigned int clamp_id; int cpu; @@ -969,14 +1058,20 @@ static void __init init_uclamp(void) } for_each_clamp_id(clamp_id) { - uclamp_se_set(&init_task.uclamp[clamp_id], + uclamp_se_set(&init_task.uclamp_req[clamp_id], uclamp_none(clamp_id)); } + + /* System defaults allow max clamp values for both indexes */ + uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX)); + for_each_clamp_id(clamp_id) + uclamp_default[clamp_id] = uc_max; } #else /* CONFIG_UCLAMP_TASK */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } +static inline void uclamp_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -2545,6 +2640,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) */ p->prio = current->normal_prio; + uclamp_fork(p); + /* * Revert to default priority/policy on fork if requested. 
*/ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1beca96fb625..1c1ad1e14f21 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -452,6 +452,22 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", -- cgit v1.2.3-59-g8ed1b