From 1b04722c3b892033f143d056a2876f293a1adbcc Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 8 May 2018 08:33:40 +0100 Subject: Revert "cpufreq: schedutil: Don't restrict kthread to related_cpus unnecessarily" This reverts commit e2cabe48c20efb174ce0c01190f8b9c5f3ea1d13. Lifting the restriction that the sugov kthread is bound to the policy->related_cpus for a system with a slow switching cpufreq driver, which is able to perform DVFS from any cpu (e.g. cpufreq-dt), is not only not beneficial it also harms Enery-Aware Scheduling (EAS) on systems with asymmetric cpu capacities (e.g. Arm big.LITTLE). The sugov kthread which does the update for the little cpus could potentially run on a big cpu. It could prevent that the big cluster goes into deeper idle states although all the tasks are running on the little cluster. Example: hikey960 w/ 4.16.0-rc6-+ Arm big.LITTLE with per-cluster DVFS root@h960:~# cat /proc/cpuinfo | grep "^CPU part" CPU part : 0xd03 (Cortex-A53, little cpu) CPU part : 0xd03 CPU part : 0xd03 CPU part : 0xd03 CPU part : 0xd09 (Cortex-A73, big cpu) CPU part : 0xd09 CPU part : 0xd09 CPU part : 0xd09 root@h960:/sys/devices/system/cpu/cpufreq# ls policy0 policy4 schedutil root@h960:/sys/devices/system/cpu/cpufreq# cat policy*/related_cpus 0 1 2 3 4 5 6 7 (1) w/o the revert: root@h960:~# ps -eo pid,class,rtprio,pri,psr,comm | awk 'NR == 1 || /sugov/' PID CLS RTPRIO PRI PSR COMMAND 1489 #6 0 140 1 sugov:0 1490 #6 0 140 0 sugov:4 The sugov kthread sugov:4 responsible for policy4 runs on cpu0. (In this case both sugov kthreads run on little cpus). cross policy (cluster) remote callback example: ... migration/1-14 [001] enqueue_task_fair: this_cpu=1 cpu_of(rq)=5 migration/1-14 [001] sugov_update_shared: this_cpu=1 sg_cpu->cpu=5 sg_cpu->sg_policy->policy->related_cpus=4-7 sugov:4-1490 [000] sugov_work: this_cpu=0 sg_cpu->sg_policy->policy->related_cpus=4-7 ... The remote callback (this_cpu=1, target_cpu=5) is executed on cpu=0. (2) w/ the revert: root@h960:~# ps -eo pid,class,rtprio,pri,psr,comm | awk 'NR == 1 || /sugov/' PID CLS RTPRIO PRI PSR COMMAND 1491 #6 0 140 2 sugov:0 1492 #6 0 140 4 sugov:4 The sugov kthread sugov:4 responsible for policy4 runs on cpu4. cross policy (cluster) remote callback example: ... migration/1-14 [001] enqueue_task_fair: this_cpu=1 cpu_of(rq)=7 migration/1-14 [001] sugov_update_shared: this_cpu=1 sg_cpu->cpu=7 sg_cpu->sg_policy->policy->related_cpus=4-7 sugov:4-1492 [004] sugov_work: this_cpu=4 sg_cpu->sg_policy->policy->related_cpus=4-7 ... The remote callback (this_cpu=1, target_cpu=7) is executed on cpu=4. Now the sugov kthread executes again on the policy (cluster) for which the Operating Performance Point (OPP) should be changed. It avoids the problem that an otherwise idle policy (cluster) is running schedutil (the sugov kthread) for another one. Signed-off-by: Dietmar Eggemann Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e13df951aca7..d7e5194a820d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -511,11 +511,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - - /* Kthread is bound to all CPUs by default */ - if (!policy->dvfs_possible_from_any_cpu) - kthread_bind_mask(thread, policy->related_cpus); - + kthread_bind_mask(thread, policy->related_cpus); init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); -- cgit v1.2.3-59-g8ed1b From ecd2884291261e3fddbc7651ee11a20d596bb514 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 9 May 2018 16:05:24 +0530 Subject: cpufreq: schedutil: Don't set next_freq to UINT_MAX The schedutil driver sets sg_policy->next_freq to UINT_MAX on certain occasions to discard the cached value of next freq: - In sugov_start(), when the schedutil governor is started for a group of CPUs. - And whenever we need to force a freq update before rate-limit duration, which happens when: - there is an update in cpufreq policy limits. - Or when the utilization of DL scheduling class increases. In return, get_next_freq() doesn't return a cached next_freq value but recalculates the next frequency instead. But having special meaning for a particular value of frequency makes the code less readable and error prone. We recently fixed a bug where the UINT_MAX value was considered as valid frequency in sugov_update_single(). All we need is a flag which can be used to discard the value of sg_policy->next_freq and we already have need_freq_update for that. Lets reuse it instead of setting next_freq to UINT_MAX. Signed-off-by: Viresh Kumar Reviewed-by: Joel Fernandes (Google) Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d7e5194a820d..2442decbfec7 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -95,15 +95,8 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (sg_policy->work_in_progress) return false; - if (unlikely(sg_policy->need_freq_update)) { - sg_policy->need_freq_update = false; - /* - * This happens when limits change, so forget the previous - * next_freq value and force an update. - */ - sg_policy->next_freq = UINT_MAX; + if (unlikely(sg_policy->need_freq_update)) return true; - } delta_ns = time - sg_policy->last_freq_update_time; @@ -165,8 +158,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, freq = (freq + (freq >> 2)) * util / max; - if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; + + sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -305,8 +300,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. */ - if (busy && next_f < sg_policy->next_freq && - sg_policy->next_freq != UINT_MAX) { + if (busy && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; /* Reset cached freq as next_freq has changed */ @@ -654,7 +648,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; sg_policy->last_freq_update_time = 0; - sg_policy->next_freq = UINT_MAX; + sg_policy->next_freq = 0; sg_policy->work_in_progress = false; sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; -- cgit v1.2.3-59-g8ed1b From 295f1a99536b87bb8c58baa3a294d3b081cd46a5 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 22 May 2018 12:07:53 +0100 Subject: cpufreq: schedutil: Fix iowait boost reset A more energy efficient update of the IO wait boosting mechanism has been introduced in: commit a5a0809bc58e ("cpufreq: schedutil: Make iowait boost more energy efficient") where the boost value is expected to be: - doubled at each successive wakeup from IO staring from the minimum frequency supported by a CPU - reset when a CPU is not updated for more then one tick by either disabling the IO wait boost or resetting its value to the minimum frequency if this new update requires an IO boost. This approach is supposed to "ignore" boosting for sporadic wakeups from IO, while still getting the frequency boosted to the maximum to benefit long sequence of wakeup from IO operations. However, these assumptions are not always satisfied. For example, when an IO boosted CPU enters idle for more the one tick and then wakes up after an IO wait, since in sugov_set_iowait_boost() we first check the IOWAIT flag, we keep doubling the iowait boost instead of restarting from the minimum frequency value. This misbehavior could happen mainly on non-shared frequency domains, thus defeating the energy efficiency optimization, but it can also happen on shared frequency domain systems. Let fix this issue in sugov_set_iowait_boost() by: - first check the IO wait boost reset conditions to eventually reset the boost value - then applying the correct IO boost value if required by the caller Fixes: a5a0809bc58e (cpufreq: schedutil: Make iowait boost more energy efficient) Reported-by: Viresh Kumar Signed-off-by: Patrick Bellasi Reviewed-by: Joel Fernandes (Google) Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2442decbfec7..6192e0ed7a7c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -198,6 +198,16 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { + /* Clear iowait_boost if the CPU apprears to have been idle. */ + if (sg_cpu->iowait_boost) { + s64 delta_ns = time - sg_cpu->last_update; + + if (delta_ns > TICK_NSEC) { + sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_pending = false; + } + } + if (flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -211,14 +221,6 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned } else { sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; } - } else if (sg_cpu->iowait_boost) { - s64 delta_ns = time - sg_cpu->last_update; - - /* Clear iowait_boost if the CPU apprears to have been idle. */ - if (delta_ns > TICK_NSEC) { - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_pending = false; - } } } -- cgit v1.2.3-59-g8ed1b From fd7d5287fd65df054bdade3e52ceb645cb411e72 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 22 May 2018 12:07:54 +0100 Subject: cpufreq: schedutil: Cleanup and document iowait boost The iowait boosting code has been recently updated to add a progressive boosting behavior which allows to be less aggressive in boosting tasks doing only sporadic IO operations, thus being more energy efficient for example on mobile platforms. The current code is now however a bit convoluted. Some functionalities (e.g. iowait boost reset) are replicated in different paths and their documentation is slightly misaligned. Let's cleanup the code by consolidating all the IO wait boosting related functionality within within few dedicated functions and better define their role: - sugov_iowait_boost: set/increase the IO wait boost of a CPU - sugov_iowait_apply: apply/reduce the IO wait boost of a CPU Both these two function are used at every sugov update and they make use of a unified IO wait boost reset policy provided by: - sugov_iowait_reset: reset/disable the IO wait boost of a CPU if a CPU is not updated for more then one tick This makes possible a cleaner and more self-contained design for the IO wait boosting code since the rest of the sugov update routines, both for single and shared frequency domains, follow the same template: /* Configure IO boost, if required */ sugov_iowait_boost() /* Return here if freq change is in progress or throttled */ /* Collect and aggregate utilization information */ sugov_get_util() sugov_aggregate_util() /* * Add IO boost, if currently enabled, on top of the aggregated * utilization value */ sugov_iowait_apply() As a extra bonus, let's also add the documentation for the new functions and better align the in-code documentation. Signed-off-by: Patrick Bellasi Reviewed-by: Joel Fernandes (Google) Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 152 +++++++++++++++++++++++++++------------ 1 file changed, 107 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6192e0ed7a7c..416b7d7853d4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -51,7 +51,7 @@ struct sugov_cpu { bool iowait_boost_pending; unsigned int iowait_boost; unsigned int iowait_boost_max; - u64 last_update; + u64 last_update; /* The fields below are only needed when sharing a policy: */ unsigned long util_cfs; @@ -196,45 +196,120 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) return min(util, sg_cpu->max); } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) +/** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. + * @sg_cpu: the sugov data for the CPU to boost + * @time: the update time from the caller + * @set_iowait_boost: true if an IO boost has been requested + * + * The IO wait boost of a task is disabled after a tick since the last update + * of a CPU. If a new IO wait boost is requested after more then a tick, then + * we enable the boost starting from the minimum frequency, which improves + * energy efficiency by ignoring sporadic wakeups from IO. + */ +static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, + bool set_iowait_boost) { - /* Clear iowait_boost if the CPU apprears to have been idle. */ - if (sg_cpu->iowait_boost) { - s64 delta_ns = time - sg_cpu->last_update; + s64 delta_ns = time - sg_cpu->last_update; - if (delta_ns > TICK_NSEC) { - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_pending = false; - } - } + /* Reset boost only if a tick has elapsed since last request */ + if (delta_ns <= TICK_NSEC) + return false; - if (flags & SCHED_CPUFREQ_IOWAIT) { - if (sg_cpu->iowait_boost_pending) - return; + sg_cpu->iowait_boost = set_iowait_boost + ? sg_cpu->sg_policy->policy->min : 0; + sg_cpu->iowait_boost_pending = set_iowait_boost; - sg_cpu->iowait_boost_pending = true; + return true; +} - if (sg_cpu->iowait_boost) { - sg_cpu->iowait_boost <<= 1; - if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; - } else { - sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; - } +/** + * sugov_iowait_boost() - Updates the IO boost status of a CPU. + * @sg_cpu: the sugov data for the CPU to boost + * @time: the update time from the caller + * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait + * + * Each time a task wakes up after an IO operation, the CPU utilization can be + * boosted to a certain utilization which doubles at each "frequent and + * successive" wakeup from IO, ranging from the utilization of the minimum + * OPP to the utilization of the maximum OPP. + * To keep doubling, an IO boost has to be requested at least once per tick, + * otherwise we restart from the utilization of the minimum OPP. + */ +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + unsigned int flags) +{ + bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT; + + /* Reset boost if the CPU appears to have been idle enough */ + if (sg_cpu->iowait_boost && + sugov_iowait_reset(sg_cpu, time, set_iowait_boost)) + return; + + /* Boost only tasks waking up after IO */ + if (!set_iowait_boost) + return; + + /* Ensure boost doubles only one time at each request */ + if (sg_cpu->iowait_boost_pending) + return; + sg_cpu->iowait_boost_pending = true; + + /* Double the boost at each request */ + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost <<= 1; + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + return; } + + /* First wakeup after IO: start with minimum boost */ + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; } -static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, - unsigned long *max) +/** + * sugov_iowait_apply() - Apply the IO boost to a CPU. + * @sg_cpu: the sugov data for the cpu to boost + * @time: the update time from the caller + * @util: the utilization to (eventually) boost + * @max: the maximum value the utilization can be boosted to + * + * A CPU running a task which woken up after an IO operation can have its + * utilization boosted to speed up the completion of those IO operations. + * The IO boost value is increased each time a task wakes up from IO, in + * sugov_iowait_apply(), and it's instead decreased by this function, + * each time an increase has not been requested (!iowait_boost_pending). + * + * A CPU which also appears to have been idle for at least one tick has also + * its IO boost utilization reset. + * + * This mechanism is designed to boost high frequently IO waiting tasks, while + * being more conservative on tasks which does sporadic IO operations. + */ +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long *util, unsigned long *max) { unsigned int boost_util, boost_max; + /* No boost currently required */ if (!sg_cpu->iowait_boost) return; + /* Reset boost if the CPU appears to have been idle enough */ + if (sugov_iowait_reset(sg_cpu, time, false)) + return; + + /* + * An IO waiting task has just woken up: + * allow to further double the boost value + */ if (sg_cpu->iowait_boost_pending) { sg_cpu->iowait_boost_pending = false; } else { + /* + * Otherwise: reduce the boost value and disable it when we + * reach the minimum. + */ sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { sg_cpu->iowait_boost = 0; @@ -242,9 +317,12 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, } } + /* + * Apply the current boost value: a CPU is boosted only if its current + * utilization is smaller then the current IO boost level. + */ boost_util = sg_cpu->iowait_boost; boost_max = sg_cpu->iowait_boost_max; - if (*util * boost_max < *max * boost_util) { *util = boost_util; *max = boost_max; @@ -283,7 +361,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; ignore_dl_rate_limit(sg_cpu, sg_policy); @@ -296,7 +374,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_get_util(sg_cpu); max = sg_cpu->max; util = sugov_aggregate_util(sg_cpu); - sugov_iowait_boost(sg_cpu, &util, &max); + sugov_iowait_apply(sg_cpu, time, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* * Do not reduce the frequency if the CPU has not been idle @@ -322,28 +400,12 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; - s64 delta_ns; sugov_get_util(j_sg_cpu); - - /* - * If the CFS CPU utilization was last updated before the - * previous frequency update and the time elapsed between the - * last update of the CPU utilization and the last frequency - * update is long enough, reset iowait_boost and util_cfs, as - * they are now probably stale. However, still consider the - * CPU contribution if it has some DEADLINE utilization - * (util_dl). - */ - delta_ns = time - j_sg_cpu->last_update; - if (delta_ns > TICK_NSEC) { - j_sg_cpu->iowait_boost = 0; - j_sg_cpu->iowait_boost_pending = false; - } - j_max = j_sg_cpu->max; j_util = sugov_aggregate_util(j_sg_cpu); - sugov_iowait_boost(j_sg_cpu, &j_util, &j_max); + sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); + if (j_util * max > j_max * util) { util = j_util; max = j_max; @@ -362,7 +424,7 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) raw_spin_lock(&sg_policy->update_lock); - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; ignore_dl_rate_limit(sg_cpu, sg_policy); -- cgit v1.2.3-59-g8ed1b From 036399782bf51dafb932b680b260936b2b5f8dd6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 22 May 2018 15:31:30 +0530 Subject: cpufreq: Rename cpufreq_can_do_remote_dvfs() This routine checks if the CPU running this code belongs to the policy of the target CPU or if not, can it do remote DVFS for it remotely. But the current name of it implies as if it is only about doing remote updates. Rename it to make it more relevant. Suggested-by: Rafael J. Wysocki Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_governor.c | 2 +- include/linux/cpufreq.h | 2 +- kernel/sched/cpufreq_schedutil.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index ca38229b045a..871bf9cf55cf 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -278,7 +278,7 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; u64 delta_ns, lst; - if (!cpufreq_can_do_remote_dvfs(policy_dbs->policy)) + if (!cpufreq_this_cpu_can_update(policy_dbs->policy)) return; /* diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 87f48dd932eb..882a9b9e34bc 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -571,7 +571,7 @@ struct governor_attr { size_t count); }; -static inline bool cpufreq_can_do_remote_dvfs(struct cpufreq_policy *policy) +static inline bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy) { /* * Allow remote callbacks if: diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 416b7d7853d4..caf435c14a52 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -89,7 +89,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) * schedule the kthread. */ if (sg_policy->policy->fast_switch_enabled && - !cpufreq_can_do_remote_dvfs(sg_policy->policy)) + !cpufreq_this_cpu_can_update(sg_policy->policy)) return false; if (sg_policy->work_in_progress) -- cgit v1.2.3-59-g8ed1b From 152db033d77589df9ff1b93c1b311d4cd2e93bd0 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 22 May 2018 15:55:53 -0700 Subject: schedutil: Allow cpufreq requests to be made even when kthread kicked Currently there is a chance of a schedutil cpufreq update request to be dropped if there is a pending update request. This pending request can be delayed if there is a scheduling delay of the irq_work and the wake up of the schedutil governor kthread. A very bad scenario is when a schedutil request was already just made, such as to reduce the CPU frequency, then a newer request to increase CPU frequency (even sched deadline urgent frequency increase requests) can be dropped, even though the rate limits suggest that its Ok to process a request. This is because of the way the work_in_progress flag is used. This patch improves the situation by allowing new requests to happen even though the old one is still being processed. Note that in this approach, if an irq_work was already issued, we just update next_freq and don't bother to queue another request so there's no extra work being done to make this happen. Acked-by: Viresh Kumar Acked-by: Juri Lelli Signed-off-by: Joel Fernandes (Google) Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index caf435c14a52..178946e36393 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -92,9 +92,6 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) !cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (sg_policy->work_in_progress) - return false; - if (unlikely(sg_policy->need_freq_update)) return true; @@ -121,7 +118,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, policy->cur = next_freq; trace_cpu_frequency(next_freq, smp_processor_id()); - } else { + } else if (!sg_policy->work_in_progress) { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } @@ -366,6 +363,13 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, ignore_dl_rate_limit(sg_cpu, sg_policy); + /* + * For slow-switch systems, single policy requests can't run at the + * moment if update is in progress, unless we acquire update_lock. + */ + if (sg_policy->work_in_progress) + return; + if (!sugov_should_update_freq(sg_policy, time)) return; @@ -440,13 +444,27 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + unsigned int freq; + unsigned long flags; + + /* + * Hold sg_policy->update_lock shortly to handle the case where: + * incase sg_policy->next_freq is read here, and then updated by + * sugov_update_shared just before work_in_progress is set to false + * here, we may miss queueing the new update. + * + * Note: If a work was queued after the update_lock is released, + * sugov_work will just be called again by kthread_work code; and the + * request will be proceed before the sugov thread sleeps. + */ + raw_spin_lock_irqsave(&sg_policy->update_lock, flags); + freq = sg_policy->next_freq; + sg_policy->work_in_progress = false; + raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags); mutex_lock(&sg_policy->work_lock); - __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, - CPUFREQ_RELATION_L); + __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L); mutex_unlock(&sg_policy->work_lock); - - sg_policy->work_in_progress = false; } static void sugov_irq_work(struct irq_work *irq_work) -- cgit v1.2.3-59-g8ed1b From a61dec7447456858dfc88fe056017a91ab903ed0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 May 2018 11:47:45 +0200 Subject: cpufreq: schedutil: Avoid missing updates for one-CPU policies Commit 152db033d775 (schedutil: Allow cpufreq requests to be made even when kthread kicked) made changes to prevent utilization updates from being discarded during processing a previous request, but it left a small window in which that still can happen in the one-CPU policy case. Namely, updates coming in after setting work_in_progress in sugov_update_commit() and clearing it in sugov_work() will still be dropped due to the work_in_progress check in sugov_update_single(). To close that window, rearrange the code so as to acquire the update lock around the deferred update branch in sugov_update_single() and drop the work_in_progress check from it. Signed-off-by: Rafael J. Wysocki Reviewed-by: Juri Lelli Acked-by: Viresh Kumar Reviewed-by: Joel Fernandes (Google) --- kernel/sched/cpufreq_schedutil.c | 70 +++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 178946e36393..fd76497efeb1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -100,25 +100,41 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) return delta_ns >= sg_policy->freq_update_delay_ns; } -static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) +static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) { - struct cpufreq_policy *policy = sg_policy->policy; - if (sg_policy->next_freq == next_freq) - return; + return false; sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; - if (policy->fast_switch_enabled) { - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; + return true; +} + +static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = sg_policy->policy; + + if (!sugov_update_next_freq(sg_policy, time, next_freq)) + return; + + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (!next_freq) + return; - policy->cur = next_freq; - trace_cpu_frequency(next_freq, smp_processor_id()); - } else if (!sg_policy->work_in_progress) { + policy->cur = next_freq; + trace_cpu_frequency(next_freq, smp_processor_id()); +} + +static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + if (!sugov_update_next_freq(sg_policy, time, next_freq)) + return; + + if (!sg_policy->work_in_progress) { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } @@ -363,13 +379,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, ignore_dl_rate_limit(sg_cpu, sg_policy); - /* - * For slow-switch systems, single policy requests can't run at the - * moment if update is in progress, unless we acquire update_lock. - */ - if (sg_policy->work_in_progress) - return; - if (!sugov_should_update_freq(sg_policy, time)) return; @@ -391,7 +400,18 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sg_policy->cached_raw_freq = 0; } - sugov_update_commit(sg_policy, time, next_f); + /* + * This code runs under rq->lock for the target CPU, so it won't run + * concurrently on two different CPUs for the same target and it is not + * necessary to acquire the lock in the fast switch case. + */ + if (sg_policy->policy->fast_switch_enabled) { + sugov_fast_switch(sg_policy, time, next_f); + } else { + raw_spin_lock(&sg_policy->update_lock); + sugov_deferred_update(sg_policy, time, next_f); + raw_spin_unlock(&sg_policy->update_lock); + } } static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) @@ -435,7 +455,11 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) if (sugov_should_update_freq(sg_policy, time)) { next_f = sugov_next_freq_shared(sg_cpu, time); - sugov_update_commit(sg_policy, time, next_f); + + if (sg_policy->policy->fast_switch_enabled) + sugov_fast_switch(sg_policy, time, next_f); + else + sugov_deferred_update(sg_policy, time, next_f); } raw_spin_unlock(&sg_policy->update_lock); @@ -450,11 +474,11 @@ static void sugov_work(struct kthread_work *work) /* * Hold sg_policy->update_lock shortly to handle the case where: * incase sg_policy->next_freq is read here, and then updated by - * sugov_update_shared just before work_in_progress is set to false + * sugov_deferred_update() just before work_in_progress is set to false * here, we may miss queueing the new update. * * Note: If a work was queued after the update_lock is released, - * sugov_work will just be called again by kthread_work code; and the + * sugov_work() will just be called again by kthread_work code; and the * request will be proceed before the sugov thread sleeps. */ raw_spin_lock_irqsave(&sg_policy->update_lock, flags); -- cgit v1.2.3-59-g8ed1b