From 389e4ecf5fec9464320971ec707893ccec5a04d1 Mon Sep 17 00:00:00 2001
From: Yue Hu
Date: Wed, 24 Feb 2021 14:39:27 +0800
Subject: cpufreq: schedutil: Call sugov_update_next_freq() before check to
 fast_switch_enabled

Note that sugov_update_next_freq() may return false, which means that the
caller sugov_fast_switch() will do nothing except the fast switch check.
Similarly, sugov_deferred_update() also has unnecessary raw_spin_{lock,unlock}
operations in sugov_update_single_freq() for that case.

So, let's call sugov_update_next_freq() before the fast switch check to avoid
the unnecessary behaviors above. Accordingly, update the interface of
sugov_deferred_update() and remove sugov_fast_switch(), since we will call
cpufreq_driver_fast_switch() directly instead.

Signed-off-by: Yue Hu
Acked-by: Viresh Kumar
Signed-off-by: Rafael J. Wysocki
---
 kernel/sched/cpufreq_schedutil.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 50cbad89f7fa..6ee9c9bbe505 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -114,19 +114,8 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
 	return true;
 }
 
-static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
-			      unsigned int next_freq)
+static void sugov_deferred_update(struct sugov_policy *sg_policy)
 {
-	if (sugov_update_next_freq(sg_policy, time, next_freq))
-		cpufreq_driver_fast_switch(sg_policy->policy, next_freq);
-}
-
-static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
-				  unsigned int next_freq)
-{
-	if (!sugov_update_next_freq(sg_policy, time, next_freq))
-		return;
-
 	if (!sg_policy->work_in_progress) {
 		sg_policy->work_in_progress = true;
 		irq_work_queue(&sg_policy->irq_work);
@@ -366,16 +355,19 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
 		sg_policy->cached_raw_freq = cached_freq;
 	}
 
+	if (!sugov_update_next_freq(sg_policy, time, next_f))
+		return;
+
 	/*
 	 * This code runs under rq->lock for the target CPU, so it won't run
 	 * concurrently on two different CPUs for the same target and it is not
 	 * necessary to acquire the lock in the fast switch case.
 	 */
 	if (sg_policy->policy->fast_switch_enabled) {
-		sugov_fast_switch(sg_policy, time, next_f);
+		cpufreq_driver_fast_switch(sg_policy->policy, next_f);
 	} else {
 		raw_spin_lock(&sg_policy->update_lock);
-		sugov_deferred_update(sg_policy, time, next_f);
+		sugov_deferred_update(sg_policy);
 		raw_spin_unlock(&sg_policy->update_lock);
 	}
 }
@@ -454,12 +446,15 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
 	if (sugov_should_update_freq(sg_policy, time)) {
 		next_f = sugov_next_freq_shared(sg_cpu, time);
 
+		if (!sugov_update_next_freq(sg_policy, time, next_f))
+			goto unlock;
+
 		if (sg_policy->policy->fast_switch_enabled)
-			sugov_fast_switch(sg_policy, time, next_f);
+			cpufreq_driver_fast_switch(sg_policy->policy, next_f);
 		else
-			sugov_deferred_update(sg_policy, time, next_f);
+			sugov_deferred_update(sg_policy);
 	}
-
+unlock:
 	raw_spin_unlock(&sg_policy->update_lock);
 }
 
--
cgit v1.2.3-59-g8ed1b
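
A condensed view of the resulting update path may help when reading the hunks
above. The sketch below is illustrative only and is not part of the patch: the
helper name sugov_update_commit() is invented here, and it models the
single-CPU path in sugov_update_single_freq() (the shared path is analogous,
except that it already holds update_lock and uses the new "unlock" label).

/*
 * Illustrative sketch (not literal kernel source): with the bookkeeping in
 * sugov_update_next_freq() done first, a no-change decision skips both the
 * fast-switch call and the lock/irq_work of the deferred path.
 */
static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
				unsigned int next_f)
{
	if (!sugov_update_next_freq(sg_policy, time, next_f))
		return;

	if (sg_policy->policy->fast_switch_enabled) {
		cpufreq_driver_fast_switch(sg_policy->policy, next_f);
	} else {
		raw_spin_lock(&sg_policy->update_lock);
		sugov_deferred_update(sg_policy);
		raw_spin_unlock(&sg_policy->update_lock);
	}
}
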
From 4c38f2df71c8e33c0b64865992d693f5022eeaad Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Tue, 23 Jun 2020 15:49:40 +0530
Subject: cpufreq: CPPC: Add support for frequency invariance

The Frequency Invariance Engine (FIE) provides a frequency scaling correction
factor that helps achieve more accurate load-tracking.

Normally, this scaling factor can be obtained directly with the help of the
cpufreq drivers as they know the exact frequency the hardware is running at.
But that isn't the case for the CPPC cpufreq driver. Another way of obtaining
it is to use the arch-specific counter support, which is already present in
the kernel, but that hardware is optional for platforms.

This patch updates the CPPC driver to register itself with the topology core
to provide its own implementation (cppc_scale_freq_tick()) of
topology_scale_freq_tick(), which gets called by the scheduler on every tick.
Note that the arch specific counters have higher priority than CPPC counters,
if available, though the CPPC driver doesn't need to have any special handling
for that.

On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we
reach here from hard-irq context), which then schedules a normal work item and
cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable based on
the counter updates since the last tick.

To allow platforms to disable this CPPC counter-based frequency invariance
support, this is all done under CONFIG_ACPI_CPPC_CPUFREQ_FIE, which is enabled
by default.

This also exports sched_setattr_nocheck() as the CPPC driver can be built as a
module.

Cc: linux-acpi@vger.kernel.org
Reviewed-by: Ionela Voinescu
Tested-by: Ionela Voinescu
Tested-by: Vincent Guittot
Acked-by: Rafael J. Wysocki
Signed-off-by: Viresh Kumar
---
 drivers/cpufreq/Kconfig.arm    |  10 ++
 drivers/cpufreq/cppc_cpufreq.c | 245 +++++++++++++++++++++++++++++++++++++++--
 include/linux/arch_topology.h  |   1 +
 kernel/sched/core.c            |   1 +
 4 files changed, 245 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index e65e0a43be64..a5c5f70acfc9 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -19,6 +19,16 @@ config ACPI_CPPC_CPUFREQ
 
 	  If in doubt, say N.
 
+config ACPI_CPPC_CPUFREQ_FIE
+	bool "Frequency Invariance support for CPPC cpufreq driver"
+	depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY
+	default y
+	help
+	  This extends frequency invariance support in the CPPC cpufreq driver,
+	  by using CPPC delivered and reference performance counters.
+
+	  If in doubt, say N.
+
 config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM
 	tristate "Allwinner nvmem based SUN50I CPUFreq driver"
 	depends on ARCH_SUNXI
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 8a482c434ea6..b8e1b8ea628c 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -10,14 +10,18 @@
 
 #define pr_fmt(fmt)	"CPPC Cpufreq:"	fmt
 
+#include <linux/arch_topology.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/dmi.h>
+#include <linux/irq_work.h>
+#include <linux/kthread.h>
 #include <linux/time.h>
 #include <linux/vmalloc.h>
+#include <uapi/linux/sched/types.h>
 
 #include <acpi/cppc_acpi.h>
 
@@ -57,6 +61,204 @@ static struct cppc_workaround_oem_info wa_info[] = {
 	}
 };
 
+#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
+
+/* Frequency invariance support */
+struct cppc_freq_invariance {
+	int cpu;
+	struct irq_work irq_work;
+	struct kthread_work work;
+	struct cppc_perf_fb_ctrs prev_perf_fb_ctrs;
+	struct cppc_cpudata *cpu_data;
+};
+
+static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv);
+static struct kthread_worker *kworker_fie;
+static bool fie_disabled;
+
+static struct cpufreq_driver cppc_cpufreq_driver;
+static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu);
+static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
+				 struct cppc_perf_fb_ctrs fb_ctrs_t0,
+				 struct cppc_perf_fb_ctrs fb_ctrs_t1);
+
+/**
+ * cppc_scale_freq_workfn - CPPC arch_freq_scale updater for frequency invariance
+ * @work: The work item.
+ *
+ * The CPPC driver register itself with the topology core to provide its own
+ * implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which
+ * gets called by the scheduler on every tick.
+ *
+ * Note that the arch specific counters have higher priority than CPPC counters,
+ * if available, though the CPPC driver doesn't need to have any special
+ * handling for that.
+ *
+ * On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we
+ * reach here from hard-irq context), which then schedules a normal work item
+ * and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable
+ * based on the counter updates since the last tick.
+ */
+static void cppc_scale_freq_workfn(struct kthread_work *work)
+{
+	struct cppc_freq_invariance *cppc_fi;
+	struct cppc_perf_fb_ctrs fb_ctrs = {0};
+	struct cppc_cpudata *cpu_data;
+	unsigned long local_freq_scale;
+	u64 perf;
+
+	cppc_fi = container_of(work, struct cppc_freq_invariance, work);
+	cpu_data = cppc_fi->cpu_data;
+
+	if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) {
+		pr_warn("%s: failed to read perf counters\n", __func__);
+		return;
+	}
+
+	cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
+	perf = cppc_perf_from_fbctrs(cpu_data, cppc_fi->prev_perf_fb_ctrs,
+				     fb_ctrs);
+
+	perf <<= SCHED_CAPACITY_SHIFT;
+	local_freq_scale = div64_u64(perf, cpu_data->perf_caps.highest_perf);
+	if (WARN_ON(local_freq_scale > 1024))
+		local_freq_scale = 1024;
+
+	per_cpu(arch_freq_scale, cppc_fi->cpu) = local_freq_scale;
+}
+
+static void cppc_irq_work(struct irq_work *irq_work)
+{
+	struct cppc_freq_invariance *cppc_fi;
+
+	cppc_fi = container_of(irq_work, struct cppc_freq_invariance, irq_work);
+	kthread_queue_work(kworker_fie, &cppc_fi->work);
+}
+
+static void cppc_scale_freq_tick(void)
+{
+	struct cppc_freq_invariance *cppc_fi = &per_cpu(cppc_freq_inv, smp_processor_id());
+
+	/*
+	 * cppc_get_perf_ctrs() can potentially sleep, call that from the right
+	 * context.
+	 */
+	irq_work_queue(&cppc_fi->irq_work);
+}
+
+static struct scale_freq_data cppc_sftd = {
+	.source = SCALE_FREQ_SOURCE_CPPC,
+	.set_freq_scale = cppc_scale_freq_tick,
+};
+
+static void cppc_freq_invariance_policy_init(struct cpufreq_policy *policy,
+					     struct cppc_cpudata *cpu_data)
+{
+	struct cppc_perf_fb_ctrs fb_ctrs = {0};
+	struct cppc_freq_invariance *cppc_fi;
+	int i, ret;
+
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	if (fie_disabled)
+		return;
+
+	for_each_cpu(i, policy->cpus) {
+		cppc_fi = &per_cpu(cppc_freq_inv, i);
+		cppc_fi->cpu = i;
+		cppc_fi->cpu_data = cpu_data;
+		kthread_init_work(&cppc_fi->work, cppc_scale_freq_workfn);
+		init_irq_work(&cppc_fi->irq_work, cppc_irq_work);
+
+		ret = cppc_get_perf_ctrs(i, &fb_ctrs);
+		if (ret) {
+			pr_warn("%s: failed to read perf counters: %d\n",
+				__func__, ret);
+			fie_disabled = true;
+		} else {
+			cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
+		}
+	}
+}
+
+static void __init cppc_freq_invariance_init(void)
+{
+	struct sched_attr attr = {
+		.size		= sizeof(struct sched_attr),
+		.sched_policy	= SCHED_DEADLINE,
+		.sched_nice	= 0,
+		.sched_priority	= 0,
+		/*
+		 * Fake (unused) bandwidth; workaround to "fix"
+		 * priority inheritance.
+		 */
+		.sched_runtime	= 1000000,
+		.sched_deadline = 10000000,
+		.sched_period	= 10000000,
+	};
+	int ret;
+
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	if (fie_disabled)
+		return;
+
+	kworker_fie = kthread_create_worker(0, "cppc_fie");
+	if (IS_ERR(kworker_fie))
+		return;
+
+	ret = sched_setattr_nocheck(kworker_fie->task, &attr);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_DEADLINE: %d\n", __func__,
+			ret);
+		kthread_destroy_worker(kworker_fie);
+		return;
+	}
+
+	/* Register for freq-invariance */
+	topology_set_scale_freq_source(&cppc_sftd, cpu_present_mask);
+}
+
+static void cppc_freq_invariance_exit(void)
+{
+	struct cppc_freq_invariance *cppc_fi;
+	int i;
+
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	if (fie_disabled)
+		return;
+
+	topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, cpu_present_mask);
+
+	for_each_possible_cpu(i) {
+		cppc_fi = &per_cpu(cppc_freq_inv, i);
+		irq_work_sync(&cppc_fi->irq_work);
+	}
+
+	kthread_destroy_worker(kworker_fie);
+	kworker_fie = NULL;
+}
+
+#else
+static inline void
+cppc_freq_invariance_policy_init(struct cpufreq_policy *policy,
+				 struct cppc_cpudata *cpu_data)
+{
+}
+
+static inline void cppc_freq_invariance_init(void)
+{
+}
+
+static inline void cppc_freq_invariance_exit(void)
+{
+}
+#endif /* CONFIG_ACPI_CPPC_CPUFREQ_FIE */
+
 /* Callback function used to retrieve the max frequency from DMI */
 static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private)
 {
@@ -355,9 +557,12 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	cpu_data->perf_ctrls.desired_perf = caps->highest_perf;
 
 	ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
-	if (ret)
+	if (ret) {
 		pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n",
 			 caps->highest_perf, cpu, ret);
+	} else {
+		cppc_freq_invariance_policy_init(policy, cpu_data);
+	}
 
 	return ret;
 }
@@ -370,12 +575,12 @@ static inline u64 get_delta(u64 t1, u64 t0)
 	return (u32)t1 - (u32)t0;
 }
 
-static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
-				     struct cppc_perf_fb_ctrs fb_ctrs_t0,
-				     struct cppc_perf_fb_ctrs fb_ctrs_t1)
+static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
+				 struct cppc_perf_fb_ctrs fb_ctrs_t0,
+				 struct cppc_perf_fb_ctrs fb_ctrs_t1)
 {
 	u64 delta_reference, delta_delivered;
-	u64 reference_perf, delivered_perf;
+	u64 reference_perf;
 
 	reference_perf = fb_ctrs_t0.reference_perf;
 
@@ -384,12 +589,21 @@ static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
 	delta_delivered = get_delta(fb_ctrs_t1.delivered,
 				    fb_ctrs_t0.delivered);
 
-	/* Check to avoid divide-by zero */
-	if (delta_reference || delta_delivered)
-		delivered_perf = (reference_perf * delta_delivered) /
-					delta_reference;
-	else
-		delivered_perf = cpu_data->perf_ctrls.desired_perf;
+	/* Check to avoid divide-by zero and invalid delivered_perf */
+	if (!delta_reference || !delta_delivered)
+		return cpu_data->perf_ctrls.desired_perf;
+
+	return (reference_perf * delta_delivered) / delta_reference;
+}
+
+static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
+				     struct cppc_perf_fb_ctrs fb_ctrs_t0,
+				     struct cppc_perf_fb_ctrs fb_ctrs_t1)
+{
+	u64 delivered_perf;
+
+	delivered_perf = cppc_perf_from_fbctrs(cpu_data, fb_ctrs_t0,
+					       fb_ctrs_t1);
 
 	return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf);
 }
@@ -514,6 +728,8 @@ static void cppc_check_hisi_workaround(void)
 
 static int __init cppc_cpufreq_init(void)
 {
+	int ret;
+
 	if ((acpi_disabled) || !acpi_cpc_valid())
 		return -ENODEV;
 
@@ -521,7 +737,11 @@ static int __init cppc_cpufreq_init(void)
 
 	cppc_check_hisi_workaround();
 
-	return cpufreq_register_driver(&cppc_cpufreq_driver);
+	ret = cpufreq_register_driver(&cppc_cpufreq_driver);
+	if (!ret)
+		cppc_freq_invariance_init();
+
+	return ret;
 }
 
 static inline void free_cpu_data(void)
@@ -538,6 +758,7 @@ static inline void free_cpu_data(void)
 
 static void __exit cppc_cpufreq_exit(void)
 {
+	cppc_freq_invariance_exit();
 	cpufreq_unregister_driver(&cppc_cpufreq_driver);
 
 	free_cpu_data();
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 11e555cfaecb..f180240dc95f 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -37,6 +37,7 @@ bool topology_scale_freq_invariant(void);
 enum scale_freq_source {
 	SCALE_FREQ_SOURCE_CPUFREQ = 0,
 	SCALE_FREQ_SOURCE_ARCH,
+	SCALE_FREQ_SOURCE_CPPC,
 };
 
 struct scale_freq_data {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ca2bb629595f..3adedc7b1725 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6386,6 +6386,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, false, true);
 }
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
 
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
--
cgit v1.2.3-59-g8ed1b
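
The arithmetic that turns a tick's worth of CPPC feedback counters into the
per-CPU scale factor is spread across cppc_perf_from_fbctrs() and
cppc_scale_freq_workfn() above. The fragment below pulls it together with
invented example numbers; it is an illustrative sketch only, not part of the
patch, and it assumes the surrounding declarations (fb_ctrs_t0/t1, cpu_data,
cpu) from the driver code.

/*
 * Illustrative sketch, not part of the patch: assume reference_perf = 100,
 * highest_perf = 300, and the delivered counter advanced twice as fast as
 * the reference counter over the last tick.
 */
u64 reference_perf = fb_ctrs_t0.reference_perf;		/* 100 here */
u64 delta_reference = get_delta(fb_ctrs_t1.reference, fb_ctrs_t0.reference);
u64 delta_delivered = get_delta(fb_ctrs_t1.delivered, fb_ctrs_t0.delivered);

/* delivered performance over the interval: 100 * 2 = 200 */
u64 perf = (reference_perf * delta_delivered) / delta_reference;

/* scale = (200 << 10) / 300 = 682, i.e. roughly 2/3 of SCHED_CAPACITY_SCALE */
unsigned long scale = div64_u64(perf << SCHED_CAPACITY_SHIFT,
				cpu_data->perf_caps.highest_perf);
if (scale > SCHED_CAPACITY_SCALE)
	scale = SCHED_CAPACITY_SCALE;

per_cpu(arch_freq_scale, cpu) = scale;
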
From 4c81cb7e64436a729cf20cdddaf18a9b4a638430 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Mon, 29 Mar 2021 20:13:37 +0200
Subject: tick/nohz: Improve tick_nohz_get_next_hrtimer() kerneldoc

Make the tick_nohz_get_next_hrtimer() kerneldoc comment state clearly
that the function may return negative numbers.

Signed-off-by: Rafael J. Wysocki
---
 kernel/time/tick-sched.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e10a4af88737..ee0032b95a7c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1124,7 +1124,11 @@ ktime_t tick_nohz_get_next_hrtimer(void)
  * tick_nohz_get_sleep_length - return the expected length of the current sleep
  * @delta_next: duration until the next event if the tick cannot be stopped
  *
- * Called from power state control code with interrupts disabled
+ * Called from power state control code with interrupts disabled.
+ *
+ * The return value of this function and/or the value returned by it through the
+ * @delta_next pointer can be negative which must be taken into account by its
+ * callers.
  */
 ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 {
--
cgit v1.2.3-59-g8ed1b

From e4b2897ae1a81cce79a8f5fcceda74c3f31c4b73 Mon Sep 17 00:00:00 2001
From: Lu Jialin
Date: Thu, 8 Apr 2021 16:14:44 +0800
Subject: PM: sleep: fix typos in comments

Change "occured" to "occurred" in kernel/power/autosleep.c.

Change "consiting" to "consisting" in kernel/power/snapshot.c.

Change "avaiable" to "available" in kernel/power/swap.c.

No functionality changed.

Signed-off-by: Lu Jialin
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/autosleep.c | 2 +-
 kernel/power/snapshot.c  | 2 +-
 kernel/power/swap.c      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index 9af5a50d3489..b29c8aca7486 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -54,7 +54,7 @@ static void try_to_suspend(struct work_struct *work)
 		goto out;
 
 	/*
-	 * If the wakeup occured for an unknown reason, wait to prevent the
+	 * If the wakeup occurred for an unknown reason, wait to prevent the
 	 * system from trying to suspend and waking up in a tight loop.
 	 */
 	if (final_count == initial_count)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d63560e1cf87..1a221dcb3c01 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -329,7 +329,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 /**
  * Data types related to memory bitmaps.
  *
- * Memory bitmap is a structure consiting of many linked lists of
+ * Memory bitmap is a structure consisting of many linked lists of
  * objects. The main list's elements are of type struct zone_bitmap
  * and each of them corresonds to one zone. For each zone bitmap
  * object there is a list of objects of type struct bm_block that
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 72e33054a2e1..bea3cb8afa11 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -884,7 +884,7 @@ out_clean:
  * enough_swap - Make sure we have enough swap to save the image.
  *
  * Returns TRUE or FALSE after checking the total amount of swap
- * space avaiable from the resume partition.
+ * space available from the resume partition.
  */
 
 static int enough_swap(unsigned int nr_pages)
--
cgit v1.2.3-59-g8ed1b