diff options
Diffstat (limited to 'drivers/cpuidle')
28 files changed, 2402 insertions, 676 deletions
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index c0aeedd66f02..ff71dd662880 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -47,6 +47,10 @@ config CPU_IDLE_GOV_HALTPOLL config DT_IDLE_STATES bool +config DT_IDLE_GENPD + depends on PM_GENERIC_DOMAINS_OF + bool + menu "ARM CPU Idle Drivers" depends on ARM || ARM64 source "drivers/cpuidle/Kconfig.arm" @@ -62,6 +66,11 @@ depends on PPC source "drivers/cpuidle/Kconfig.powerpc" endmenu +menu "RISC-V CPU Idle Drivers" +depends on RISCV +source "drivers/cpuidle/Kconfig.riscv" +endmenu + config HALTPOLL_CPUIDLE tristate "Halt poll cpuidle driver" depends on X86 && KVM_GUEST diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm index 62272ecfa771..747aa537389b 100644 --- a/drivers/cpuidle/Kconfig.arm +++ b/drivers/cpuidle/Kconfig.arm @@ -3,7 +3,8 @@ # ARM CPU Idle drivers # config ARM_CPUIDLE - bool "Generic ARM/ARM64 CPU idle Driver" + bool "Generic ARM CPU idle Driver" + depends on ARM select DT_IDLE_STATES select CPU_IDLE_MULTIPLE_DRIVERS help @@ -23,6 +24,17 @@ config ARM_PSCI_CPUIDLE It provides an idle driver that is capable of detecting and managing idle states through the PSCI firmware interface. +config ARM_PSCI_CPUIDLE_DOMAIN + bool "PSCI CPU idle Domain" + depends on ARM_PSCI_CPUIDLE + depends on PM_GENERIC_DOMAINS_OF + select DT_IDLE_GENPD + default y + help + Select this to enable the PSCI based CPUidle driver to use PM domains, + which is needed to support the hierarchical DT based layout of the + idle states. + config ARM_BIG_LITTLE_CPUIDLE bool "Support for ARM big.LITTLE processors" depends on ARCH_VEXPRESS_TC2_PM || ARCH_EXYNOS || COMPILE_TEST @@ -86,3 +98,25 @@ config ARM_MVEBU_V7_CPUIDLE depends on (ARCH_MVEBU || COMPILE_TEST) && !ARM64 help Select this to enable cpuidle on Armada 370, 38x and XP processors. + +config ARM_TEGRA_CPUIDLE + bool "CPU Idle Driver for NVIDIA Tegra SoCs" + depends on (ARCH_TEGRA || COMPILE_TEST) && !ARM64 && MMU + select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP + select ARM_CPU_SUSPEND + help + Select this to enable cpuidle for NVIDIA Tegra20/30/114/124 SoCs. + +config ARM_QCOM_SPM_CPUIDLE + bool "CPU Idle Driver for Qualcomm Subsystem Power Manager (SPM)" + depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64 && MMU + select ARM_CPU_SUSPEND + select CPU_IDLE_MULTIPLE_DRIVERS + select DT_IDLE_STATES + select QCOM_SCM + select QCOM_SPM + help + Select this to enable cpuidle for Qualcomm processors. + The Subsystem Power Manager (SPM) controls low power modes for the + CPU and L2 cores. It interface with various system drivers to put + the cores in low power modes. diff --git a/drivers/cpuidle/Kconfig.riscv b/drivers/cpuidle/Kconfig.riscv new file mode 100644 index 000000000000..78518c26af74 --- /dev/null +++ b/drivers/cpuidle/Kconfig.riscv @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# RISC-V CPU Idle drivers +# + +config RISCV_SBI_CPUIDLE + bool "RISC-V SBI CPU idle Driver" + depends on RISCV_SBI + select DT_IDLE_STATES + select CPU_IDLE_MULTIPLE_DRIVERS + select DT_IDLE_GENPD if PM_GENERIC_DOMAINS_OF + help + Select this option to enable RISC-V SBI firmware based CPU idle + driver for RISC-V systems. This drivers also supports hierarchical + DT based layout of the idle state. diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile index cc8c769d7fa9..d103342b7cfc 100644 --- a/drivers/cpuidle/Makefile +++ b/drivers/cpuidle/Makefile @@ -6,6 +6,7 @@ obj-y += cpuidle.o driver.o governor.o sysfs.o governors/ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o obj-$(CONFIG_DT_IDLE_STATES) += dt_idle_states.o +obj-$(CONFIG_DT_IDLE_GENPD) += dt_idle_genpd.o obj-$(CONFIG_ARCH_HAS_CPU_RELAX) += poll_state.o obj-$(CONFIG_HALTPOLL_CPUIDLE) += cpuidle-haltpoll.o @@ -21,9 +22,10 @@ obj-$(CONFIG_ARM_U8500_CPUIDLE) += cpuidle-ux500.o obj-$(CONFIG_ARM_AT91_CPUIDLE) += cpuidle-at91.o obj-$(CONFIG_ARM_EXYNOS_CPUIDLE) += cpuidle-exynos.o obj-$(CONFIG_ARM_CPUIDLE) += cpuidle-arm.o -obj-$(CONFIG_ARM_PSCI_CPUIDLE) += cpuidle_psci.o -cpuidle_psci-y := cpuidle-psci.o -cpuidle_psci-$(CONFIG_PM_GENERIC_DOMAINS_OF) += cpuidle-psci-domain.o +obj-$(CONFIG_ARM_PSCI_CPUIDLE) += cpuidle-psci.o +obj-$(CONFIG_ARM_PSCI_CPUIDLE_DOMAIN) += cpuidle-psci-domain.o +obj-$(CONFIG_ARM_TEGRA_CPUIDLE) += cpuidle-tegra.o +obj-$(CONFIG_ARM_QCOM_SPM_CPUIDLE) += cpuidle-qcom-spm.o ############################################################################### # MIPS drivers @@ -33,3 +35,7 @@ obj-$(CONFIG_MIPS_CPS_CPUIDLE) += cpuidle-cps.o # POWERPC drivers obj-$(CONFIG_PSERIES_CPUIDLE) += cpuidle-pseries.o obj-$(CONFIG_POWERNV_CPUIDLE) += cpuidle-powernv.o + +############################################################################### +# RISC-V drivers +obj-$(CONFIG_RISCV_SBI_CPUIDLE) += cpuidle-riscv-sbi.o diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index 04003b90dc49..9acde71558d5 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -54,7 +54,7 @@ * variable is not locked. It is only written from the cpu that * it stores (or by the on/offlining cpu if that cpu is offline), * and only read after all the cpus are ready for the coupled idle - * state are are no longer updating it. + * state are no longer updating it. * * Three atomic counters are used. alive_count tracks the number * of cpus in the coupled set that are currently or soon will be @@ -674,8 +674,7 @@ have_coupled: coupled->refcnt++; csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu); - csd->func = cpuidle_coupled_handle_poke; - csd->info = (void *)(unsigned long)dev->cpu; + INIT_CSD(csd, cpuidle_coupled_handle_poke, (void *)(unsigned long)dev->cpu); return 0; } diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c index 9e5156d39627..8c758920d699 100644 --- a/drivers/cpuidle/cpuidle-arm.c +++ b/drivers/cpuidle/cpuidle-arm.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "CPUidle arm: " fmt +#include <linux/cpu_cooling.h> #include <linux/cpuidle.h> #include <linux/cpumask.h> #include <linux/cpu_pm.h> @@ -124,6 +125,8 @@ static int __init arm_idle_init_cpu(int cpu) if (ret) goto out_kfree_drv; + cpuidle_cooling_register(drv); + return 0; out_kfree_drv: diff --git a/drivers/cpuidle/cpuidle-at91.c b/drivers/cpuidle/cpuidle-at91.c index 9c5853b6ca4a..45ee8e1e71ae 100644 --- a/drivers/cpuidle/cpuidle-at91.c +++ b/drivers/cpuidle/cpuidle-at91.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * based on arch/arm/mach-kirkwood/cpuidle.c * * CPU idle support for AT91 SoC * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. - * * The cpu idle uses wait-for-interrupt and RAM self refresh in order * to implement two idle states - * #1 wait-for-interrupt diff --git a/drivers/cpuidle/cpuidle-big_little.c b/drivers/cpuidle/cpuidle-big_little.c index 7f8ddc04342d..abe51185f243 100644 --- a/drivers/cpuidle/cpuidle-big_little.c +++ b/drivers/cpuidle/cpuidle-big_little.c @@ -155,8 +155,7 @@ static int __init bl_idle_driver_init(struct cpuidle_driver *drv, int part_id) static const struct of_device_id compatible_machine_match[] = { { .compatible = "arm,vexpress,v2p-ca15_a7" }, - { .compatible = "samsung,exynos5420" }, - { .compatible = "samsung,exynos5800" }, + { .compatible = "google,peach" }, {}, }; diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index b0ce9bc78113..3a39a7f48b77 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -18,6 +18,10 @@ #include <linux/kvm_para.h> #include <linux/cpuidle_haltpoll.h> +static bool force __read_mostly; +module_param(force, bool, 0444); +MODULE_PARM_DESC(force, "Load unconditionally"); + static struct cpuidle_device __percpu *haltpoll_cpuidle_devices; static enum cpuhp_state haltpoll_hp_state; @@ -90,6 +94,11 @@ static void haltpoll_uninit(void) haltpoll_cpuidle_devices = NULL; } +static bool haltpoll_want(void) +{ + return kvm_para_has_hint(KVM_HINTS_REALTIME) || force; +} + static int __init haltpoll_init(void) { int ret; @@ -99,12 +108,11 @@ static int __init haltpoll_init(void) if (boot_option_idle_override != IDLE_NO_OVERRIDE) return -ENODEV; - cpuidle_poll_state_init(drv); - - if (!kvm_para_available() || - !kvm_para_has_hint(KVM_HINTS_REALTIME)) + if (!kvm_para_available() || !haltpoll_want()) return -ENODEV; + cpuidle_poll_state_init(drv); + ret = cpuidle_register_driver(drv); if (ret < 0) return ret; diff --git a/drivers/cpuidle/cpuidle-kirkwood.c b/drivers/cpuidle/cpuidle-kirkwood.c index 511c4f46027a..13bf743f885b 100644 --- a/drivers/cpuidle/cpuidle-kirkwood.c +++ b/drivers/cpuidle/cpuidle-kirkwood.c @@ -1,10 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * CPU idle Marvell Kirkwood SoCs * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. - * * The cpu idle uses wait-for-interrupt and DDR self refresh in order * to implement two idle states - * #1 wait-for-interrupt diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1b299e801f74..0b5461b3d7dd 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -141,7 +141,7 @@ static int stop_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - power9_idle_type(stop_psscr_table[index].val, + arch300_idle_type(stop_psscr_table[index].val, stop_psscr_table[index].mask); return index; } @@ -233,8 +233,8 @@ static inline void add_powernv_state(int index, const char *name, unsigned int exit_latency, u64 psscr_val, u64 psscr_mask) { - strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN); - strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN); + strscpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN); + strscpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN); powernv_states[index].flags = flags; powernv_states[index].target_residency = target_residency; powernv_states[index].exit_latency = exit_latency; @@ -244,20 +244,6 @@ static inline void add_powernv_state(int index, const char *name, stop_psscr_table[index].mask = psscr_mask; } -/* - * Returns 0 if prop1_len == prop2_len. Else returns -1 - */ -static inline int validate_dt_prop_sizes(const char *prop1, int prop1_len, - const char *prop2, int prop2_len) -{ - if (prop1_len == prop2_len) - return 0; - - pr_warn("cpuidle-powernv: array sizes don't match for %s and %s\n", - prop1, prop2); - return -1; -} - extern u32 pnv_get_supported_cpuidle_states(void); static int powernv_add_idle_states(void) { diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c index 423f03bbeb74..821984947ed9 100644 --- a/drivers/cpuidle/cpuidle-psci-domain.c +++ b/drivers/cpuidle/cpuidle-psci-domain.c @@ -12,6 +12,7 @@ #include <linux/cpu.h> #include <linux/device.h> #include <linux/kernel.h> +#include <linux/platform_device.h> #include <linux/pm_domain.h> #include <linux/pm_runtime.h> #include <linux/psci.h> @@ -26,7 +27,7 @@ struct psci_pd_provider { }; static LIST_HEAD(psci_pd_providers); -static bool osi_mode_enabled __initdata; +static bool psci_pd_allow_domain_state; static int psci_pd_power_off(struct generic_pm_domain *pd) { @@ -36,6 +37,9 @@ static int psci_pd_power_off(struct generic_pm_domain *pd) if (!state->data) return 0; + if (!psci_pd_allow_domain_state) + return -EBUSY; + /* OSI mode is enabled, set the corresponding domain state. */ pd_state = state->data; psci_set_domain_state(*pd_state); @@ -43,73 +47,14 @@ static int psci_pd_power_off(struct generic_pm_domain *pd) return 0; } -static int __init psci_pd_parse_state_nodes(struct genpd_power_state *states, - int state_count) -{ - int i, ret; - u32 psci_state, *psci_state_buf; - - for (i = 0; i < state_count; i++) { - ret = psci_dt_parse_state_node(to_of_node(states[i].fwnode), - &psci_state); - if (ret) - goto free_state; - - psci_state_buf = kmalloc(sizeof(u32), GFP_KERNEL); - if (!psci_state_buf) { - ret = -ENOMEM; - goto free_state; - } - *psci_state_buf = psci_state; - states[i].data = psci_state_buf; - } - - return 0; - -free_state: - i--; - for (; i >= 0; i--) - kfree(states[i].data); - return ret; -} - -static int __init psci_pd_parse_states(struct device_node *np, - struct genpd_power_state **states, int *state_count) -{ - int ret; - - /* Parse the domain idle states. */ - ret = of_genpd_parse_idle_states(np, states, state_count); - if (ret) - return ret; - - /* Fill out the PSCI specifics for each found state. */ - ret = psci_pd_parse_state_nodes(*states, *state_count); - if (ret) - kfree(*states); - - return ret; -} - -static void psci_pd_free_states(struct genpd_power_state *states, - unsigned int state_count) -{ - int i; - - for (i = 0; i < state_count; i++) - kfree(states[i].data); - kfree(states); -} - -static int __init psci_pd_init(struct device_node *np) +static int psci_pd_init(struct device_node *np, bool use_osi) { struct generic_pm_domain *pd; struct psci_pd_provider *pd_provider; struct dev_power_governor *pd_gov; - struct genpd_power_state *states = NULL; - int ret = -ENOMEM, state_count = 0; + int ret = -ENOMEM; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); + pd = dt_idle_pd_alloc(np, psci_dt_parse_state_node); if (!pd) goto out; @@ -117,33 +62,20 @@ static int __init psci_pd_init(struct device_node *np) if (!pd_provider) goto free_pd; - pd->name = kasprintf(GFP_KERNEL, "%pOF", np); - if (!pd->name) - goto free_pd_prov; - - /* - * Parse the domain idle states and let genpd manage the state selection - * for those being compatible with "domain-idle-state". - */ - ret = psci_pd_parse_states(np, &states, &state_count); - if (ret) - goto free_name; - - pd->free_states = psci_pd_free_states; - pd->name = kbasename(pd->name); - pd->power_off = psci_pd_power_off; - pd->states = states; - pd->state_count = state_count; pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN; + /* Allow power off when OSI has been successfully enabled. */ + if (use_osi) + pd->power_off = psci_pd_power_off; + else + pd->flags |= GENPD_FLAG_ALWAYS_ON; + /* Use governor for CPU PM domains if it has some states to manage. */ - pd_gov = state_count > 0 ? &pm_domain_cpu_gov : NULL; + pd_gov = pd->states ? &pm_domain_cpu_gov : NULL; ret = pm_genpd_init(pd, pd_gov, false); - if (ret) { - psci_pd_free_states(states, state_count); - goto free_name; - } + if (ret) + goto free_pd_prov; ret = of_genpd_add_provider_simple(np, pd); if (ret) @@ -157,18 +89,16 @@ static int __init psci_pd_init(struct device_node *np) remove_pd: pm_genpd_remove(pd); -free_name: - kfree(pd->name); free_pd_prov: kfree(pd_provider); free_pd: - kfree(pd); + dt_idle_pd_free(pd); out: pr_err("failed to init PM domain ret=%d %pOF\n", ret, np); return ret; } -static void __init psci_pd_remove(void) +static void psci_pd_remove(void) { struct psci_pd_provider *pd_provider, *it; struct generic_pm_domain *genpd; @@ -186,59 +116,46 @@ static void __init psci_pd_remove(void) } } -static int __init psci_pd_init_topology(struct device_node *np, bool add) +static bool psci_pd_try_set_osi_mode(void) { - struct device_node *node; - struct of_phandle_args child, parent; int ret; - for_each_child_of_node(np, node) { - if (of_parse_phandle_with_args(node, "power-domains", - "#power-domain-cells", 0, &parent)) - continue; - - child.np = node; - child.args_count = 0; - - ret = add ? of_genpd_add_subdomain(&parent, &child) : - of_genpd_remove_subdomain(&parent, &child); - of_node_put(parent.np); - if (ret) { - of_node_put(node); - return ret; - } - } + if (!psci_has_osi_support()) + return false; - return 0; -} + ret = psci_set_osi_mode(true); + if (ret) + return false; -static int __init psci_pd_add_topology(struct device_node *np) -{ - return psci_pd_init_topology(np, true); + return true; } -static void __init psci_pd_remove_topology(struct device_node *np) +static void psci_cpuidle_domain_sync_state(struct device *dev) { - psci_pd_init_topology(np, false); + /* + * All devices have now been attached/probed to the PM domain topology, + * hence it's fine to allow domain states to be picked. + */ + psci_pd_allow_domain_state = true; } -static const struct of_device_id psci_of_match[] __initconst = { +static const struct of_device_id psci_of_match[] = { { .compatible = "arm,psci-1.0" }, {} }; -static int __init psci_idle_init_domains(void) +static int psci_cpuidle_domain_probe(struct platform_device *pdev) { - struct device_node *np = of_find_matching_node(NULL, psci_of_match); + struct device_node *np = pdev->dev.of_node; struct device_node *node; + bool use_osi; int ret = 0, pd_count = 0; if (!np) return -ENODEV; - /* Currently limit the hierarchical topology to be used in OSI mode. */ - if (!psci_has_osi_support()) - goto out; + /* If OSI mode is supported, let's try to enable it. */ + use_osi = psci_pd_try_set_osi_mode(); /* * Parse child nodes for the "#power-domain-cells" property and @@ -248,7 +165,7 @@ static int __init psci_idle_init_domains(void) if (!of_find_property(node, "#power-domain-cells", NULL)) continue; - ret = psci_pd_init(node); + ret = psci_pd_init(node, use_osi); if (ret) goto put_node; @@ -257,52 +174,38 @@ static int __init psci_idle_init_domains(void) /* Bail out if not using the hierarchical CPU topology. */ if (!pd_count) - goto out; + goto no_pd; /* Link genpd masters/subdomains to model the CPU topology. */ - ret = psci_pd_add_topology(np); + ret = dt_idle_pd_init_topology(np); if (ret) goto remove_pd; - /* Try to enable OSI mode. */ - ret = psci_set_osi_mode(); - if (ret) { - pr_warn("failed to enable OSI mode: %d\n", ret); - psci_pd_remove_topology(np); - goto remove_pd; - } - - osi_mode_enabled = true; - of_node_put(np); pr_info("Initialized CPU PM domain topology\n"); - return pd_count; + return 0; put_node: of_node_put(node); remove_pd: - if (pd_count) - psci_pd_remove(); + psci_pd_remove(); pr_err("failed to create CPU PM domains ret=%d\n", ret); -out: - of_node_put(np); +no_pd: + if (use_osi) + psci_set_osi_mode(false); return ret; } -subsys_initcall(psci_idle_init_domains); -struct device __init *psci_dt_attach_cpu(int cpu) -{ - struct device *dev; - - if (!osi_mode_enabled) - return NULL; - - dev = dev_pm_domain_attach_by_name(get_cpu_device(cpu), "psci"); - if (IS_ERR_OR_NULL(dev)) - return dev; - - pm_runtime_irq_safe(dev); - if (cpu_online(cpu)) - pm_runtime_get_sync(dev); +static struct platform_driver psci_cpuidle_domain_driver = { + .probe = psci_cpuidle_domain_probe, + .driver = { + .name = "psci-cpuidle-domain", + .of_match_table = psci_of_match, + .sync_state = psci_cpuidle_domain_sync_state, + }, +}; - return dev; +static int __init psci_idle_init_domains(void) +{ + return platform_driver_register(&psci_cpuidle_domain_driver); } +subsys_initcall(psci_idle_init_domains); diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c index edd7a54ef0d3..57bc3e3ae391 100644 --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -9,6 +9,7 @@ #define pr_fmt(fmt) "CPUidle PSCI: " fmt #include <linux/cpuhotplug.h> +#include <linux/cpu_cooling.h> #include <linux/cpuidle.h> #include <linux/cpumask.h> #include <linux/cpu_pm.h> @@ -16,9 +17,13 @@ #include <linux/module.h> #include <linux/of.h> #include <linux/of_device.h> +#include <linux/platform_device.h> #include <linux/psci.h> +#include <linux/pm_domain.h> #include <linux/pm_runtime.h> #include <linux/slab.h> +#include <linux/string.h> +#include <linux/syscore_ops.h> #include <asm/cpuidle.h> @@ -32,7 +37,7 @@ struct psci_cpuidle_data { static DEFINE_PER_CPU_READ_MOSTLY(struct psci_cpuidle_data, psci_cpuidle_data); static DEFINE_PER_CPU(u32, domain_state); -static bool psci_cpuidle_use_cpuhp __initdata; +static bool psci_cpuidle_use_cpuhp; void psci_set_domain_state(u32 state) { @@ -49,8 +54,9 @@ static inline int psci_enter_state(int idx, u32 state) return CPU_PM_CPU_IDLE_ENTER_PARAM(psci_cpu_suspend_enter, idx, state); } -static int psci_enter_domain_idle_state(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int idx) +static int __psci_enter_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx, + bool s2idle) { struct psci_cpuidle_data *data = this_cpu_ptr(&psci_cpuidle_data); u32 *states = data->psci_states; @@ -58,22 +64,51 @@ static int psci_enter_domain_idle_state(struct cpuidle_device *dev, u32 state; int ret; + ret = cpu_pm_enter(); + if (ret) + return -1; + /* Do runtime PM to manage a hierarchical CPU toplogy. */ - pm_runtime_put_sync_suspend(pd_dev); + ct_irq_enter_irqson(); + if (s2idle) + dev_pm_genpd_suspend(pd_dev); + else + pm_runtime_put_sync_suspend(pd_dev); + ct_irq_exit_irqson(); state = psci_get_domain_state(); if (!state) state = states[idx]; - ret = psci_enter_state(idx, state); + ret = psci_cpu_suspend_enter(state) ? -1 : idx; + + ct_irq_enter_irqson(); + if (s2idle) + dev_pm_genpd_resume(pd_dev); + else + pm_runtime_get_sync(pd_dev); + ct_irq_exit_irqson(); - pm_runtime_get_sync(pd_dev); + cpu_pm_exit(); /* Clear the domain state to start fresh when back from idle. */ psci_set_domain_state(0); return ret; } +static int psci_enter_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx) +{ + return __psci_enter_domain_idle_state(dev, drv, idx, false); +} + +static int psci_enter_s2idle_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int idx) +{ + return __psci_enter_domain_idle_state(dev, drv, idx, true); +} + static int psci_idle_cpuhp_up(unsigned int cpu) { struct device *pd_dev = __this_cpu_read(psci_cpuidle_data.dev); @@ -97,13 +132,58 @@ static int psci_idle_cpuhp_down(unsigned int cpu) return 0; } -static void __init psci_idle_init_cpuhp(void) +static void psci_idle_syscore_switch(bool suspend) +{ + bool cleared = false; + struct device *dev; + int cpu; + + for_each_possible_cpu(cpu) { + dev = per_cpu_ptr(&psci_cpuidle_data, cpu)->dev; + + if (dev && suspend) { + dev_pm_genpd_suspend(dev); + } else if (dev) { + dev_pm_genpd_resume(dev); + + /* Account for userspace having offlined a CPU. */ + if (pm_runtime_status_suspended(dev)) + pm_runtime_set_active(dev); + + /* Clear domain state to re-start fresh. */ + if (!cleared) { + psci_set_domain_state(0); + cleared = true; + } + } + } +} + +static int psci_idle_syscore_suspend(void) +{ + psci_idle_syscore_switch(true); + return 0; +} + +static void psci_idle_syscore_resume(void) +{ + psci_idle_syscore_switch(false); +} + +static struct syscore_ops psci_idle_syscore_ops = { + .suspend = psci_idle_syscore_suspend, + .resume = psci_idle_syscore_resume, +}; + +static void psci_idle_init_cpuhp(void) { int err; if (!psci_cpuidle_use_cpuhp) return; + register_syscore_ops(&psci_idle_syscore_ops); + err = cpuhp_setup_state_nocalls(CPUHP_AP_CPU_PM_STARTING, "cpuidle/psci:online", psci_idle_cpuhp_up, @@ -120,30 +200,13 @@ static int psci_enter_idle_state(struct cpuidle_device *dev, return psci_enter_state(idx, state[idx]); } -static struct cpuidle_driver psci_idle_driver __initdata = { - .name = "psci_idle", - .owner = THIS_MODULE, - /* - * PSCI idle states relies on architectural WFI to - * be represented as state index 0. - */ - .states[0] = { - .enter = psci_enter_idle_state, - .exit_latency = 1, - .target_residency = 1, - .power_usage = UINT_MAX, - .name = "WFI", - .desc = "ARM WFI", - } -}; - -static const struct of_device_id psci_idle_state_match[] __initconst = { +static const struct of_device_id psci_idle_state_match[] = { { .compatible = "arm,idle-state", .data = psci_enter_idle_state }, { }, }; -int __init psci_dt_parse_state_node(struct device_node *np, u32 *state) +int psci_dt_parse_state_node(struct device_node *np, u32 *state) { int err = of_property_read_u32(np, "arm,psci-suspend-param", state); @@ -160,9 +223,33 @@ int __init psci_dt_parse_state_node(struct device_node *np, u32 *state) return 0; } -static int __init psci_dt_cpu_init_idle(struct cpuidle_driver *drv, - struct device_node *cpu_node, - unsigned int state_count, int cpu) +static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv, + struct psci_cpuidle_data *data, + unsigned int state_count, int cpu) +{ + /* Currently limit the hierarchical topology to be used in OSI mode. */ + if (!psci_has_osi_support()) + return 0; + + data->dev = psci_dt_attach_cpu(cpu); + if (IS_ERR_OR_NULL(data->dev)) + return PTR_ERR_OR_ZERO(data->dev); + + /* + * Using the deepest state for the CPU to trigger a potential selection + * of a shared state for the domain, assumes the domain states are all + * deeper states. + */ + drv->states[state_count - 1].enter = psci_enter_domain_idle_state; + drv->states[state_count - 1].enter_s2idle = psci_enter_s2idle_domain_idle_state; + psci_cpuidle_use_cpuhp = true; + + return 0; +} + +static int psci_dt_cpu_init_idle(struct device *dev, struct cpuidle_driver *drv, + struct device_node *cpu_node, + unsigned int state_count, int cpu) { int i, ret = 0; u32 *psci_states; @@ -170,7 +257,8 @@ static int __init psci_dt_cpu_init_idle(struct cpuidle_driver *drv, struct psci_cpuidle_data *data = per_cpu_ptr(&psci_cpuidle_data, cpu); state_count++; /* Add WFI state too */ - psci_states = kcalloc(state_count, sizeof(*psci_states), GFP_KERNEL); + psci_states = devm_kcalloc(dev, state_count, sizeof(*psci_states), + GFP_KERNEL); if (!psci_states) return -ENOMEM; @@ -183,47 +271,26 @@ static int __init psci_dt_cpu_init_idle(struct cpuidle_driver *drv, of_node_put(state_node); if (ret) - goto free_mem; + return ret; pr_debug("psci-power-state %#x index %d\n", psci_states[i], i); } - if (i != state_count) { - ret = -ENODEV; - goto free_mem; - } - - /* Currently limit the hierarchical topology to be used in OSI mode. */ - if (psci_has_osi_support()) { - data->dev = psci_dt_attach_cpu(cpu); - if (IS_ERR(data->dev)) { - ret = PTR_ERR(data->dev); - goto free_mem; - } + if (i != state_count) + return -ENODEV; - /* - * Using the deepest state for the CPU to trigger a potential - * selection of a shared state for the domain, assumes the - * domain states are all deeper states. - */ - if (data->dev) { - drv->states[state_count - 1].enter = - psci_enter_domain_idle_state; - psci_cpuidle_use_cpuhp = true; - } - } + /* Initialize optional data, used for the hierarchical topology. */ + ret = psci_dt_cpu_init_topology(drv, data, state_count, cpu); + if (ret < 0) + return ret; /* Idle states parsed correctly, store them in the per-cpu struct. */ data->psci_states = psci_states; return 0; - -free_mem: - kfree(psci_states); - return ret; } -static __init int psci_cpu_init_idle(struct cpuidle_driver *drv, - unsigned int cpu, unsigned int state_count) +static int psci_cpu_init_idle(struct device *dev, struct cpuidle_driver *drv, + unsigned int cpu, unsigned int state_count) { struct device_node *cpu_node; int ret; @@ -239,14 +306,22 @@ static __init int psci_cpu_init_idle(struct cpuidle_driver *drv, if (!cpu_node) return -ENODEV; - ret = psci_dt_cpu_init_idle(drv, cpu_node, state_count, cpu); + ret = psci_dt_cpu_init_idle(dev, drv, cpu_node, state_count, cpu); of_node_put(cpu_node); return ret; } -static int __init psci_idle_init_cpu(int cpu) +static void psci_cpu_deinit_idle(int cpu) +{ + struct psci_cpuidle_data *data = per_cpu_ptr(&psci_cpuidle_data, cpu); + + psci_dt_detach_cpu(data->dev); + psci_cpuidle_use_cpuhp = false; +} + +static int psci_idle_init_cpu(struct device *dev, int cpu) { struct cpuidle_driver *drv; struct device_node *cpu_node; @@ -269,17 +344,26 @@ static int __init psci_idle_init_cpu(int cpu) if (ret) return ret; - drv = kmemdup(&psci_idle_driver, sizeof(*drv), GFP_KERNEL); + drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL); if (!drv) return -ENOMEM; + drv->name = "psci_idle"; + drv->owner = THIS_MODULE; drv->cpumask = (struct cpumask *)cpumask_of(cpu); /* - * Initialize idle states data, starting at index 1, since - * by default idle state 0 is the quiescent state reached - * by the cpu by executing the wfi instruction. - * + * PSCI idle states relies on architectural WFI to be represented as + * state index 0. + */ + drv->states[0].enter = psci_enter_idle_state; + drv->states[0].exit_latency = 1; + drv->states[0].target_residency = 1; + drv->states[0].power_usage = UINT_MAX; + strcpy(drv->states[0].name, "WFI"); + strcpy(drv->states[0].desc, "ARM WFI"); + + /* * If no DT idle states are detected (ret == 0) let the driver * initialization fail accordingly since there is no reason to * initialize the idle driver if only wfi is supported, the @@ -287,46 +371,45 @@ static int __init psci_idle_init_cpu(int cpu) * on idle entry. */ ret = dt_init_idle_driver(drv, psci_idle_state_match, 1); - if (ret <= 0) { - ret = ret ? : -ENODEV; - goto out_kfree_drv; - } + if (ret <= 0) + return ret ? : -ENODEV; /* * Initialize PSCI idle states. */ - ret = psci_cpu_init_idle(drv, cpu, ret); + ret = psci_cpu_init_idle(dev, drv, cpu, ret); if (ret) { pr_err("CPU %d failed to PSCI idle\n", cpu); - goto out_kfree_drv; + return ret; } ret = cpuidle_register(drv, NULL); if (ret) - goto out_kfree_drv; + goto deinit; - return 0; + cpuidle_cooling_register(drv); -out_kfree_drv: - kfree(drv); + return 0; +deinit: + psci_cpu_deinit_idle(cpu); return ret; } /* - * psci_idle_init - Initializes PSCI cpuidle driver + * psci_idle_probe - Initializes PSCI cpuidle driver * * Initializes PSCI cpuidle driver for all CPUs, if any CPU fails * to register cpuidle driver then rollback to cancel all CPUs * registration. */ -static int __init psci_idle_init(void) +static int psci_cpuidle_probe(struct platform_device *pdev) { int cpu, ret; struct cpuidle_driver *drv; struct cpuidle_device *dev; for_each_possible_cpu(cpu) { - ret = psci_idle_init_cpu(cpu); + ret = psci_idle_init_cpu(&pdev->dev, cpu); if (ret) goto out_fail; } @@ -339,9 +422,34 @@ out_fail: dev = per_cpu(cpuidle_devices, cpu); drv = cpuidle_get_cpu_driver(dev); cpuidle_unregister(drv); - kfree(drv); + psci_cpu_deinit_idle(cpu); } return ret; } + +static struct platform_driver psci_cpuidle_driver = { + .probe = psci_cpuidle_probe, + .driver = { + .name = "psci-cpuidle", + }, +}; + +static int __init psci_idle_init(void) +{ + struct platform_device *pdev; + int ret; + + ret = platform_driver_register(&psci_cpuidle_driver); + if (ret) + return ret; + + pdev = platform_device_register_simple("psci-cpuidle", -1, NULL, 0); + if (IS_ERR(pdev)) { + platform_driver_unregister(&psci_cpuidle_driver); + return PTR_ERR(pdev); + } + + return 0; +} device_initcall(psci_idle_init); diff --git a/drivers/cpuidle/cpuidle-psci.h b/drivers/cpuidle/cpuidle-psci.h index 7299a04dd467..4e132640ed64 100644 --- a/drivers/cpuidle/cpuidle-psci.h +++ b/drivers/cpuidle/cpuidle-psci.h @@ -3,15 +3,29 @@ #ifndef __CPUIDLE_PSCI_H #define __CPUIDLE_PSCI_H +struct device; struct device_node; void psci_set_domain_state(u32 state); -int __init psci_dt_parse_state_node(struct device_node *np, u32 *state); +int psci_dt_parse_state_node(struct device_node *np, u32 *state); + +#ifdef CONFIG_ARM_PSCI_CPUIDLE_DOMAIN + +#include "dt_idle_genpd.h" + +static inline struct device *psci_dt_attach_cpu(int cpu) +{ + return dt_idle_attach_cpu(cpu, "psci"); +} + +static inline void psci_dt_detach_cpu(struct device *dev) +{ + dt_idle_detach_cpu(dev); +} -#ifdef CONFIG_PM_GENERIC_DOMAINS_OF -struct device __init *psci_dt_attach_cpu(int cpu); #else -static inline struct device __init *psci_dt_attach_cpu(int cpu) { return NULL; } +static inline struct device *psci_dt_attach_cpu(int cpu) { return NULL; } +static inline void psci_dt_detach_cpu(struct device *dev) { } #endif #endif /* __CPUIDLE_PSCI_H */ diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index 74c247972bb3..7e7ab5597d7a 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -19,9 +19,11 @@ #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/runlatch.h> +#include <asm/idle.h> #include <asm/plpar_wrappers.h> +#include <asm/rtas.h> -struct cpuidle_driver pseries_idle_driver = { +static struct cpuidle_driver pseries_idle_driver = { .name = "pseries_idle", .owner = THIS_MODULE, }; @@ -31,39 +33,15 @@ static struct cpuidle_state *cpuidle_state_table __read_mostly; static u64 snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; -static inline void idle_loop_prolog(unsigned long *in_purr) -{ - ppc64_runlatch_off(); - *in_purr = mfspr(SPRN_PURR); - /* - * Indicate to the HV that we are idle. Now would be - * a good time to find other work to dispatch. - */ - get_lppaca()->idle = 1; -} - -static inline void idle_loop_epilog(unsigned long in_purr) -{ - u64 wait_cycles; - - wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles); - wait_cycles += mfspr(SPRN_PURR) - in_purr; - get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles); - get_lppaca()->idle = 0; - - ppc64_runlatch_on(); -} - static int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - unsigned long in_purr; u64 snooze_exit_time; set_thread_flag(TIF_POLLING_NRFLAG); - idle_loop_prolog(&in_purr); + pseries_idle_prolog(); local_irq_enable(); snooze_exit_time = get_tb() + snooze_timeout; @@ -87,7 +65,7 @@ static int snooze_loop(struct cpuidle_device *dev, local_irq_disable(); - idle_loop_epilog(in_purr); + pseries_idle_epilog(); return index; } @@ -109,22 +87,152 @@ static void check_and_cede_processor(void) } } +/* + * XCEDE: Extended CEDE states discovered through the + * "ibm,get-systems-parameter" RTAS call with the token + * CEDE_LATENCY_TOKEN + */ + +/* + * Section 7.3.16 System Parameters Option of PAPR version 2.8.1 has a + * table with all the parameters to ibm,get-system-parameters. + * CEDE_LATENCY_TOKEN corresponds to the token value for Cede Latency + * Settings Information. + */ +#define CEDE_LATENCY_TOKEN 45 + +/* + * If the platform supports the cede latency settings information system + * parameter it must provide the following information in the NULL terminated + * parameter string: + * + * a. The first byte is the length “N” of each cede latency setting record minus + * one (zero indicates a length of 1 byte). + * + * b. For each supported cede latency setting a cede latency setting record + * consisting of the first “N” bytes as per the following table. + * + * ----------------------------- + * | Field | Field | + * | Name | Length | + * ----------------------------- + * | Cede Latency | 1 Byte | + * | Specifier Value | | + * ----------------------------- + * | Maximum wakeup | | + * | latency in | 8 Bytes | + * | tb-ticks | | + * ----------------------------- + * | Responsive to | | + * | external | 1 Byte | + * | interrupts | | + * ----------------------------- + * + * This version has cede latency record size = 10. + * + * The structure xcede_latency_payload represents a) and b) with + * xcede_latency_record representing the table in b). + * + * xcede_latency_parameter is what gets returned by + * ibm,get-systems-parameter RTAS call when made with + * CEDE_LATENCY_TOKEN. + * + * These structures are only used to represent the data obtained by the RTAS + * call. The data is in big-endian. + */ +struct xcede_latency_record { + u8 hint; + __be64 latency_ticks; + u8 wake_on_irqs; +} __packed; + +// Make space for 16 records, which "should be enough". +struct xcede_latency_payload { + u8 record_size; + struct xcede_latency_record records[16]; +} __packed; + +struct xcede_latency_parameter { + __be16 payload_size; + struct xcede_latency_payload payload; + u8 null_char; +} __packed; + +static unsigned int nr_xcede_records; +static struct xcede_latency_parameter xcede_latency_parameter __initdata; + +static int __init parse_cede_parameters(void) +{ + struct xcede_latency_payload *payload; + u32 total_xcede_records_size; + u8 xcede_record_size; + u16 payload_size; + int ret, i; + + ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter), + sizeof(xcede_latency_parameter)); + if (ret) { + pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n"); + return ret; + } + + payload_size = be16_to_cpu(xcede_latency_parameter.payload_size); + payload = &xcede_latency_parameter.payload; + + xcede_record_size = payload->record_size + 1; + + if (xcede_record_size != sizeof(struct xcede_latency_record)) { + pr_err("xcede: Expected record-size %lu. Observed size %u.\n", + sizeof(struct xcede_latency_record), xcede_record_size); + return -EINVAL; + } + + pr_info("xcede: xcede_record_size = %d\n", xcede_record_size); + + /* + * Since the payload_size includes the last NULL byte and the + * xcede_record_size, the remaining bytes correspond to array of all + * cede_latency settings. + */ + total_xcede_records_size = payload_size - 2; + nr_xcede_records = total_xcede_records_size / xcede_record_size; + + for (i = 0; i < nr_xcede_records; i++) { + struct xcede_latency_record *record = &payload->records[i]; + u64 latency_ticks = be64_to_cpu(record->latency_ticks); + u8 wake_on_irqs = record->wake_on_irqs; + u8 hint = record->hint; + + pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n", + i, hint, latency_ticks, wake_on_irqs); + } + + return 0; +} + +#define NR_DEDICATED_STATES 2 /* snooze, CEDE */ +static u8 cede_latency_hint[NR_DEDICATED_STATES]; + static int dedicated_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - unsigned long in_purr; + u8 old_latency_hint; - idle_loop_prolog(&in_purr); + pseries_idle_prolog(); get_lppaca()->donate_dedicated_cpu = 1; + old_latency_hint = get_lppaca()->cede_latency_hint; + get_lppaca()->cede_latency_hint = cede_latency_hint[index]; HMT_medium(); check_and_cede_processor(); local_irq_disable(); get_lppaca()->donate_dedicated_cpu = 0; + get_lppaca()->cede_latency_hint = old_latency_hint; - idle_loop_epilog(in_purr); + pseries_idle_epilog(); return index; } @@ -133,9 +241,8 @@ static int shared_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - unsigned long in_purr; - idle_loop_prolog(&in_purr); + pseries_idle_prolog(); /* * Yield the processor to the hypervisor. We return if @@ -147,7 +254,7 @@ static int shared_cede_loop(struct cpuidle_device *dev, check_and_cede_processor(); local_irq_disable(); - idle_loop_epilog(in_purr); + pseries_idle_epilog(); return index; } @@ -155,7 +262,7 @@ static int shared_cede_loop(struct cpuidle_device *dev, /* * States for dedicated partition case. */ -static struct cpuidle_state dedicated_states[] = { +static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = { { /* Snooze */ .name = "snooze", .desc = "snooze", @@ -236,11 +343,67 @@ static int pseries_cpuidle_driver_init(void) return 0; } +static void __init fixup_cede0_latency(void) +{ + struct xcede_latency_payload *payload; + u64 min_xcede_latency_us = UINT_MAX; + int i; + + if (parse_cede_parameters()) + return; + + pr_info("cpuidle: Skipping the %d Extended CEDE idle states\n", + nr_xcede_records); + + payload = &xcede_latency_parameter.payload; + + /* + * The CEDE idle state maps to CEDE(0). While the hypervisor + * does not advertise CEDE(0) exit latency values, it does + * advertise the latency values of the extended CEDE states. + * We use the lowest advertised exit latency value as a proxy + * for the exit latency of CEDE(0). + */ + for (i = 0; i < nr_xcede_records; i++) { + struct xcede_latency_record *record = &payload->records[i]; + u8 hint = record->hint; + u64 latency_tb = be64_to_cpu(record->latency_ticks); + u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC); + + /* + * We expect the exit latency of an extended CEDE + * state to be non-zero, it to since it takes at least + * a few nanoseconds to wakeup the idle CPU and + * dispatch the virtual processor into the Linux + * Guest. + * + * So we consider only non-zero value for performing + * the fixup of CEDE(0) latency. + */ + if (latency_us == 0) { + pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n", + i, hint); + continue; + } + + if (latency_us < min_xcede_latency_us) + min_xcede_latency_us = latency_us; + } + + if (min_xcede_latency_us != UINT_MAX) { + dedicated_states[1].exit_latency = min_xcede_latency_us; + dedicated_states[1].target_residency = 10 * (min_xcede_latency_us); + pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n", + min_xcede_latency_us); + } + +} + /* * pseries_idle_probe() * Choose state table for shared versus dedicated partition */ -static int pseries_idle_probe(void) +static int __init pseries_idle_probe(void) { if (cpuidle_disable != IDLE_NO_OVERRIDE) @@ -257,8 +420,23 @@ static int pseries_idle_probe(void) cpuidle_state_table = shared_states; max_idle_state = ARRAY_SIZE(shared_states); } else { + /* + * Use firmware provided latency values + * starting with POWER10 platforms. In the + * case that we are running on a POWER10 + * platform but in an earlier compat mode, we + * can still use the firmware provided values. + * + * However, on platforms prior to POWER10, we + * cannot rely on the accuracy of the firmware + * provided latency values. On such platforms, + * go with the conservative default estimate + * of 10us. + */ + if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10)) + fixup_cede0_latency(); cpuidle_state_table = dedicated_states; - max_idle_state = ARRAY_SIZE(dedicated_states); + max_idle_state = NR_DEDICATED_STATES; } } else return -ENODEV; diff --git a/drivers/cpuidle/cpuidle-qcom-spm.c b/drivers/cpuidle/cpuidle-qcom-spm.c new file mode 100644 index 000000000000..beedf22cbe78 --- /dev/null +++ b/drivers/cpuidle/cpuidle-qcom-spm.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2011-2014, The Linux Foundation. All rights reserved. + * Copyright (c) 2014,2015, Linaro Ltd. + * + * SAW power controller driver + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_device.h> +#include <linux/err.h> +#include <linux/platform_device.h> +#include <linux/cpuidle.h> +#include <linux/cpu_pm.h> +#include <linux/qcom_scm.h> +#include <soc/qcom/spm.h> + +#include <asm/proc-fns.h> +#include <asm/suspend.h> + +#include "dt_idle_states.h" + +struct cpuidle_qcom_spm_data { + struct cpuidle_driver cpuidle_driver; + struct spm_driver_data *spm; +}; + +static int qcom_pm_collapse(unsigned long int unused) +{ + qcom_scm_cpu_power_down(QCOM_SCM_CPU_PWR_DOWN_L2_ON); + + /* + * Returns here only if there was a pending interrupt and we did not + * power down as a result. + */ + return -1; +} + +static int qcom_cpu_spc(struct spm_driver_data *drv) +{ + int ret; + + spm_set_low_power_mode(drv, PM_SLEEP_MODE_SPC); + ret = cpu_suspend(0, qcom_pm_collapse); + /* + * ARM common code executes WFI without calling into our driver and + * if the SPM mode is not reset, then we may accidently power down the + * cpu when we intended only to gate the cpu clock. + * Ensure the state is set to standby before returning. + */ + spm_set_low_power_mode(drv, PM_SLEEP_MODE_STBY); + + return ret; +} + +static int spm_enter_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx) +{ + struct cpuidle_qcom_spm_data *data = container_of(drv, struct cpuidle_qcom_spm_data, + cpuidle_driver); + + return CPU_PM_CPU_IDLE_ENTER_PARAM(qcom_cpu_spc, idx, data->spm); +} + +static struct cpuidle_driver qcom_spm_idle_driver = { + .name = "qcom_spm", + .owner = THIS_MODULE, + .states[0] = { + .enter = spm_enter_idle_state, + .exit_latency = 1, + .target_residency = 1, + .power_usage = UINT_MAX, + .name = "WFI", + .desc = "ARM WFI", + } +}; + +static const struct of_device_id qcom_idle_state_match[] = { + { .compatible = "qcom,idle-state-spc", .data = spm_enter_idle_state }, + { }, +}; + +static int spm_cpuidle_register(struct device *cpuidle_dev, int cpu) +{ + struct platform_device *pdev = NULL; + struct device_node *cpu_node, *saw_node; + struct cpuidle_qcom_spm_data *data = NULL; + int ret; + + cpu_node = of_cpu_device_node_get(cpu); + if (!cpu_node) + return -ENODEV; + + saw_node = of_parse_phandle(cpu_node, "qcom,saw", 0); + if (!saw_node) + return -ENODEV; + + pdev = of_find_device_by_node(saw_node); + of_node_put(saw_node); + of_node_put(cpu_node); + if (!pdev) + return -ENODEV; + + data = devm_kzalloc(cpuidle_dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->spm = dev_get_drvdata(&pdev->dev); + if (!data->spm) + return -EINVAL; + + data->cpuidle_driver = qcom_spm_idle_driver; + data->cpuidle_driver.cpumask = (struct cpumask *)cpumask_of(cpu); + + ret = dt_init_idle_driver(&data->cpuidle_driver, + qcom_idle_state_match, 1); + if (ret <= 0) + return ret ? : -ENODEV; + + return cpuidle_register(&data->cpuidle_driver, NULL); +} + +static int spm_cpuidle_drv_probe(struct platform_device *pdev) +{ + int cpu, ret; + + if (!qcom_scm_is_available()) + return -EPROBE_DEFER; + + ret = qcom_scm_set_warm_boot_addr(cpu_resume_arm); + if (ret) + return dev_err_probe(&pdev->dev, ret, "set warm boot addr failed"); + + for_each_possible_cpu(cpu) { + ret = spm_cpuidle_register(&pdev->dev, cpu); + if (ret && ret != -ENODEV) { + dev_err(&pdev->dev, + "Cannot register for CPU%d: %d\n", cpu, ret); + } + } + + return 0; +} + +static struct platform_driver spm_cpuidle_driver = { + .probe = spm_cpuidle_drv_probe, + .driver = { + .name = "qcom-spm-cpuidle", + .suppress_bind_attrs = true, + }, +}; + +static bool __init qcom_spm_find_any_cpu(void) +{ + struct device_node *cpu_node, *saw_node; + + for_each_of_cpu_node(cpu_node) { + saw_node = of_parse_phandle(cpu_node, "qcom,saw", 0); + if (of_device_is_available(saw_node)) { + of_node_put(saw_node); + of_node_put(cpu_node); + return true; + } + of_node_put(saw_node); + } + return false; +} + +static int __init qcom_spm_cpuidle_init(void) +{ + struct platform_device *pdev; + int ret; + + ret = platform_driver_register(&spm_cpuidle_driver); + if (ret) + return ret; + + /* Make sure there is actually any CPU managed by the SPM */ + if (!qcom_spm_find_any_cpu()) + return 0; + + pdev = platform_device_register_simple("qcom-spm-cpuidle", + -1, NULL, 0); + if (IS_ERR(pdev)) { + platform_driver_unregister(&spm_cpuidle_driver); + return PTR_ERR(pdev); + } + + return 0; +} +device_initcall(qcom_spm_cpuidle_init); diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c new file mode 100644 index 000000000000..05fe2902df9a --- /dev/null +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -0,0 +1,633 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RISC-V SBI CPU idle driver. + * + * Copyright (c) 2021 Western Digital Corporation or its affiliates. + * Copyright (c) 2022 Ventana Micro Systems Inc. + */ + +#define pr_fmt(fmt) "cpuidle-riscv-sbi: " fmt + +#include <linux/cpuidle.h> +#include <linux/cpumask.h> +#include <linux/cpu_pm.h> +#include <linux/cpu_cooling.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/slab.h> +#include <linux/platform_device.h> +#include <linux/pm_domain.h> +#include <linux/pm_runtime.h> +#include <asm/cpuidle.h> +#include <asm/sbi.h> +#include <asm/smp.h> +#include <asm/suspend.h> + +#include "dt_idle_states.h" +#include "dt_idle_genpd.h" + +struct sbi_cpuidle_data { + u32 *states; + struct device *dev; +}; + +struct sbi_domain_state { + bool available; + u32 state; +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct sbi_cpuidle_data, sbi_cpuidle_data); +static DEFINE_PER_CPU(struct sbi_domain_state, domain_state); +static bool sbi_cpuidle_use_osi; +static bool sbi_cpuidle_use_cpuhp; +static bool sbi_cpuidle_pd_allow_domain_state; + +static inline void sbi_set_domain_state(u32 state) +{ + struct sbi_domain_state *data = this_cpu_ptr(&domain_state); + + data->available = true; + data->state = state; +} + +static inline u32 sbi_get_domain_state(void) +{ + struct sbi_domain_state *data = this_cpu_ptr(&domain_state); + + return data->state; +} + +static inline void sbi_clear_domain_state(void) +{ + struct sbi_domain_state *data = this_cpu_ptr(&domain_state); + + data->available = false; +} + +static inline bool sbi_is_domain_state_available(void) +{ + struct sbi_domain_state *data = this_cpu_ptr(&domain_state); + + return data->available; +} + +static int sbi_suspend_finisher(unsigned long suspend_type, + unsigned long resume_addr, + unsigned long opaque) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND, + suspend_type, resume_addr, opaque, 0, 0, 0); + + return (ret.error) ? sbi_err_map_linux_errno(ret.error) : 0; +} + +static int sbi_suspend(u32 state) +{ + if (state & SBI_HSM_SUSP_NON_RET_BIT) + return cpu_suspend(state, sbi_suspend_finisher); + else + return sbi_suspend_finisher(state, 0, 0); +} + +static int sbi_cpuidle_enter_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx) +{ + u32 *states = __this_cpu_read(sbi_cpuidle_data.states); + u32 state = states[idx]; + + if (state & SBI_HSM_SUSP_NON_RET_BIT) + return CPU_PM_CPU_IDLE_ENTER_PARAM(sbi_suspend, idx, state); + else + return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(sbi_suspend, + idx, state); +} + +static int __sbi_enter_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx, + bool s2idle) +{ + struct sbi_cpuidle_data *data = this_cpu_ptr(&sbi_cpuidle_data); + u32 *states = data->states; + struct device *pd_dev = data->dev; + u32 state; + int ret; + + ret = cpu_pm_enter(); + if (ret) + return -1; + + /* Do runtime PM to manage a hierarchical CPU toplogy. */ + ct_irq_enter_irqson(); + if (s2idle) + dev_pm_genpd_suspend(pd_dev); + else + pm_runtime_put_sync_suspend(pd_dev); + ct_irq_exit_irqson(); + + if (sbi_is_domain_state_available()) + state = sbi_get_domain_state(); + else + state = states[idx]; + + ret = sbi_suspend(state) ? -1 : idx; + + ct_irq_enter_irqson(); + if (s2idle) + dev_pm_genpd_resume(pd_dev); + else + pm_runtime_get_sync(pd_dev); + ct_irq_exit_irqson(); + + cpu_pm_exit(); + + /* Clear the domain state to start fresh when back from idle. */ + sbi_clear_domain_state(); + return ret; +} + +static int sbi_enter_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx) +{ + return __sbi_enter_domain_idle_state(dev, drv, idx, false); +} + +static int sbi_enter_s2idle_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int idx) +{ + return __sbi_enter_domain_idle_state(dev, drv, idx, true); +} + +static int sbi_cpuidle_cpuhp_up(unsigned int cpu) +{ + struct device *pd_dev = __this_cpu_read(sbi_cpuidle_data.dev); + + if (pd_dev) + pm_runtime_get_sync(pd_dev); + + return 0; +} + +static int sbi_cpuidle_cpuhp_down(unsigned int cpu) +{ + struct device *pd_dev = __this_cpu_read(sbi_cpuidle_data.dev); + + if (pd_dev) { + pm_runtime_put_sync(pd_dev); + /* Clear domain state to start fresh at next online. */ + sbi_clear_domain_state(); + } + + return 0; +} + +static void sbi_idle_init_cpuhp(void) +{ + int err; + + if (!sbi_cpuidle_use_cpuhp) + return; + + err = cpuhp_setup_state_nocalls(CPUHP_AP_CPU_PM_STARTING, + "cpuidle/sbi:online", + sbi_cpuidle_cpuhp_up, + sbi_cpuidle_cpuhp_down); + if (err) + pr_warn("Failed %d while setup cpuhp state\n", err); +} + +static const struct of_device_id sbi_cpuidle_state_match[] = { + { .compatible = "riscv,idle-state", + .data = sbi_cpuidle_enter_state }, + { }, +}; + +static bool sbi_suspend_state_is_valid(u32 state) +{ + if (state > SBI_HSM_SUSPEND_RET_DEFAULT && + state < SBI_HSM_SUSPEND_RET_PLATFORM) + return false; + if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT && + state < SBI_HSM_SUSPEND_NON_RET_PLATFORM) + return false; + return true; +} + +static int sbi_dt_parse_state_node(struct device_node *np, u32 *state) +{ + int err = of_property_read_u32(np, "riscv,sbi-suspend-param", state); + + if (err) { + pr_warn("%pOF missing riscv,sbi-suspend-param property\n", np); + return err; + } + + if (!sbi_suspend_state_is_valid(*state)) { + pr_warn("Invalid SBI suspend state %#x\n", *state); + return -EINVAL; + } + + return 0; +} + +static int sbi_dt_cpu_init_topology(struct cpuidle_driver *drv, + struct sbi_cpuidle_data *data, + unsigned int state_count, int cpu) +{ + /* Currently limit the hierarchical topology to be used in OSI mode. */ + if (!sbi_cpuidle_use_osi) + return 0; + + data->dev = dt_idle_attach_cpu(cpu, "sbi"); + if (IS_ERR_OR_NULL(data->dev)) + return PTR_ERR_OR_ZERO(data->dev); + + /* + * Using the deepest state for the CPU to trigger a potential selection + * of a shared state for the domain, assumes the domain states are all + * deeper states. + */ + drv->states[state_count - 1].enter = sbi_enter_domain_idle_state; + drv->states[state_count - 1].enter_s2idle = + sbi_enter_s2idle_domain_idle_state; + sbi_cpuidle_use_cpuhp = true; + + return 0; +} + +static int sbi_cpuidle_dt_init_states(struct device *dev, + struct cpuidle_driver *drv, + unsigned int cpu, + unsigned int state_count) +{ + struct sbi_cpuidle_data *data = per_cpu_ptr(&sbi_cpuidle_data, cpu); + struct device_node *state_node; + struct device_node *cpu_node; + u32 *states; + int i, ret; + + cpu_node = of_cpu_device_node_get(cpu); + if (!cpu_node) + return -ENODEV; + + states = devm_kcalloc(dev, state_count, sizeof(*states), GFP_KERNEL); + if (!states) { + ret = -ENOMEM; + goto fail; + } + + /* Parse SBI specific details from state DT nodes */ + for (i = 1; i < state_count; i++) { + state_node = of_get_cpu_state_node(cpu_node, i - 1); + if (!state_node) + break; + + ret = sbi_dt_parse_state_node(state_node, &states[i]); + of_node_put(state_node); + + if (ret) + return ret; + + pr_debug("sbi-state %#x index %d\n", states[i], i); + } + if (i != state_count) { + ret = -ENODEV; + goto fail; + } + + /* Initialize optional data, used for the hierarchical topology. */ + ret = sbi_dt_cpu_init_topology(drv, data, state_count, cpu); + if (ret < 0) + return ret; + + /* Store states in the per-cpu struct. */ + data->states = states; + +fail: + of_node_put(cpu_node); + + return ret; +} + +static void sbi_cpuidle_deinit_cpu(int cpu) +{ + struct sbi_cpuidle_data *data = per_cpu_ptr(&sbi_cpuidle_data, cpu); + + dt_idle_detach_cpu(data->dev); + sbi_cpuidle_use_cpuhp = false; +} + +static int sbi_cpuidle_init_cpu(struct device *dev, int cpu) +{ + struct cpuidle_driver *drv; + unsigned int state_count = 0; + int ret = 0; + + drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL); + if (!drv) + return -ENOMEM; + + drv->name = "sbi_cpuidle"; + drv->owner = THIS_MODULE; + drv->cpumask = (struct cpumask *)cpumask_of(cpu); + + /* RISC-V architectural WFI to be represented as state index 0. */ + drv->states[0].enter = sbi_cpuidle_enter_state; + drv->states[0].exit_latency = 1; + drv->states[0].target_residency = 1; + drv->states[0].power_usage = UINT_MAX; + strcpy(drv->states[0].name, "WFI"); + strcpy(drv->states[0].desc, "RISC-V WFI"); + + /* + * If no DT idle states are detected (ret == 0) let the driver + * initialization fail accordingly since there is no reason to + * initialize the idle driver if only wfi is supported, the + * default archictectural back-end already executes wfi + * on idle entry. + */ + ret = dt_init_idle_driver(drv, sbi_cpuidle_state_match, 1); + if (ret <= 0) { + pr_debug("HART%ld: failed to parse DT idle states\n", + cpuid_to_hartid_map(cpu)); + return ret ? : -ENODEV; + } + state_count = ret + 1; /* Include WFI state as well */ + + /* Initialize idle states from DT. */ + ret = sbi_cpuidle_dt_init_states(dev, drv, cpu, state_count); + if (ret) { + pr_err("HART%ld: failed to init idle states\n", + cpuid_to_hartid_map(cpu)); + return ret; + } + + ret = cpuidle_register(drv, NULL); + if (ret) + goto deinit; + + cpuidle_cooling_register(drv); + + return 0; +deinit: + sbi_cpuidle_deinit_cpu(cpu); + return ret; +} + +static void sbi_cpuidle_domain_sync_state(struct device *dev) +{ + /* + * All devices have now been attached/probed to the PM domain + * topology, hence it's fine to allow domain states to be picked. + */ + sbi_cpuidle_pd_allow_domain_state = true; +} + +#ifdef CONFIG_DT_IDLE_GENPD + +static int sbi_cpuidle_pd_power_off(struct generic_pm_domain *pd) +{ + struct genpd_power_state *state = &pd->states[pd->state_idx]; + u32 *pd_state; + + if (!state->data) + return 0; + + if (!sbi_cpuidle_pd_allow_domain_state) + return -EBUSY; + + /* OSI mode is enabled, set the corresponding domain state. */ + pd_state = state->data; + sbi_set_domain_state(*pd_state); + + return 0; +} + +struct sbi_pd_provider { + struct list_head link; + struct device_node *node; +}; + +static LIST_HEAD(sbi_pd_providers); + +static int sbi_pd_init(struct device_node *np) +{ + struct generic_pm_domain *pd; + struct sbi_pd_provider *pd_provider; + struct dev_power_governor *pd_gov; + int ret = -ENOMEM; + + pd = dt_idle_pd_alloc(np, sbi_dt_parse_state_node); + if (!pd) + goto out; + + pd_provider = kzalloc(sizeof(*pd_provider), GFP_KERNEL); + if (!pd_provider) + goto free_pd; + + pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN; + + /* Allow power off when OSI is available. */ + if (sbi_cpuidle_use_osi) + pd->power_off = sbi_cpuidle_pd_power_off; + else + pd->flags |= GENPD_FLAG_ALWAYS_ON; + + /* Use governor for CPU PM domains if it has some states to manage. */ + pd_gov = pd->states ? &pm_domain_cpu_gov : NULL; + + ret = pm_genpd_init(pd, pd_gov, false); + if (ret) + goto free_pd_prov; + + ret = of_genpd_add_provider_simple(np, pd); + if (ret) + goto remove_pd; + + pd_provider->node = of_node_get(np); + list_add(&pd_provider->link, &sbi_pd_providers); + + pr_debug("init PM domain %s\n", pd->name); + return 0; + +remove_pd: + pm_genpd_remove(pd); +free_pd_prov: + kfree(pd_provider); +free_pd: + dt_idle_pd_free(pd); +out: + pr_err("failed to init PM domain ret=%d %pOF\n", ret, np); + return ret; +} + +static void sbi_pd_remove(void) +{ + struct sbi_pd_provider *pd_provider, *it; + struct generic_pm_domain *genpd; + + list_for_each_entry_safe(pd_provider, it, &sbi_pd_providers, link) { + of_genpd_del_provider(pd_provider->node); + + genpd = of_genpd_remove_last(pd_provider->node); + if (!IS_ERR(genpd)) + kfree(genpd); + + of_node_put(pd_provider->node); + list_del(&pd_provider->link); + kfree(pd_provider); + } +} + +static int sbi_genpd_probe(struct device_node *np) +{ + struct device_node *node; + int ret = 0, pd_count = 0; + + if (!np) + return -ENODEV; + + /* + * Parse child nodes for the "#power-domain-cells" property and + * initialize a genpd/genpd-of-provider pair when it's found. + */ + for_each_child_of_node(np, node) { + if (!of_find_property(node, "#power-domain-cells", NULL)) + continue; + + ret = sbi_pd_init(node); + if (ret) + goto put_node; + + pd_count++; + } + + /* Bail out if not using the hierarchical CPU topology. */ + if (!pd_count) + goto no_pd; + + /* Link genpd masters/subdomains to model the CPU topology. */ + ret = dt_idle_pd_init_topology(np); + if (ret) + goto remove_pd; + + return 0; + +put_node: + of_node_put(node); +remove_pd: + sbi_pd_remove(); + pr_err("failed to create CPU PM domains ret=%d\n", ret); +no_pd: + return ret; +} + +#else + +static inline int sbi_genpd_probe(struct device_node *np) +{ + return 0; +} + +#endif + +static int sbi_cpuidle_probe(struct platform_device *pdev) +{ + int cpu, ret; + struct cpuidle_driver *drv; + struct cpuidle_device *dev; + struct device_node *np, *pds_node; + + /* Detect OSI support based on CPU DT nodes */ + sbi_cpuidle_use_osi = true; + for_each_possible_cpu(cpu) { + np = of_cpu_device_node_get(cpu); + if (np && + of_find_property(np, "power-domains", NULL) && + of_find_property(np, "power-domain-names", NULL)) { + continue; + } else { + sbi_cpuidle_use_osi = false; + break; + } + } + + /* Populate generic power domains from DT nodes */ + pds_node = of_find_node_by_path("/cpus/power-domains"); + if (pds_node) { + ret = sbi_genpd_probe(pds_node); + of_node_put(pds_node); + if (ret) + return ret; + } + + /* Initialize CPU idle driver for each CPU */ + for_each_possible_cpu(cpu) { + ret = sbi_cpuidle_init_cpu(&pdev->dev, cpu); + if (ret) { + pr_debug("HART%ld: idle driver init failed\n", + cpuid_to_hartid_map(cpu)); + goto out_fail; + } + } + + /* Setup CPU hotplut notifiers */ + sbi_idle_init_cpuhp(); + + pr_info("idle driver registered for all CPUs\n"); + + return 0; + +out_fail: + while (--cpu >= 0) { + dev = per_cpu(cpuidle_devices, cpu); + drv = cpuidle_get_cpu_driver(dev); + cpuidle_unregister(drv); + sbi_cpuidle_deinit_cpu(cpu); + } + + return ret; +} + +static struct platform_driver sbi_cpuidle_driver = { + .probe = sbi_cpuidle_probe, + .driver = { + .name = "sbi-cpuidle", + .sync_state = sbi_cpuidle_domain_sync_state, + }, +}; + +static int __init sbi_cpuidle_init(void) +{ + int ret; + struct platform_device *pdev; + + /* + * The SBI HSM suspend function is only available when: + * 1) SBI version is 0.3 or higher + * 2) SBI HSM extension is available + */ + if ((sbi_spec_version < sbi_mk_version(0, 3)) || + sbi_probe_extension(SBI_EXT_HSM) <= 0) { + pr_info("HSM suspend not available\n"); + return 0; + } + + ret = platform_driver_register(&sbi_cpuidle_driver); + if (ret) + return ret; + + pdev = platform_device_register_simple("sbi-cpuidle", + -1, NULL, 0); + if (IS_ERR(pdev)) { + platform_driver_unregister(&sbi_cpuidle_driver); + return PTR_ERR(pdev); + } + + return 0; +} +device_initcall(sbi_cpuidle_init); diff --git a/drivers/cpuidle/cpuidle-tegra.c b/drivers/cpuidle/cpuidle-tegra.c new file mode 100644 index 000000000000..9845629aeb6d --- /dev/null +++ b/drivers/cpuidle/cpuidle-tegra.c @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CPU idle driver for Tegra CPUs + * + * Copyright (c) 2010-2013, NVIDIA Corporation. + * Copyright (c) 2011 Google, Inc. + * Author: Colin Cross <ccross@android.com> + * Gary King <gking@nvidia.com> + * + * Rework for 3.3 by Peter De Schrijver <pdeschrijver@nvidia.com> + * + * Tegra20/124 driver unification by Dmitry Osipenko <digetx@gmail.com> + */ + +#define pr_fmt(fmt) "tegra-cpuidle: " fmt + +#include <linux/atomic.h> +#include <linux/cpuidle.h> +#include <linux/cpumask.h> +#include <linux/cpu_pm.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/platform_device.h> +#include <linux/types.h> + +#include <linux/clk/tegra.h> +#include <linux/firmware/trusted_foundations.h> + +#include <soc/tegra/cpuidle.h> +#include <soc/tegra/flowctrl.h> +#include <soc/tegra/fuse.h> +#include <soc/tegra/irq.h> +#include <soc/tegra/pm.h> +#include <soc/tegra/pmc.h> + +#include <asm/cpuidle.h> +#include <asm/firmware.h> +#include <asm/smp_plat.h> +#include <asm/suspend.h> + +enum tegra_state { + TEGRA_C1, + TEGRA_C7, + TEGRA_CC6, + TEGRA_STATE_COUNT, +}; + +static atomic_t tegra_idle_barrier; +static atomic_t tegra_abort_flag; + +static void tegra_cpuidle_report_cpus_state(void) +{ + unsigned long cpu, lcpu, csr; + + for_each_cpu(lcpu, cpu_possible_mask) { + cpu = cpu_logical_map(lcpu); + csr = flowctrl_read_cpu_csr(cpu); + + pr_err("cpu%lu: online=%d flowctrl_csr=0x%08lx\n", + cpu, cpu_online(lcpu), csr); + } +} + +static int tegra_cpuidle_wait_for_secondary_cpus_parking(void) +{ + unsigned int retries = 3; + + while (retries--) { + unsigned int delay_us = 10; + unsigned int timeout_us = 500 * 1000 / delay_us; + + /* + * The primary CPU0 core shall wait for the secondaries + * shutdown in order to power-off CPU's cluster safely. + * The timeout value depends on the current CPU frequency, + * it takes about 40-150us in average and over 1000us in + * a worst case scenario. + */ + do { + if (tegra_cpu_rail_off_ready()) + return 0; + + udelay(delay_us); + + } while (timeout_us--); + + pr_err("secondary CPU taking too long to park\n"); + + tegra_cpuidle_report_cpus_state(); + } + + pr_err("timed out waiting secondaries to park\n"); + + return -ETIMEDOUT; +} + +static void tegra_cpuidle_unpark_secondary_cpus(void) +{ + unsigned int cpu, lcpu; + + for_each_cpu(lcpu, cpu_online_mask) { + cpu = cpu_logical_map(lcpu); + + if (cpu > 0) { + tegra_enable_cpu_clock(cpu); + tegra_cpu_out_of_reset(cpu); + flowctrl_write_cpu_halt(cpu, 0); + } + } +} + +static int tegra_cpuidle_cc6_enter(unsigned int cpu) +{ + int ret; + + if (cpu > 0) { + ret = cpu_suspend(cpu, tegra_pm_park_secondary_cpu); + } else { + ret = tegra_cpuidle_wait_for_secondary_cpus_parking(); + if (!ret) + ret = tegra_pm_enter_lp2(); + + tegra_cpuidle_unpark_secondary_cpus(); + } + + return ret; +} + +static int tegra_cpuidle_c7_enter(void) +{ + int err; + + err = call_firmware_op(prepare_idle, TF_PM_MODE_LP2_NOFLUSH_L2); + if (err && err != -ENOSYS) + return err; + + return cpu_suspend(0, tegra30_pm_secondary_cpu_suspend); +} + +static int tegra_cpuidle_coupled_barrier(struct cpuidle_device *dev) +{ + if (tegra_pending_sgi()) { + /* + * CPU got local interrupt that will be lost after GIC's + * shutdown because GIC driver doesn't save/restore the + * pending SGI state across CPU cluster PM. Abort and retry + * next time. + */ + atomic_set(&tegra_abort_flag, 1); + } + + cpuidle_coupled_parallel_barrier(dev, &tegra_idle_barrier); + + if (atomic_read(&tegra_abort_flag)) { + cpuidle_coupled_parallel_barrier(dev, &tegra_idle_barrier); + atomic_set(&tegra_abort_flag, 0); + return -EINTR; + } + + return 0; +} + +static int tegra_cpuidle_state_enter(struct cpuidle_device *dev, + int index, unsigned int cpu) +{ + int err; + + /* + * CC6 state is the "CPU cluster power-off" state. In order to + * enter this state, at first the secondary CPU cores need to be + * parked into offline mode, then the last CPU should clean out + * remaining dirty cache lines into DRAM and trigger Flow Controller + * logic that turns off the cluster's power domain (which includes + * CPU cores, GIC and L2 cache). + */ + if (index == TEGRA_CC6) { + err = tegra_cpuidle_coupled_barrier(dev); + if (err) + return err; + } + + local_fiq_disable(); + RCU_NONIDLE(tegra_pm_set_cpu_in_lp2()); + cpu_pm_enter(); + + switch (index) { + case TEGRA_C7: + err = tegra_cpuidle_c7_enter(); + break; + + case TEGRA_CC6: + err = tegra_cpuidle_cc6_enter(cpu); + break; + + default: + err = -EINVAL; + break; + } + + cpu_pm_exit(); + RCU_NONIDLE(tegra_pm_clear_cpu_in_lp2()); + local_fiq_enable(); + + return err ?: index; +} + +static int tegra_cpuidle_adjust_state_index(int index, unsigned int cpu) +{ + /* + * On Tegra30 CPU0 can't be power-gated separately from secondary + * cores because it gates the whole CPU cluster. + */ + if (cpu > 0 || index != TEGRA_C7 || tegra_get_chip_id() != TEGRA30) + return index; + + /* put CPU0 into C1 if C7 is requested and secondaries are online */ + if (!IS_ENABLED(CONFIG_PM_SLEEP) || num_online_cpus() > 1) + index = TEGRA_C1; + else + index = TEGRA_CC6; + + return index; +} + +static int tegra_cpuidle_enter(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + unsigned int cpu = cpu_logical_map(dev->cpu); + int ret; + + index = tegra_cpuidle_adjust_state_index(index, cpu); + if (dev->states_usage[index].disable) + return -1; + + if (index == TEGRA_C1) + ret = arm_cpuidle_simple_enter(dev, drv, index); + else + ret = tegra_cpuidle_state_enter(dev, index, cpu); + + if (ret < 0) { + if (ret != -EINTR || index != TEGRA_CC6) + pr_err_once("failed to enter state %d err: %d\n", + index, ret); + index = -1; + } else { + index = ret; + } + + return index; +} + +static int tegra114_enter_s2idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + tegra_cpuidle_enter(dev, drv, index); + + return 0; +} + +/* + * The previous versions of Tegra CPUIDLE driver used a different "legacy" + * terminology for naming of the idling states, while this driver uses the + * new terminology. + * + * Mapping of the old terms into the new ones: + * + * Old | New + * --------- + * LP3 | C1 (CPU core clock gating) + * LP2 | C7 (CPU core power gating) + * LP2 | CC6 (CPU cluster power gating) + * + * Note that that the older CPUIDLE driver versions didn't explicitly + * differentiate the LP2 states because these states either used the same + * code path or because CC6 wasn't supported. + */ +static struct cpuidle_driver tegra_idle_driver = { + .name = "tegra_idle", + .states = { + [TEGRA_C1] = ARM_CPUIDLE_WFI_STATE_PWR(600), + [TEGRA_C7] = { + .enter = tegra_cpuidle_enter, + .exit_latency = 2000, + .target_residency = 2200, + .power_usage = 100, + .flags = CPUIDLE_FLAG_TIMER_STOP, + .name = "C7", + .desc = "CPU core powered off", + }, + [TEGRA_CC6] = { + .enter = tegra_cpuidle_enter, + .exit_latency = 5000, + .target_residency = 10000, + .power_usage = 0, + .flags = CPUIDLE_FLAG_TIMER_STOP | + CPUIDLE_FLAG_COUPLED, + .name = "CC6", + .desc = "CPU cluster powered off", + }, + }, + .state_count = TEGRA_STATE_COUNT, + .safe_state_index = TEGRA_C1, +}; + +static inline void tegra_cpuidle_disable_state(enum tegra_state state) +{ + cpuidle_driver_state_disabled(&tegra_idle_driver, state, true); +} + +/* + * Tegra20 HW appears to have a bug such that PCIe device interrupts, whether + * they are legacy IRQs or MSI, are lost when CC6 is enabled. To work around + * this, simply disable CC6 if the PCI driver and DT node are both enabled. + */ +void tegra_cpuidle_pcie_irqs_in_use(void) +{ + struct cpuidle_state *state_cc6 = &tegra_idle_driver.states[TEGRA_CC6]; + + if ((state_cc6->flags & CPUIDLE_FLAG_UNUSABLE) || + tegra_get_chip_id() != TEGRA20) + return; + + pr_info("disabling CC6 state, since PCIe IRQs are in use\n"); + tegra_cpuidle_disable_state(TEGRA_CC6); +} + +static void tegra_cpuidle_setup_tegra114_c7_state(void) +{ + struct cpuidle_state *s = &tegra_idle_driver.states[TEGRA_C7]; + + s->enter_s2idle = tegra114_enter_s2idle; + s->target_residency = 1000; + s->exit_latency = 500; +} + +static int tegra_cpuidle_probe(struct platform_device *pdev) +{ + if (tegra_pmc_get_suspend_mode() == TEGRA_SUSPEND_NOT_READY) + return -EPROBE_DEFER; + + /* LP2 could be disabled in device-tree */ + if (tegra_pmc_get_suspend_mode() < TEGRA_SUSPEND_LP2) + tegra_cpuidle_disable_state(TEGRA_CC6); + + /* + * Required suspend-resume functionality, which is provided by the + * Tegra-arch core and PMC driver, is unavailable if PM-sleep option + * is disabled. + */ + if (!IS_ENABLED(CONFIG_PM_SLEEP)) { + tegra_cpuidle_disable_state(TEGRA_C7); + tegra_cpuidle_disable_state(TEGRA_CC6); + } + + /* + * Generic WFI state (also known as C1 or LP3) and the coupled CPU + * cluster power-off (CC6 or LP2) states are common for all Tegra SoCs. + */ + switch (tegra_get_chip_id()) { + case TEGRA20: + /* Tegra20 isn't capable to power-off individual CPU cores */ + tegra_cpuidle_disable_state(TEGRA_C7); + break; + + case TEGRA30: + break; + + case TEGRA114: + case TEGRA124: + tegra_cpuidle_setup_tegra114_c7_state(); + + /* coupled CC6 (LP2) state isn't implemented yet */ + tegra_cpuidle_disable_state(TEGRA_CC6); + break; + + default: + return -EINVAL; + } + + return cpuidle_register(&tegra_idle_driver, cpu_possible_mask); +} + +static struct platform_driver tegra_cpuidle_driver = { + .probe = tegra_cpuidle_probe, + .driver = { + .name = "tegra-cpuidle", + }, +}; +builtin_platform_driver(tegra_cpuidle_driver); diff --git a/drivers/cpuidle/cpuidle-ux500.c b/drivers/cpuidle/cpuidle-ux500.c index a2d34be17a09..f7d778580e9b 100644 --- a/drivers/cpuidle/cpuidle-ux500.c +++ b/drivers/cpuidle/cpuidle-ux500.c @@ -117,7 +117,7 @@ static int dbx500_cpuidle_probe(struct platform_device *pdev) static struct platform_driver dbx500_cpuidle_plat_driver = { .driver = { - .name = "cpuidle-dbx500", + .name = "db8500-cpuidle", }, .probe = dbx500_cpuidle_probe, }; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index de81298051b3..6eceb1988243 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -8,6 +8,7 @@ * This code is licenced under the GPL. */ +#include "linux/percpu-defs.h" #include <linux/clockchips.h> #include <linux/kernel.h> #include <linux/mutex.h> @@ -22,6 +23,8 @@ #include <linux/module.h> #include <linux/suspend.h> #include <linux/tick.h> +#include <linux/mmu_context.h> +#include <linux/context_tracking.h> #include <trace/events/power.h> #include "cpuidle.h" @@ -137,29 +140,25 @@ static void enter_s2idle_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { ktime_t time_start, time_end; + struct cpuidle_state *target_state = &drv->states[index]; time_start = ns_to_ktime(local_clock()); - /* - * trace_suspend_resume() called by tick_freeze() for the last CPU - * executing it contains RCU usage regarded as invalid in the idle - * context, so tell RCU about that. - */ - RCU_NONIDLE(tick_freeze()); + tick_freeze(); /* * The state used here cannot be a "coupled" one, because the "coupled" * cpuidle mechanism enables interrupts and doing that with timekeeping * suspended is generally unsafe. */ stop_critical_timings(); - drv->states[index].enter_s2idle(dev, drv, index); - WARN_ON(!irqs_disabled()); - /* - * timekeeping_resume() that will be called by tick_unfreeze() for the - * first CPU executing it calls functions containing RCU read-side - * critical sections, so tell RCU about that. - */ - RCU_NONIDLE(tick_unfreeze()); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) + ct_idle_enter(); + target_state->enter_s2idle(dev, drv, index); + if (WARN_ON_ONCE(!irqs_disabled())) + local_irq_disable(); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) + ct_idle_exit(); + tick_unfreeze(); start_critical_timings(); time_end = ns_to_ktime(local_clock()); @@ -186,9 +185,10 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) * be frozen safely. */ index = find_deepest_state(drv, dev, U64_MAX, 0, true); - if (index > 0) + if (index > 0) { enter_s2idle_proper(drv, dev, index); - + local_irq_enable(); + } return index; } #endif /* CONFIG_SUSPEND */ @@ -224,19 +224,26 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, broadcast = false; } + if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED) + leave_mm(dev->cpu); + /* Take note of the planned idle state. */ sched_idle_set_state(target_state); - trace_cpu_idle_rcuidle(index, dev->cpu); + trace_cpu_idle(index, dev->cpu); time_start = ns_to_ktime(local_clock()); stop_critical_timings(); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) + ct_idle_enter(); entered_state = target_state->enter(dev, drv, index); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) + ct_idle_exit(); start_critical_timings(); sched_clock_idle_wakeup_event(); time_end = ns_to_ktime(local_clock()); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu); /* The cpu is no longer idle or about to enter idle. */ sched_idle_set_state(NULL); @@ -273,6 +280,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, /* Shallower states are enabled, so update. */ dev->states_usage[entered_state].above++; + trace_cpu_idle_miss(dev->cpu, entered_state, false); break; } } else if (diff > delay) { @@ -284,14 +292,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * Update if a deeper state would have been a * better match for the observed idle duration. */ - if (diff - delay >= drv->states[i].target_residency_ns) + if (diff - delay >= drv->states[i].target_residency_ns) { dev->states_usage[entered_state].below++; + trace_cpu_idle_miss(dev->cpu, entered_state, true); + } break; } } } else { dev->last_residency_ns = 0; + dev->states_usage[index].rejected++; } return entered_state; @@ -362,6 +373,19 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index) cpuidle_curr_governor->reflect(dev, index); } +/* + * Min polling interval of 10usec is a guess. It is assuming that + * for most users, the time for a single ping-pong workload like + * perf bench pipe would generally complete within 10usec but + * this is hardware dependant. Actual time can be estimated with + * + * perf bench sched pipe -l 10000 + * + * Run multiple times to avoid cpufreq effects. + */ +#define CPUIDLE_POLL_MIN 10000 +#define CPUIDLE_POLL_MAX (TICK_NSEC / 16) + /** * cpuidle_poll_time - return amount of time to poll for, * governors can override dev->poll_limit_ns if necessary @@ -376,15 +400,23 @@ u64 cpuidle_poll_time(struct cpuidle_driver *drv, int i; u64 limit_ns; + BUILD_BUG_ON(CPUIDLE_POLL_MIN > CPUIDLE_POLL_MAX); + if (dev->poll_limit_ns) return dev->poll_limit_ns; - limit_ns = TICK_NSEC; + limit_ns = CPUIDLE_POLL_MAX; for (i = 1; i < drv->state_count; i++) { + u64 state_limit; + if (dev->states_usage[i].disable) continue; - limit_ns = drv->states[i].target_residency_ns; + state_limit = drv->states[i].target_residency_ns; + if (state_limit < CPUIDLE_POLL_MIN) + continue; + + limit_ns = min_t(u64, state_limit, CPUIDLE_POLL_MAX); break; } @@ -736,53 +768,15 @@ int cpuidle_register(struct cpuidle_driver *drv, } EXPORT_SYMBOL_GPL(cpuidle_register); -#ifdef CONFIG_SMP - -/* - * This function gets called when a part of the kernel has a new latency - * requirement. This means we need to get all processors out of their C-state, - * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that - * wakes them all right up. - */ -static int cpuidle_latency_notify(struct notifier_block *b, - unsigned long l, void *v) -{ - wake_up_all_idle_cpus(); - return NOTIFY_OK; -} - -static struct notifier_block cpuidle_latency_notifier = { - .notifier_call = cpuidle_latency_notify, -}; - -static inline void latency_notifier_init(struct notifier_block *n) -{ - pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, n); -} - -#else /* CONFIG_SMP */ - -#define latency_notifier_init(x) do { } while (0) - -#endif /* CONFIG_SMP */ - /** * cpuidle_init - core initializer */ static int __init cpuidle_init(void) { - int ret; - if (cpuidle_disabled()) return -ENODEV; - ret = cpuidle_add_interface(cpu_subsys.dev_root); - if (ret) - return ret; - - latency_notifier_init(&cpuidle_latency_notifier); - - return 0; + return cpuidle_add_interface(cpu_subsys.dev_root); } module_param(off, int, 0444); diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 4070e573bf43..f70aa17e2a8e 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -181,9 +181,13 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) */ if (s->target_residency > 0) s->target_residency_ns = s->target_residency * NSEC_PER_USEC; + else if (s->target_residency_ns < 0) + s->target_residency_ns = 0; if (s->exit_latency > 0) s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC; + else if (s->exit_latency_ns < 0) + s->exit_latency_ns = 0; } } diff --git a/drivers/cpuidle/dt_idle_genpd.c b/drivers/cpuidle/dt_idle_genpd.c new file mode 100644 index 000000000000..b37165514d4e --- /dev/null +++ b/drivers/cpuidle/dt_idle_genpd.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * PM domains for CPUs via genpd. + * + * Copyright (C) 2019 Linaro Ltd. + * Author: Ulf Hansson <ulf.hansson@linaro.org> + * + * Copyright (c) 2021 Western Digital Corporation or its affiliates. + * Copyright (c) 2022 Ventana Micro Systems Inc. + */ + +#define pr_fmt(fmt) "dt-idle-genpd: " fmt + +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/pm_domain.h> +#include <linux/pm_runtime.h> +#include <linux/slab.h> +#include <linux/string.h> + +#include "dt_idle_genpd.h" + +static int pd_parse_state_nodes( + int (*parse_state)(struct device_node *, u32 *), + struct genpd_power_state *states, int state_count) +{ + int i, ret; + u32 state, *state_buf; + + for (i = 0; i < state_count; i++) { + ret = parse_state(to_of_node(states[i].fwnode), &state); + if (ret) + goto free_state; + + state_buf = kmalloc(sizeof(u32), GFP_KERNEL); + if (!state_buf) { + ret = -ENOMEM; + goto free_state; + } + *state_buf = state; + states[i].data = state_buf; + } + + return 0; + +free_state: + i--; + for (; i >= 0; i--) + kfree(states[i].data); + return ret; +} + +static int pd_parse_states(struct device_node *np, + int (*parse_state)(struct device_node *, u32 *), + struct genpd_power_state **states, + int *state_count) +{ + int ret; + + /* Parse the domain idle states. */ + ret = of_genpd_parse_idle_states(np, states, state_count); + if (ret) + return ret; + + /* Fill out the dt specifics for each found state. */ + ret = pd_parse_state_nodes(parse_state, *states, *state_count); + if (ret) + kfree(*states); + + return ret; +} + +static void pd_free_states(struct genpd_power_state *states, + unsigned int state_count) +{ + int i; + + for (i = 0; i < state_count; i++) + kfree(states[i].data); + kfree(states); +} + +void dt_idle_pd_free(struct generic_pm_domain *pd) +{ + pd_free_states(pd->states, pd->state_count); + kfree(pd->name); + kfree(pd); +} + +struct generic_pm_domain *dt_idle_pd_alloc(struct device_node *np, + int (*parse_state)(struct device_node *, u32 *)) +{ + struct generic_pm_domain *pd; + struct genpd_power_state *states = NULL; + int ret, state_count = 0; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + goto out; + + pd->name = kasprintf(GFP_KERNEL, "%pOF", np); + if (!pd->name) + goto free_pd; + + /* + * Parse the domain idle states and let genpd manage the state selection + * for those being compatible with "domain-idle-state". + */ + ret = pd_parse_states(np, parse_state, &states, &state_count); + if (ret) + goto free_name; + + pd->free_states = pd_free_states; + pd->name = kbasename(pd->name); + pd->states = states; + pd->state_count = state_count; + + pr_debug("alloc PM domain %s\n", pd->name); + return pd; + +free_name: + kfree(pd->name); +free_pd: + kfree(pd); +out: + pr_err("failed to alloc PM domain %pOF\n", np); + return NULL; +} + +int dt_idle_pd_init_topology(struct device_node *np) +{ + struct device_node *node; + struct of_phandle_args child, parent; + int ret; + + for_each_child_of_node(np, node) { + if (of_parse_phandle_with_args(node, "power-domains", + "#power-domain-cells", 0, &parent)) + continue; + + child.np = node; + child.args_count = 0; + ret = of_genpd_add_subdomain(&parent, &child); + of_node_put(parent.np); + if (ret) { + of_node_put(node); + return ret; + } + } + + return 0; +} + +struct device *dt_idle_attach_cpu(int cpu, const char *name) +{ + struct device *dev; + + dev = dev_pm_domain_attach_by_name(get_cpu_device(cpu), name); + if (IS_ERR_OR_NULL(dev)) + return dev; + + pm_runtime_irq_safe(dev); + if (cpu_online(cpu)) + pm_runtime_get_sync(dev); + + dev_pm_syscore_device(dev, true); + + return dev; +} + +void dt_idle_detach_cpu(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev)) + return; + + dev_pm_domain_detach(dev, false); +} diff --git a/drivers/cpuidle/dt_idle_genpd.h b/drivers/cpuidle/dt_idle_genpd.h new file mode 100644 index 000000000000..a95483d08a02 --- /dev/null +++ b/drivers/cpuidle/dt_idle_genpd.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __DT_IDLE_GENPD +#define __DT_IDLE_GENPD + +struct device_node; +struct generic_pm_domain; + +#ifdef CONFIG_DT_IDLE_GENPD + +void dt_idle_pd_free(struct generic_pm_domain *pd); + +struct generic_pm_domain *dt_idle_pd_alloc(struct device_node *np, + int (*parse_state)(struct device_node *, u32 *)); + +int dt_idle_pd_init_topology(struct device_node *np); + +struct device *dt_idle_attach_cpu(int cpu, const char *name); + +void dt_idle_detach_cpu(struct device *dev); + +#else + +static inline void dt_idle_pd_free(struct generic_pm_domain *pd) +{ +} + +static inline struct generic_pm_domain *dt_idle_pd_alloc( + struct device_node *np, + int (*parse_state)(struct device_node *, u32 *)) +{ + return NULL; +} + +static inline int dt_idle_pd_init_topology(struct device_node *np) +{ + return 0; +} + +static inline struct device *dt_idle_attach_cpu(int cpu, const char *name) +{ + return NULL; +} + +static inline void dt_idle_detach_cpu(struct device *dev) +{ +} + +#endif + +#endif diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index e48271e117a3..0d0f9751ff8f 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -63,12 +63,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov) cpuidle_curr_governor = gov; - if (gov) { - list_for_each_entry(dev, &cpuidle_detected_devices, device_list) - cpuidle_enable_device(dev); - cpuidle_install_idle_handler(); - printk(KERN_INFO "cpuidle: using governor %s\n", gov->name); - } + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + cpuidle_enable_device(dev); + + cpuidle_install_idle_handler(); + pr_info("cpuidle: using governor %s\n", gov->name); return 0; } @@ -109,9 +108,9 @@ int cpuidle_register_governor(struct cpuidle_governor *gov) */ s64 cpuidle_governor_latency_req(unsigned int cpu) { - int global_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); struct device *device = get_cpu_device(cpu); int device_req = dev_pm_qos_raw_resume_latency(device); + int global_req = cpu_latency_qos_limit(); if (device_req > global_req) device_req = global_req; diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c index cb2a96eafc02..1dff3a52917d 100644 --- a/drivers/cpuidle/governors/haltpoll.c +++ b/drivers/cpuidle/governors/haltpoll.c @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/kvm_para.h> +#include <trace/events/power.h> static unsigned int guest_halt_poll_ns __read_mostly = 200000; module_param(guest_halt_poll_ns, uint, 0644); @@ -90,6 +91,7 @@ static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns) if (val > guest_halt_poll_ns) val = guest_halt_poll_ns; + trace_guest_halt_poll_ns_grow(val, dev->poll_limit_ns); dev->poll_limit_ns = val; } else if (block_ns > guest_halt_poll_ns && guest_halt_poll_allow_shrink) { @@ -100,6 +102,7 @@ static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns) val = 0; else val /= shrink; + trace_guest_halt_poll_ns_shrink(val, dev->poll_limit_ns); dev->poll_limit_ns = val; } } diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index b0a7ad566081..c4922684f305 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -34,7 +34,7 @@ * 1) Energy break even point * 2) Performance impact * 3) Latency tolerance (from pmqos infrastructure) - * These these three factors are treated independently. + * These three factors are treated independently. * * Energy break even point * ----------------------- @@ -117,7 +117,7 @@ struct menu_device { int interval_ptr; }; -static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters) +static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters) { int bucket = 0; @@ -150,7 +150,7 @@ static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters) * to be, the higher this multiplier, and thus the higher * the barrier to go to an expensive C state. */ -static inline int performance_multiplier(unsigned long nr_iowaiters) +static inline int performance_multiplier(unsigned int nr_iowaiters) { /* for IO wait tasks (per cpu!) we add 10x each */ return 1 + 10 * nr_iowaiters; @@ -270,8 +270,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, unsigned int predicted_us; u64 predicted_ns; u64 interactivity_req; - unsigned long nr_iowaiters; - ktime_t delta_next; + unsigned int nr_iowaiters; + ktime_t delta, delta_tick; int i, idx; if (data->needs_update) { @@ -280,7 +280,12 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* determine the expected residency time, round up */ - data->next_timer_ns = tick_nohz_get_sleep_length(&delta_next); + delta = tick_nohz_get_sleep_length(&delta_tick); + if (unlikely(delta < 0)) { + delta = 0; + delta_tick = 0; + } + data->next_timer_ns = delta; nr_iowaiters = nr_iowait_cpu(dev->cpu); data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); @@ -318,7 +323,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * state selection. */ if (predicted_ns < TICK_NSEC) - predicted_ns = delta_next; + predicted_ns = data->next_timer_ns; } else { /* * Use the performance multiplier and the user-configurable @@ -377,7 +382,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * stuck in the shallow one for too long. */ if (drv->states[idx].target_residency_ns < TICK_NSEC && - s->target_residency_ns <= delta_next) + s->target_residency_ns <= delta_tick) idx = i; return idx; @@ -399,7 +404,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { *stop_tick = false; - if (idx > 0 && drv->states[idx].target_residency_ns > delta_next) { + if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) { /* * The tick is not going to be stopped and the target * residency of the state to be returned is not within @@ -411,7 +416,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, continue; idx = i; - if (drv->states[i].target_residency_ns <= delta_next) + if (drv->states[i].target_residency_ns <= delta_tick) break; } } diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 6deaaf5f05b5..d9262db79cae 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -2,47 +2,103 @@ /* * Timer events oriented CPU idle governor * - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + */ + +/** + * DOC: teo-description * * The idea of this governor is based on the observation that on many systems * timer events are two or more orders of magnitude more frequent than any - * other interrupts, so they are likely to be the most significant source of CPU + * other interrupts, so they are likely to be the most significant cause of CPU * wakeups from idle states. Moreover, information about what happened in the * (relatively recent) past can be used to estimate whether or not the deepest - * idle state with target residency within the time to the closest timer is - * likely to be suitable for the upcoming idle time of the CPU and, if not, then - * which of the shallower idle states to choose. + * idle state with target residency within the (known) time till the closest + * timer event, referred to as the sleep length, is likely to be suitable for + * the upcoming CPU idle period and, if not, then which of the shallower idle + * states to choose instead of it. + * + * Of course, non-timer wakeup sources are more important in some use cases + * which can be covered by taking a few most recent idle time intervals of the + * CPU into account. However, even in that context it is not necessary to + * consider idle duration values greater than the sleep length, because the + * closest timer will ultimately wake up the CPU anyway unless it is woken up + * earlier. + * + * Thus this governor estimates whether or not the prospective idle duration of + * a CPU is likely to be significantly shorter than the sleep length and selects + * an idle state for it accordingly. + * + * The computations carried out by this governor are based on using bins whose + * boundaries are aligned with the target residency parameter values of the CPU + * idle states provided by the %CPUIdle driver in the ascending order. That is, + * the first bin spans from 0 up to, but not including, the target residency of + * the second idle state (idle state 1), the second bin spans from the target + * residency of idle state 1 up to, but not including, the target residency of + * idle state 2, the third bin spans from the target residency of idle state 2 + * up to, but not including, the target residency of idle state 3 and so on. + * The last bin spans from the target residency of the deepest idle state + * supplied by the driver to infinity. + * + * Two metrics called "hits" and "intercepts" are associated with each bin. + * They are updated every time before selecting an idle state for the given CPU + * in accordance with what happened last time. + * + * The "hits" metric reflects the relative frequency of situations in which the + * sleep length and the idle duration measured after CPU wakeup fall into the + * same bin (that is, the CPU appears to wake up "on time" relative to the sleep + * length). In turn, the "intercepts" metric reflects the relative frequency of + * situations in which the measured idle duration is so much shorter than the + * sleep length that the bin it falls into corresponds to an idle state + * shallower than the one whose bin is fallen into by the sleep length (these + * situations are referred to as "intercepts" below). + * + * In addition to the metrics described above, the governor counts recent + * intercepts (that is, intercepts that have occurred during the last + * %NR_RECENT invocations of it for the given CPU) for each bin. * - * Of course, non-timer wakeup sources are more important in some use cases and - * they can be covered by taking a few most recent idle time intervals of the - * CPU into account. However, even in that case it is not necessary to consider - * idle duration values greater than the time till the closest timer, as the - * patterns that they may belong to produce average values close enough to - * the time till the closest timer (sleep length) anyway. + * In order to select an idle state for a CPU, the governor takes the following + * steps (modulo the possible latency constraint that must be taken into account + * too): * - * Thus this governor estimates whether or not the upcoming idle time of the CPU - * is likely to be significantly shorter than the sleep length and selects an - * idle state for it in accordance with that, as follows: + * 1. Find the deepest CPU idle state whose target residency does not exceed + * the current sleep length (the candidate idle state) and compute 3 sums as + * follows: * - * - Find an idle state on the basis of the sleep length and state statistics - * collected over time: + * - The sum of the "hits" and "intercepts" metrics for the candidate state + * and all of the deeper idle states (it represents the cases in which the + * CPU was idle long enough to avoid being intercepted if the sleep length + * had been equal to the current one). * - * o Find the deepest idle state whose target residency is less than or equal - * to the sleep length. + * - The sum of the "intercepts" metrics for all of the idle states shallower + * than the candidate one (it represents the cases in which the CPU was not + * idle long enough to avoid being intercepted if the sleep length had been + * equal to the current one). * - * o Select it if it matched both the sleep length and the observed idle - * duration in the past more often than it matched the sleep length alone - * (i.e. the observed idle duration was significantly shorter than the sleep - * length matched by it). + * - The sum of the numbers of recent intercepts for all of the idle states + * shallower than the candidate one. * - * o Otherwise, select the shallower state with the greatest matched "early" - * wakeups metric. + * 2. If the second sum is greater than the first one or the third sum is + * greater than %NR_RECENT / 2, the CPU is likely to wake up early, so look + * for an alternative idle state to select. * - * - If the majority of the most recent idle duration values are below the - * target residency of the idle state selected so far, use those values to - * compute the new expected idle duration and find an idle state matching it - * (which has to be shallower than the one selected so far). + * - Traverse the idle states shallower than the candidate one in the + * descending order. + * + * - For each of them compute the sum of the "intercepts" metrics and the sum + * of the numbers of recent intercepts over all of the idle states between + * it and the candidate one (including the former and excluding the + * latter). + * + * - If each of these sums that needs to be taken into account (because the + * check related to it has indicated that the CPU is likely to wake up + * early) is greater than a half of the corresponding sum computed in step + * 1 (which means that the target residency of the state in question had + * not exceeded the idle duration in over a half of the relevant cases), + * select the given idle state instead of the candidate one. + * + * 3. By default, select the candidate state. */ #include <linux/cpuidle.h> @@ -60,64 +116,51 @@ /* * Number of the most recent idle duration values to take into consideration for - * the detection of wakeup patterns. + * the detection of recent early wakeup patterns. */ -#define INTERVALS 8 +#define NR_RECENT 9 /** - * struct teo_idle_state - Idle state data used by the TEO cpuidle governor. - * @early_hits: "Early" CPU wakeups "matching" this state. - * @hits: "On time" CPU wakeups "matching" this state. - * @misses: CPU wakeups "missing" this state. - * - * A CPU wakeup is "matched" by a given idle state if the idle duration measured - * after the wakeup is between the target residency of that state and the target - * residency of the next one (or if this is the deepest available idle state, it - * "matches" a CPU wakeup when the measured idle duration is at least equal to - * its target residency). - * - * Also, from the TEO governor perspective, a CPU wakeup from idle is "early" if - * it occurs significantly earlier than the closest expected timer event (that - * is, early enough to match an idle state shallower than the one matching the - * time till the closest timer event). Otherwise, the wakeup is "on time", or - * it is a "hit". - * - * A "miss" occurs when the given state doesn't match the wakeup, but it matches - * the time till the closest timer event used for idle state selection. + * struct teo_bin - Metrics used by the TEO cpuidle governor. + * @intercepts: The "intercepts" metric. + * @hits: The "hits" metric. + * @recent: The number of recent "intercepts". */ -struct teo_idle_state { - unsigned int early_hits; +struct teo_bin { + unsigned int intercepts; unsigned int hits; - unsigned int misses; + unsigned int recent; }; /** * struct teo_cpu - CPU data used by the TEO cpuidle governor. * @time_span_ns: Time between idle state selection and post-wakeup update. * @sleep_length_ns: Time till the closest timer event (at the selection time). - * @states: Idle states data corresponding to this CPU. - * @interval_idx: Index of the most recent saved idle interval. - * @intervals: Saved idle duration values. + * @state_bins: Idle state data bins for this CPU. + * @total: Grand total of the "intercepts" and "hits" mertics for all bins. + * @next_recent_idx: Index of the next @recent_idx entry to update. + * @recent_idx: Indices of bins corresponding to recent "intercepts". */ struct teo_cpu { - u64 time_span_ns; - u64 sleep_length_ns; - struct teo_idle_state states[CPUIDLE_STATE_MAX]; - int interval_idx; - u64 intervals[INTERVALS]; + s64 time_span_ns; + s64 sleep_length_ns; + struct teo_bin state_bins[CPUIDLE_STATE_MAX]; + unsigned int total; + int next_recent_idx; + int recent_idx[NR_RECENT]; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); /** - * teo_update - Update CPU data after wakeup. + * teo_update - Update CPU metrics after wakeup. * @drv: cpuidle driver containing state data. * @dev: Target CPU. */ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); - int i, idx_hit = -1, idx_timer = -1; + int i, idx_timer = 0, idx_duration = 0; u64 measured_ns; if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { @@ -150,56 +193,52 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) measured_ns /= 2; } + cpu_data->total = 0; + /* - * Decay the "early hits" metric for all of the states and find the - * states matching the sleep length and the measured idle duration. + * Decay the "hits" and "intercepts" metrics for all of the bins and + * find the bins that the sleep length and the measured idle duration + * fall into. */ for (i = 0; i < drv->state_count; i++) { - unsigned int early_hits = cpu_data->states[i].early_hits; + s64 target_residency_ns = drv->states[i].target_residency_ns; + struct teo_bin *bin = &cpu_data->state_bins[i]; + + bin->hits -= bin->hits >> DECAY_SHIFT; + bin->intercepts -= bin->intercepts >> DECAY_SHIFT; - cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT; + cpu_data->total += bin->hits + bin->intercepts; - if (drv->states[i].target_residency_ns <= cpu_data->sleep_length_ns) { + if (target_residency_ns <= cpu_data->sleep_length_ns) { idx_timer = i; - if (drv->states[i].target_residency_ns <= measured_ns) - idx_hit = i; + if (target_residency_ns <= measured_ns) + idx_duration = i; } } - /* - * Update the "hits" and "misses" data for the state matching the sleep - * length. If it matches the measured idle duration too, this is a hit, - * so increase the "hits" metric for it then. Otherwise, this is a - * miss, so increase the "misses" metric for it. In the latter case - * also increase the "early hits" metric for the state that actually - * matches the measured idle duration. - */ - if (idx_timer >= 0) { - unsigned int hits = cpu_data->states[idx_timer].hits; - unsigned int misses = cpu_data->states[idx_timer].misses; - - hits -= hits >> DECAY_SHIFT; - misses -= misses >> DECAY_SHIFT; - - if (idx_timer > idx_hit) { - misses += PULSE; - if (idx_hit >= 0) - cpu_data->states[idx_hit].early_hits += PULSE; - } else { - hits += PULSE; - } + i = cpu_data->next_recent_idx++; + if (cpu_data->next_recent_idx >= NR_RECENT) + cpu_data->next_recent_idx = 0; - cpu_data->states[idx_timer].misses = misses; - cpu_data->states[idx_timer].hits = hits; - } + if (cpu_data->recent_idx[i] >= 0) + cpu_data->state_bins[cpu_data->recent_idx[i]].recent--; /* - * Save idle duration values corresponding to non-timer wakeups for - * pattern detection. + * If the measured idle duration falls into the same bin as the sleep + * length, this is a "hit", so update the "hits" metric for that bin. + * Otherwise, update the "intercepts" metric for the bin fallen into by + * the measured idle duration. */ - cpu_data->intervals[cpu_data->interval_idx++] = measured_ns; - if (cpu_data->interval_idx >= INTERVALS) - cpu_data->interval_idx = 0; + if (idx_timer == idx_duration) { + cpu_data->state_bins[idx_timer].hits += PULSE; + cpu_data->recent_idx[i] = -1; + } else { + cpu_data->state_bins[idx_duration].intercepts += PULSE; + cpu_data->state_bins[idx_duration].recent++; + cpu_data->recent_idx[i] = idx_duration; + } + + cpu_data->total += PULSE; } static bool teo_time_ok(u64 interval_ns) @@ -207,6 +246,12 @@ static bool teo_time_ok(u64 interval_ns) return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC; } +static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv) +{ + return (drv->states[idx].target_residency_ns + + drv->states[idx+1].target_residency_ns) / 2; +} + /** * teo_find_shallower_state - Find shallower idle state matching given duration. * @drv: cpuidle driver containing state data. @@ -216,7 +261,7 @@ static bool teo_time_ok(u64 interval_ns) */ static int teo_find_shallower_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, int state_idx, - u64 duration_ns) + s64 duration_ns) { int i; @@ -242,10 +287,18 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); - u64 duration_ns; - unsigned int hits, misses, early_hits; - int max_early_idx, prev_max_early_idx, constraint_idx, idx, i; + unsigned int idx_intercept_sum = 0; + unsigned int intercept_sum = 0; + unsigned int idx_recent_sum = 0; + unsigned int recent_sum = 0; + unsigned int idx_hit_sum = 0; + unsigned int hit_sum = 0; + int constraint_idx = 0; + int idx0 = 0, idx = -1; + bool alt_intercepts, alt_recent; ktime_t delta_tick; + s64 duration_ns; + int i; if (dev->last_state_idx >= 0) { teo_update(drv, dev); @@ -257,164 +310,150 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, duration_ns = tick_nohz_get_sleep_length(&delta_tick); cpu_data->sleep_length_ns = duration_ns; - hits = 0; - misses = 0; - early_hits = 0; - max_early_idx = -1; - prev_max_early_idx = -1; - constraint_idx = drv->state_count; - idx = -1; + /* Check if there is any choice in the first place. */ + if (drv->state_count < 2) { + idx = 0; + goto end; + } + if (!dev->states_usage[0].disable) { + idx = 0; + if (drv->states[1].target_residency_ns > duration_ns) + goto end; + } - for (i = 0; i < drv->state_count; i++) { + /* + * Find the deepest idle state whose target residency does not exceed + * the current sleep length and the deepest idle state not deeper than + * the former whose exit latency does not exceed the current latency + * constraint. Compute the sums of metrics for early wakeup pattern + * detection. + */ + for (i = 1; i < drv->state_count; i++) { + struct teo_bin *prev_bin = &cpu_data->state_bins[i-1]; struct cpuidle_state *s = &drv->states[i]; - if (dev->states_usage[i].disable) { - /* - * Ignore disabled states with target residencies beyond - * the anticipated idle duration. - */ - if (s->target_residency_ns > duration_ns) - continue; - - /* - * This state is disabled, so the range of idle duration - * values corresponding to it is covered by the current - * candidate state, but still the "hits" and "misses" - * metrics of the disabled state need to be used to - * decide whether or not the state covering the range in - * question is good enough. - */ - hits = cpu_data->states[i].hits; - misses = cpu_data->states[i].misses; - - if (early_hits >= cpu_data->states[i].early_hits || - idx < 0) - continue; - - /* - * If the current candidate state has been the one with - * the maximum "early hits" metric so far, the "early - * hits" metric of the disabled state replaces the - * current "early hits" count to avoid selecting a - * deeper state with lower "early hits" metric. - */ - if (max_early_idx == idx) { - early_hits = cpu_data->states[i].early_hits; - continue; - } - - /* - * The current candidate state is closer to the disabled - * one than the current maximum "early hits" state, so - * replace the latter with it, but in case the maximum - * "early hits" state index has not been set so far, - * check if the current candidate state is not too - * shallow for that role. - */ - if (teo_time_ok(drv->states[idx].target_residency_ns)) { - prev_max_early_idx = max_early_idx; - early_hits = cpu_data->states[i].early_hits; - max_early_idx = idx; - } + /* + * Update the sums of idle state mertics for all of the states + * shallower than the current one. + */ + intercept_sum += prev_bin->intercepts; + hit_sum += prev_bin->hits; + recent_sum += prev_bin->recent; + if (dev->states_usage[i].disable) continue; - } if (idx < 0) { idx = i; /* first enabled state */ - hits = cpu_data->states[i].hits; - misses = cpu_data->states[i].misses; + idx0 = i; } if (s->target_residency_ns > duration_ns) break; - if (s->exit_latency_ns > latency_req && constraint_idx > i) + idx = i; + + if (s->exit_latency_ns <= latency_req) constraint_idx = i; - idx = i; - hits = cpu_data->states[i].hits; - misses = cpu_data->states[i].misses; - - if (early_hits < cpu_data->states[i].early_hits && - teo_time_ok(drv->states[i].target_residency_ns)) { - prev_max_early_idx = max_early_idx; - early_hits = cpu_data->states[i].early_hits; - max_early_idx = i; - } + idx_intercept_sum = intercept_sum; + idx_hit_sum = hit_sum; + idx_recent_sum = recent_sum; } - /* - * If the "hits" metric of the idle state matching the sleep length is - * greater than its "misses" metric, that is the one to use. Otherwise, - * it is more likely that one of the shallower states will match the - * idle duration observed after wakeup, so take the one with the maximum - * "early hits" metric, but if that cannot be determined, just use the - * state selected so far. - */ - if (hits <= misses) { - /* - * The current candidate state is not suitable, so take the one - * whose "early hits" metric is the maximum for the range of - * shallower states. - */ - if (idx == max_early_idx) - max_early_idx = prev_max_early_idx; - - if (max_early_idx >= 0) { - idx = max_early_idx; - duration_ns = drv->states[idx].target_residency_ns; - } + /* Avoid unnecessary overhead. */ + if (idx < 0) { + idx = 0; /* No states enabled, must use 0. */ + goto end; + } else if (idx == idx0) { + goto end; } /* - * If there is a latency constraint, it may be necessary to use a - * shallower idle state than the one selected so far. + * If the sum of the intercepts metric for all of the idle states + * shallower than the current candidate one (idx) is greater than the + * sum of the intercepts and hits metrics for the candidate state and + * all of the deeper states, or the sum of the numbers of recent + * intercepts over all of the states shallower than the candidate one + * is greater than a half of the number of recent events taken into + * account, the CPU is likely to wake up early, so find an alternative + * idle state to select. */ - if (constraint_idx < idx) - idx = constraint_idx; - - if (idx < 0) { - idx = 0; /* No states enabled. Must use 0. */ - } else if (idx > 0) { - unsigned int count = 0; - u64 sum = 0; + alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum; + alt_recent = idx_recent_sum > NR_RECENT / 2; + if (alt_recent || alt_intercepts) { + s64 first_suitable_span_ns = duration_ns; + int first_suitable_idx = idx; /* - * Count and sum the most recent idle duration values less than - * the current expected idle duration value. + * Look for the deepest idle state whose target residency had + * not exceeded the idle duration in over a half of the relevant + * cases (both with respect to intercepts overall and with + * respect to the recent intercepts only) in the past. + * + * Take the possible latency constraint and duration limitation + * present if the tick has been stopped already into account. */ - for (i = 0; i < INTERVALS; i++) { - u64 val = cpu_data->intervals[i]; + intercept_sum = 0; + recent_sum = 0; + + for (i = idx - 1; i >= 0; i--) { + struct teo_bin *bin = &cpu_data->state_bins[i]; + s64 span_ns; + + intercept_sum += bin->intercepts; + recent_sum += bin->recent; + + span_ns = teo_middle_of_bin(i, drv); + + if ((!alt_recent || 2 * recent_sum > idx_recent_sum) && + (!alt_intercepts || + 2 * intercept_sum > idx_intercept_sum)) { + if (teo_time_ok(span_ns) && + !dev->states_usage[i].disable) { + idx = i; + duration_ns = span_ns; + } else { + /* + * The current state is too shallow or + * disabled, so take the first enabled + * deeper state with suitable time span. + */ + idx = first_suitable_idx; + duration_ns = first_suitable_span_ns; + } + break; + } - if (val >= duration_ns) + if (dev->states_usage[i].disable) continue; - count++; - sum += val; - } + if (!teo_time_ok(span_ns)) { + /* + * The current state is too shallow, but if an + * alternative candidate state has been found, + * it may still turn out to be a better choice. + */ + if (first_suitable_idx != idx) + continue; - /* - * Give up unless the majority of the most recent idle duration - * values are in the interesting range. - */ - if (count > INTERVALS / 2) { - u64 avg_ns = div64_u64(sum, count); - - /* - * Avoid spending too much time in an idle state that - * would be too shallow. - */ - if (teo_time_ok(avg_ns)) { - duration_ns = avg_ns; - if (drv->states[idx].target_residency_ns > avg_ns) - idx = teo_find_shallower_state(drv, dev, - idx, avg_ns); + break; } + + first_suitable_span_ns = span_ns; + first_suitable_idx = i; } } /* + * If there is a latency constraint, it may be necessary to select an + * idle state shallower than the current candidate one. + */ + if (idx > constraint_idx) + idx = constraint_idx; + +end: + /* * Don't stop the tick if the selected state is a polling one or if the * expected idle duration is shorter than the tick period length. */ @@ -428,7 +467,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * till the closest timer including the tick, try to correct * that. */ - if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) + if (idx > idx0 && + drv->states[idx].target_residency_ns > delta_tick) idx = teo_find_shallower_state(drv, dev, idx, delta_tick); } @@ -472,8 +512,8 @@ static int teo_enable_device(struct cpuidle_driver *drv, memset(cpu_data, 0, sizeof(*cpu_data)); - for (i = 0; i < INTERVALS; i++) - cpu_data->intervals[i] = U64_MAX; + for (i = 0; i < NR_RECENT; i++) + cpu_data->recent_idx[i] = -1; return 0; } diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index cdeedbf02646..2b496a53cbca 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -18,14 +18,6 @@ #include "cpuidle.h" -static unsigned int sysfs_switch; -static int __init cpuidle_sysfs_setup(char *unused) -{ - sysfs_switch = 1; - return 1; -} -__setup("cpuidle_sysfs_switch", cpuidle_sysfs_setup); - static ssize_t show_available_governors(struct device *dev, struct device_attribute *attr, char *buf) @@ -35,10 +27,10 @@ static ssize_t show_available_governors(struct device *dev, mutex_lock(&cpuidle_lock); list_for_each_entry(tmp, &cpuidle_governors, governor_list) { - if (i >= (ssize_t) ((PAGE_SIZE/sizeof(char)) - - CPUIDLE_NAME_LEN - 2)) + if (i >= (ssize_t) (PAGE_SIZE - (CPUIDLE_NAME_LEN + 2))) goto out; - i += scnprintf(&buf[i], CPUIDLE_NAME_LEN, "%s ", tmp->name); + + i += scnprintf(&buf[i], CPUIDLE_NAME_LEN + 1, "%s ", tmp->name); } out: @@ -85,58 +77,43 @@ static ssize_t store_current_governor(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - char gov_name[CPUIDLE_NAME_LEN]; - int ret = -EINVAL; - size_t len = count; + char gov_name[CPUIDLE_NAME_LEN + 1]; + int ret; struct cpuidle_governor *gov; - if (!len || len >= sizeof(gov_name)) + ret = sscanf(buf, "%" __stringify(CPUIDLE_NAME_LEN) "s", gov_name); + if (ret != 1) return -EINVAL; - memcpy(gov_name, buf, len); - gov_name[len] = '\0'; - if (gov_name[len - 1] == '\n') - gov_name[--len] = '\0'; - mutex_lock(&cpuidle_lock); - + ret = -EINVAL; list_for_each_entry(gov, &cpuidle_governors, governor_list) { - if (strlen(gov->name) == len && !strcmp(gov->name, gov_name)) { + if (!strncmp(gov->name, gov_name, CPUIDLE_NAME_LEN)) { ret = cpuidle_switch_governor(gov); break; } } - mutex_unlock(&cpuidle_lock); - if (ret) - return ret; - else - return count; + return ret ? ret : count; } -static DEVICE_ATTR(current_driver, 0444, show_current_driver, NULL); -static DEVICE_ATTR(current_governor_ro, 0444, show_current_governor, NULL); - -static struct attribute *cpuidle_default_attrs[] = { - &dev_attr_current_driver.attr, - &dev_attr_current_governor_ro.attr, - NULL -}; - static DEVICE_ATTR(available_governors, 0444, show_available_governors, NULL); +static DEVICE_ATTR(current_driver, 0444, show_current_driver, NULL); static DEVICE_ATTR(current_governor, 0644, show_current_governor, - store_current_governor); + store_current_governor); +static DEVICE_ATTR(current_governor_ro, 0444, show_current_governor, NULL); -static struct attribute *cpuidle_switch_attrs[] = { +static struct attribute *cpuidle_attrs[] = { &dev_attr_available_governors.attr, &dev_attr_current_driver.attr, &dev_attr_current_governor.attr, + &dev_attr_current_governor_ro.attr, NULL }; static struct attribute_group cpuidle_attr_group = { - .attrs = cpuidle_default_attrs, + .attrs = cpuidle_attrs, .name = "cpuidle", }; @@ -146,9 +123,6 @@ static struct attribute_group cpuidle_attr_group = { */ int cpuidle_add_interface(struct device *dev) { - if (sysfs_switch) - cpuidle_attr_group.attrs = cpuidle_switch_attrs; - return sysfs_create_group(&dev->kobj, &cpuidle_attr_group); } @@ -167,11 +141,6 @@ struct cpuidle_attr { ssize_t (*store)(struct cpuidle_device *, const char *, size_t count); }; -#define define_one_ro(_name, show) \ - static struct cpuidle_attr attr_##_name = __ATTR(_name, 0444, show, NULL) -#define define_one_rw(_name, show, store) \ - static struct cpuidle_attr attr_##_name = __ATTR(_name, 0644, show, store) - #define attr_to_cpuidleattr(a) container_of(a, struct cpuidle_attr, attr) struct cpuidle_device_kobj { @@ -287,6 +256,7 @@ define_show_state_time_function(exit_latency) define_show_state_time_function(target_residency) define_show_state_function(power_usage) define_show_state_ull_function(usage) +define_show_state_ull_function(rejected) define_show_state_str_function(name) define_show_state_str_function(desc) define_show_state_ull_function(above) @@ -343,6 +313,7 @@ define_one_state_ro(latency, show_state_exit_latency); define_one_state_ro(residency, show_state_target_residency); define_one_state_ro(power, show_state_power_usage); define_one_state_ro(usage, show_state_usage); +define_one_state_ro(rejected, show_state_rejected); define_one_state_ro(time, show_state_time); define_one_state_rw(disable, show_state_disable, store_state_disable); define_one_state_ro(above, show_state_above); @@ -356,6 +327,7 @@ static struct attribute *cpuidle_state_default_attrs[] = { &attr_residency.attr, &attr_power.attr, &attr_usage.attr, + &attr_rejected.attr, &attr_time.attr, &attr_disable.attr, &attr_above.attr, @@ -363,6 +335,7 @@ static struct attribute *cpuidle_state_default_attrs[] = { &attr_default_status.attr, NULL }; +ATTRIBUTE_GROUPS(cpuidle_state_default); struct cpuidle_state_kobj { struct cpuidle_state *state; @@ -431,12 +404,12 @@ static inline void cpuidle_remove_s2idle_attr_group(struct cpuidle_state_kobj *k #define attr_to_stateattr(a) container_of(a, struct cpuidle_state_attr, attr) static ssize_t cpuidle_state_show(struct kobject *kobj, struct attribute *attr, - char * buf) + char *buf) { int ret = -EIO; struct cpuidle_state *state = kobj_to_state(kobj); struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj); - struct cpuidle_state_attr * cattr = attr_to_stateattr(attr); + struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); if (cattr->show) ret = cattr->show(state, state_usage, buf); @@ -476,7 +449,7 @@ static void cpuidle_state_sysfs_release(struct kobject *kobj) static struct kobj_type ktype_state_cpuidle = { .sysfs_ops = &cpuidle_state_sysfs_ops, - .default_attrs = cpuidle_state_default_attrs, + .default_groups = cpuidle_state_default_groups, .release = cpuidle_state_sysfs_release, }; @@ -515,6 +488,7 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device) ret = kobject_init_and_add(&kobj->kobj, &ktype_state_cpuidle, &kdev->kobj, "state%d", i); if (ret) { + kobject_put(&kobj->kobj); kfree(kobj); goto error_state; } @@ -532,7 +506,7 @@ error_state: } /** - * cpuidle_remove_driver_sysfs - removes the cpuidle states sysfs attributes + * cpuidle_remove_state_sysfs - removes the cpuidle states sysfs attributes * @device: the target device */ static void cpuidle_remove_state_sysfs(struct cpuidle_device *device) @@ -618,10 +592,11 @@ static struct attribute *cpuidle_driver_default_attrs[] = { &attr_driver_name.attr, NULL }; +ATTRIBUTE_GROUPS(cpuidle_driver_default); static struct kobj_type ktype_driver_cpuidle = { .sysfs_ops = &cpuidle_driver_sysfs_ops, - .default_attrs = cpuidle_driver_default_attrs, + .default_groups = cpuidle_driver_default_groups, .release = cpuidle_driver_sysfs_release, }; @@ -646,6 +621,7 @@ static int cpuidle_add_driver_sysfs(struct cpuidle_device *dev) ret = kobject_init_and_add(&kdrv->kobj, &ktype_driver_cpuidle, &kdev->kobj, "driver"); if (ret) { + kobject_put(&kdrv->kobj); kfree(kdrv); return ret; } @@ -733,17 +709,18 @@ int cpuidle_add_sysfs(struct cpuidle_device *dev) if (!kdev) return -ENOMEM; kdev->dev = dev; - dev->kobj_dev = kdev; init_completion(&kdev->kobj_unregister); error = kobject_init_and_add(&kdev->kobj, &ktype_cpuidle, &cpu_dev->kobj, "cpuidle"); if (error) { + kobject_put(&kdev->kobj); kfree(kdev); return error; } + dev->kobj_dev = kdev; kobject_uevent(&kdev->kobj, KOBJ_ADD); return 0; |