28 files changed, 2402 insertions, 676 deletions
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index c0aeedd66f02..ff71dd662880 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -47,6 +47,10 @@ config CPU_IDLE_GOV_HALTPOLL
 config DT_IDLE_STATES
 	bool
 
+config DT_IDLE_GENPD
+	depends on PM_GENERIC_DOMAINS_OF
+	bool
+
 menu "ARM CPU Idle Drivers"
 depends on ARM || ARM64
 source "drivers/cpuidle/Kconfig.arm"
@@ -62,6 +66,11 @@ depends on PPC
 source "drivers/cpuidle/Kconfig.powerpc"
 endmenu
 
+menu "RISC-V CPU Idle Drivers"
+depends on RISCV
+source "drivers/cpuidle/Kconfig.riscv"
+endmenu
+
 config HALTPOLL_CPUIDLE
 	tristate "Halt poll cpuidle driver"
 	depends on X86 && KVM_GUEST
diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm
index 62272ecfa771..747aa537389b 100644
--- a/drivers/cpuidle/Kconfig.arm
+++ b/drivers/cpuidle/Kconfig.arm
@@ -3,7 +3,8 @@
 # ARM CPU Idle drivers
 #
 config ARM_CPUIDLE
-	bool "Generic ARM/ARM64 CPU idle Driver"
+	bool "Generic ARM CPU idle Driver"
+	depends on ARM
 	select DT_IDLE_STATES
 	select CPU_IDLE_MULTIPLE_DRIVERS
 	help
@@ -23,6 +24,17 @@ config ARM_PSCI_CPUIDLE
 	  It provides an idle driver that is capable of detecting and
 	  managing idle states through the PSCI firmware interface.
 
+config ARM_PSCI_CPUIDLE_DOMAIN
+	bool "PSCI CPU idle Domain"
+	depends on ARM_PSCI_CPUIDLE
+	depends on PM_GENERIC_DOMAINS_OF
+	select DT_IDLE_GENPD
+	default y
+	help
+	  Select this to enable the PSCI based CPUidle driver to use PM domains,
+	  which is needed to support the hierarchical DT based layout of the
+	  idle states.
+
 config ARM_BIG_LITTLE_CPUIDLE
 	bool "Support for ARM big.LITTLE processors"
 	depends on ARCH_VEXPRESS_TC2_PM || ARCH_EXYNOS || COMPILE_TEST
@@ -86,3 +98,25 @@ config ARM_MVEBU_V7_CPUIDLE
 	depends on (ARCH_MVEBU || COMPILE_TEST) && !ARM64
 	help
 	  Select this to enable cpuidle on Armada 370, 38x and XP processors.
+
+config ARM_TEGRA_CPUIDLE
+	bool "CPU Idle Driver for NVIDIA Tegra SoCs"
+	depends on (ARCH_TEGRA || COMPILE_TEST) && !ARM64 && MMU
+	select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP
+	select ARM_CPU_SUSPEND
+	help
+	  Select this to enable cpuidle for NVIDIA Tegra20/30/114/124 SoCs.
+
+config ARM_QCOM_SPM_CPUIDLE
+	bool "CPU Idle Driver for Qualcomm Subsystem Power Manager (SPM)"
+	depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64 && MMU
+	select ARM_CPU_SUSPEND
+	select CPU_IDLE_MULTIPLE_DRIVERS
+	select DT_IDLE_STATES
+	select QCOM_SCM
+	select QCOM_SPM
+	help
+	  Select this to enable cpuidle for Qualcomm processors.
+	  The Subsystem Power Manager (SPM) controls low power modes for the
+	  CPU and L2 cores. It interfaces with various system drivers to put
+	  the cores in low power modes.
diff --git a/drivers/cpuidle/Kconfig.riscv b/drivers/cpuidle/Kconfig.riscv
new file mode 100644
index 000000000000..78518c26af74
--- /dev/null
+++ b/drivers/cpuidle/Kconfig.riscv
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# RISC-V CPU Idle drivers
+#
+
+config RISCV_SBI_CPUIDLE
+	bool "RISC-V SBI CPU idle Driver"
+	depends on RISCV_SBI
+	select DT_IDLE_STATES
+	select CPU_IDLE_MULTIPLE_DRIVERS
+	select DT_IDLE_GENPD if PM_GENERIC_DOMAINS_OF
+	help
+	  Select this option to enable RISC-V SBI firmware based CPU idle
+	  driver for RISC-V systems. This driver also supports hierarchical
+	  DT based layout of the idle states.
diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
index cc8c769d7fa9..d103342b7cfc 100644
--- a/drivers/cpuidle/Makefile
+++ b/drivers/cpuidle/Makefile
@@ -6,6 +6,7 @@
 obj-y += cpuidle.o driver.o governor.o sysfs.o governors/
 obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
 obj-$(CONFIG_DT_IDLE_STATES)		  += dt_idle_states.o
+obj-$(CONFIG_DT_IDLE_GENPD)		  += dt_idle_genpd.o
 obj-$(CONFIG_ARCH_HAS_CPU_RELAX)	  += poll_state.o
 obj-$(CONFIG_HALTPOLL_CPUIDLE)		  += cpuidle-haltpoll.o
 
@@ -21,9 +22,10 @@ obj-$(CONFIG_ARM_U8500_CPUIDLE)         += cpuidle-ux500.o
 obj-$(CONFIG_ARM_AT91_CPUIDLE)          += cpuidle-at91.o
 obj-$(CONFIG_ARM_EXYNOS_CPUIDLE)        += cpuidle-exynos.o
 obj-$(CONFIG_ARM_CPUIDLE)		+= cpuidle-arm.o
-obj-$(CONFIG_ARM_PSCI_CPUIDLE)		+= cpuidle_psci.o
-cpuidle_psci-y				:= cpuidle-psci.o
-cpuidle_psci-$(CONFIG_PM_GENERIC_DOMAINS_OF) += cpuidle-psci-domain.o
+obj-$(CONFIG_ARM_PSCI_CPUIDLE)		+= cpuidle-psci.o
+obj-$(CONFIG_ARM_PSCI_CPUIDLE_DOMAIN)	+= cpuidle-psci-domain.o
+obj-$(CONFIG_ARM_TEGRA_CPUIDLE)		+= cpuidle-tegra.o
+obj-$(CONFIG_ARM_QCOM_SPM_CPUIDLE)	+= cpuidle-qcom-spm.o
 
 ###############################################################################
 # MIPS drivers
@@ -33,3 +35,7 @@ obj-$(CONFIG_MIPS_CPS_CPUIDLE)		+= cpuidle-cps.o
 # POWERPC drivers
 obj-$(CONFIG_PSERIES_CPUIDLE)		+= cpuidle-pseries.o
 obj-$(CONFIG_POWERNV_CPUIDLE)		+= cpuidle-powernv.o
+
+###############################################################################
+# RISC-V drivers
+obj-$(CONFIG_RISCV_SBI_CPUIDLE)		+= cpuidle-riscv-sbi.o
diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 04003b90dc49..9acde71558d5 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -54,7 +54,7 @@
  * variable is not locked.  It is only written from the cpu that
  * it stores (or by the on/offlining cpu if that cpu is offline),
  * and only read after all the cpus are ready for the coupled idle
- * state are are no longer updating it.
+ * state and are no longer updating it.
  *
  * Three atomic counters are used.  alive_count tracks the number
  * of cpus in the coupled set that are currently or soon will be
@@ -674,8 +674,7 @@ have_coupled:
 	coupled->refcnt++;
 
 	csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu);
-	csd->func = cpuidle_coupled_handle_poke;
-	csd->info = (void *)(unsigned long)dev->cpu;
+	INIT_CSD(csd, cpuidle_coupled_handle_poke, (void *)(unsigned long)dev->cpu);
 
 	return 0;
 }
diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c
index 9e5156d39627..8c758920d699 100644
--- a/drivers/cpuidle/cpuidle-arm.c
+++ b/drivers/cpuidle/cpuidle-arm.c
@@ -8,6 +8,7 @@
 
 #define pr_fmt(fmt) "CPUidle arm: " fmt
 
+#include <linux/cpu_cooling.h>
 #include <linux/cpuidle.h>
 #include <linux/cpumask.h>
 #include <linux/cpu_pm.h>
@@ -124,6 +125,8 @@ static int __init arm_idle_init_cpu(int cpu)
 	if (ret)
 		goto out_kfree_drv;
 
+	cpuidle_cooling_register(drv);
+
 	return 0;
 
 out_kfree_drv:
diff --git a/drivers/cpuidle/cpuidle-at91.c b/drivers/cpuidle/cpuidle-at91.c
index 9c5853b6ca4a..45ee8e1e71ae 100644
--- a/drivers/cpuidle/cpuidle-at91.c
+++ b/drivers/cpuidle/cpuidle-at91.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * based on arch/arm/mach-kirkwood/cpuidle.c
  *
  * CPU idle support for AT91 SoC
  *
- * This file is licensed under the terms of the GNU General Public
- * License version 2.  This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- *
  * The cpu idle uses wait-for-interrupt and RAM self refresh in order
  * to implement two idle states -
  * #1 wait-for-interrupt
diff --git a/drivers/cpuidle/cpuidle-big_little.c b/drivers/cpuidle/cpuidle-big_little.c
index 7f8ddc04342d..abe51185f243 100644
--- a/drivers/cpuidle/cpuidle-big_little.c
+++ b/drivers/cpuidle/cpuidle-big_little.c
@@ -155,8 +155,7 @@ static int __init bl_idle_driver_init(struct cpuidle_driver *drv, int part_id)
 
 static const struct of_device_id compatible_machine_match[] = {
 	{ .compatible = "arm,vexpress,v2p-ca15_a7" },
-	{ .compatible = "samsung,exynos5420" },
-	{ .compatible = "samsung,exynos5800" },
+	{ .compatible = "google,peach" },
 	{},
 };
 
diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c
index b0ce9bc78113..3a39a7f48b77 100644
--- a/drivers/cpuidle/cpuidle-haltpoll.c
+++ b/drivers/cpuidle/cpuidle-haltpoll.c
@@ -18,6 +18,10 @@
 #include <linux/kvm_para.h>
 #include <linux/cpuidle_haltpoll.h>
 
+static bool force __read_mostly;
+module_param(force, bool, 0444);
+MODULE_PARM_DESC(force, "Load unconditionally");
+
 static struct cpuidle_device __percpu *haltpoll_cpuidle_devices;
 static enum cpuhp_state haltpoll_hp_state;
 
@@ -90,6 +94,11 @@ static void haltpoll_uninit(void)
 	haltpoll_cpuidle_devices = NULL;
 }
 
+static bool haltpoll_want(void)
+{
+	return kvm_para_has_hint(KVM_HINTS_REALTIME) || force;
+}
+
 static int __init haltpoll_init(void)
 {
 	int ret;
@@ -99,12 +108,11 @@ static int __init haltpoll_init(void)
 	if (boot_option_idle_override != IDLE_NO_OVERRIDE)
 		return -ENODEV;
 
-	cpuidle_poll_state_init(drv);
-
-	if (!kvm_para_available() ||
-		!kvm_para_has_hint(KVM_HINTS_REALTIME))
+	if (!kvm_para_available() || !haltpoll_want())
 		return -ENODEV;
 
+	cpuidle_poll_state_init(drv);
+
 	ret = cpuidle_register_driver(drv);
 	if (ret < 0)
 		return ret;
diff --git a/drivers/cpuidle/cpuidle-kirkwood.c b/drivers/cpuidle/cpuidle-kirkwood.c
index 511c4f46027a..13bf743f885b 100644
--- a/drivers/cpuidle/cpuidle-kirkwood.c
+++ b/drivers/cpuidle/cpuidle-kirkwood.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * CPU idle Marvell Kirkwood SoCs
  *
- * This file is licensed under the terms of the GNU General Public
- * License version 2.  This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- *
  * The cpu idle uses wait-for-interrupt and DDR self refresh in order
  * to implement two idle states -
  * #1 wait-for-interrupt
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 1b299e801f74..0b5461b3d7dd 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -141,7 +141,7 @@ static int stop_loop(struct cpuidle_device *dev,
 		     struct cpuidle_driver *drv,
 		     int index)
 {
-	power9_idle_type(stop_psscr_table[index].val,
+	arch300_idle_type(stop_psscr_table[index].val,
 			 stop_psscr_table[index].mask);
 	return index;
 }
@@ -233,8 +233,8 @@ static inline void add_powernv_state(int index, const char *name,
 				     unsigned int exit_latency,
 				     u64 psscr_val, u64 psscr_mask)
 {
-	strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
-	strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);
+	strscpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
+	strscpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);
 	powernv_states[index].flags = flags;
 	powernv_states[index].target_residency = target_residency;
 	powernv_states[index].exit_latency = exit_latency;
@@ -244,20 +244,6 @@ static inline void add_powernv_state(int index, const char *name,
 	stop_psscr_table[index].mask = psscr_mask;
 }
 
-/*
- * Returns 0 if prop1_len == prop2_len. Else returns -1
- */
-static inline int validate_dt_prop_sizes(const char *prop1, int prop1_len,
-					 const char *prop2, int prop2_len)
-{
-	if (prop1_len == prop2_len)
-		return 0;
-
-	pr_warn("cpuidle-powernv: array sizes don't match for %s and %s\n",
-		prop1, prop2);
-	return -1;
-}
-
 extern u32 pnv_get_supported_cpuidle_states(void);
 static int powernv_add_idle_states(void)
 {
diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c
index 423f03bbeb74..821984947ed9 100644
--- a/drivers/cpuidle/cpuidle-psci-domain.c
+++ b/drivers/cpuidle/cpuidle-psci-domain.c
@@ -12,6 +12,7 @@
 #include <linux/cpu.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
+#include <linux/platform_device.h>
 #include <linux/pm_domain.h>
 #include <linux/pm_runtime.h>
 #include <linux/psci.h>
@@ -26,7 +27,7 @@ struct psci_pd_provider {
 };
 
 static LIST_HEAD(psci_pd_providers);
-static bool osi_mode_enabled __initdata;
+static bool psci_pd_allow_domain_state;
 
 static int psci_pd_power_off(struct generic_pm_domain *pd)
 {
@@ -36,6 +37,9 @@ static int psci_pd_power_off(struct generic_pm_domain *pd)
 	if (!state->data)
 		return 0;
 
+	if (!psci_pd_allow_domain_state)
+		return -EBUSY;
+
 	/* OSI mode is enabled, set the corresponding domain state. */
 	pd_state = state->data;
 	psci_set_domain_state(*pd_state);
@@ -43,73 +47,14 @@ static int psci_pd_power_off(struct generic_pm_domain *pd)
 	return 0;
 }
 
-static int __init psci_pd_parse_state_nodes(struct genpd_power_state *states,
-					int state_count)
-{
-	int i, ret;
-	u32 psci_state, *psci_state_buf;
-
-	for (i = 0; i < state_count; i++) {
-		ret = psci_dt_parse_state_node(to_of_node(states[i].fwnode),
-					&psci_state);
-		if (ret)
-			goto free_state;
-
-		psci_state_buf = kmalloc(sizeof(u32), GFP_KERNEL);
-		if (!psci_state_buf) {
-			ret = -ENOMEM;
-			goto free_state;
-		}
-		*psci_state_buf = psci_state;
-		states[i].data = psci_state_buf;
-	}
-
-	return 0;
-
-free_state:
-	i--;
-	for (; i >= 0; i--)
-		kfree(states[i].data);
-	return ret;
-}
-
-static int __init psci_pd_parse_states(struct device_node *np,
-			struct genpd_power_state **states, int *state_count)
-{
-	int ret;
-
-	/* Parse the domain idle states. */
-	ret = of_genpd_parse_idle_states(np, states, state_count);
-	if (ret)
-		return ret;
-
-	/* Fill out the PSCI specifics for each found state. */
-	ret = psci_pd_parse_state_nodes(*states, *state_count);
-	if (ret)
-		kfree(*states);
-
-	return ret;
-}
-
-static void psci_pd_free_states(struct genpd_power_state *states,
-				unsigned int state_count)
-{
-	int i;
-
-	for (i = 0; i < state_count; i++)
-		kfree(states[i].data);
-	kfree(states);
-}
-
-static int __init psci_pd_init(struct device_node *np)
+static int psci_pd_init(struct device_node *np, bool use_osi)
 {
 	struct generic_pm_domain *pd;
 	struct psci_pd_provider *pd_provider;
 	struct dev_power_governor *pd_gov;
-	struct genpd_power_state *states = NULL;
-	int ret = -ENOMEM, state_count = 0;
+	int ret = -ENOMEM;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	pd = dt_idle_pd_alloc(np, psci_dt_parse_state_node);
 	if (!pd)
 		goto out;
 
@@ -117,33 +62,20 @@ static int __init psci_pd_init(struct device_node *np)
 	if (!pd_provider)
 		goto free_pd;
 
-	pd->name = kasprintf(GFP_KERNEL, "%pOF", np);
-	if (!pd->name)
-		goto free_pd_prov;
-
-	/*
-	 * Parse the domain idle states and let genpd manage the state selection
-	 * for those being compatible with "domain-idle-state".
-	 */
-	ret = psci_pd_parse_states(np, &states, &state_count);
-	if (ret)
-		goto free_name;
-
-	pd->free_states = psci_pd_free_states;
-	pd->name = kbasename(pd->name);
-	pd->power_off = psci_pd_power_off;
-	pd->states = states;
-	pd->state_count = state_count;
 	pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN;
 
+	/* Allow power off when OSI has been successfully enabled. */
+	if (use_osi)
+		pd->power_off = psci_pd_power_off;
+	else
+		pd->flags |= GENPD_FLAG_ALWAYS_ON;
+
 	/* Use governor for CPU PM domains if it has some states to manage. */
-	pd_gov = state_count > 0 ? &pm_domain_cpu_gov : NULL;
+	pd_gov = pd->states ? &pm_domain_cpu_gov : NULL;
 
 	ret = pm_genpd_init(pd, pd_gov, false);
-	if (ret) {
-		psci_pd_free_states(states, state_count);
-		goto free_name;
-	}
+	if (ret)
+		goto free_pd_prov;
 
 	ret = of_genpd_add_provider_simple(np, pd);
 	if (ret)
@@ -157,18 +89,16 @@ static int __init psci_pd_init(struct device_node *np)
 
 remove_pd:
 	pm_genpd_remove(pd);
-free_name:
-	kfree(pd->name);
 free_pd_prov:
 	kfree(pd_provider);
 free_pd:
-	kfree(pd);
+	dt_idle_pd_free(pd);
 out:
 	pr_err("failed to init PM domain ret=%d %pOF\n", ret, np);
 	return ret;
 }
 
-static void __init psci_pd_remove(void)
+static void psci_pd_remove(void)
 {
 	struct psci_pd_provider *pd_provider, *it;
 	struct generic_pm_domain *genpd;
@@ -186,59 +116,46 @@ static void __init psci_pd_remove(void)
 	}
 }
 
-static int __init psci_pd_init_topology(struct device_node *np, bool add)
+static bool psci_pd_try_set_osi_mode(void)
 {
-	struct device_node *node;
-	struct of_phandle_args child, parent;
 	int ret;
 
-	for_each_child_of_node(np, node) {
-		if (of_parse_phandle_with_args(node, "power-domains",
-					"#power-domain-cells", 0, &parent))
-			continue;
-
-		child.np = node;
-		child.args_count = 0;
-
-		ret = add ? of_genpd_add_subdomain(&parent, &child) :
-			of_genpd_remove_subdomain(&parent, &child);
-		of_node_put(parent.np);
-		if (ret) {
-			of_node_put(node);
-			return ret;
-		}
-	}
+	if (!psci_has_osi_support())
+		return false;
 
-	return 0;
-}
+	ret = psci_set_osi_mode(true);
+	if (ret)
+		return false;
 
-static int __init psci_pd_add_topology(struct device_node *np)
-{
-	return psci_pd_init_topology(np, true);
+	return true;
 }
 
-static void __init psci_pd_remove_topology(struct device_node *np)
+static void psci_cpuidle_domain_sync_state(struct device *dev)
 {
-	psci_pd_init_topology(np, false);
+	/*
+	 * All devices have now been attached/probed to the PM domain topology,
+	 * hence it's fine to allow domain states to be picked.
+	 */
+	psci_pd_allow_domain_state = true;
 }
 
-static const struct of_device_id psci_of_match[] __initconst = {
+static const struct of_device_id psci_of_match[] = {
 	{ .compatible = "arm,psci-1.0" },
 	{}
 };
 
-static int __init psci_idle_init_domains(void)
+static int psci_cpuidle_domain_probe(struct platform_device *pdev)
 {
-	struct device_node *np = of_find_matching_node(NULL, psci_of_match);
+	struct device_node *np = pdev->dev.of_node;
 	struct device_node *node;
+	bool use_osi;
 	int ret = 0, pd_count = 0;
 
 	if (!np)
 		return -ENODEV;
 
-	/* Currently limit the hierarchical topology to be used in OSI mode. */
-	if (!psci_has_osi_support())
-		goto out;
+	/* If OSI mode is supported, let's try to enable it. */
+	use_osi = psci_pd_try_set_osi_mode();
 
 	/*
 	 * Parse child nodes for the "#power-domain-cells" property and
@@ -248,7 +165,7 @@ static int __init psci_idle_init_domains(void)
 		if (!of_find_property(node, "#power-domain-cells", NULL))
 			continue;
 
-		ret = psci_pd_init(node);
+		ret = psci_pd_init(node, use_osi);
 		if (ret)
 			goto put_node;
 
@@ -257,52 +174,38 @@ static int __init psci_idle_init_domains(void)
 
 	/* Bail out if not using the hierarchical CPU topology. */
 	if (!pd_count)
-		goto out;
+		goto no_pd;
 
 	/* Link genpd masters/subdomains to model the CPU topology. */
-	ret = psci_pd_add_topology(np);
+	ret = dt_idle_pd_init_topology(np);
 	if (ret)
 		goto remove_pd;
 
-	/* Try to enable OSI mode. */
-	ret = psci_set_osi_mode();
-	if (ret) {
-		pr_warn("failed to enable OSI mode: %d\n", ret);
-		psci_pd_remove_topology(np);
-		goto remove_pd;
-	}
-
-	osi_mode_enabled = true;
-	of_node_put(np);
 	pr_info("Initialized CPU PM domain topology\n");
-	return pd_count;
+	return 0;
 
 put_node:
 	of_node_put(node);
 remove_pd:
-	if (pd_count)
-		psci_pd_remove();
+	psci_pd_remove();
 	pr_err("failed to create CPU PM domains ret=%d\n", ret);
-out:
-	of_node_put(np);
+no_pd:
+	if (use_osi)
+		psci_set_osi_mode(false);
 	return ret;
 }
-subsys_initcall(psci_idle_init_domains);
 
-struct device __init *psci_dt_attach_cpu(int cpu)
-{
-	struct device *dev;
-
-	if (!osi_mode_enabled)
-		return NULL;
-
-	dev = dev_pm_domain_attach_by_name(get_cpu_device(cpu), "psci");
-	if (IS_ERR_OR_NULL(dev))
-		return dev;
-
-	pm_runtime_irq_safe(dev);
-	if (cpu_online(cpu))
-		pm_runtime_get_sync(dev);
+static struct platform_driver psci_cpuidle_domain_driver = {
+	.probe  = psci_cpuidle_domain_probe,
+	.driver = {
+		.name = "psci-cpuidle-domain",
+		.of_match_table = psci_of_match,
+		.sync_state = psci_cpuidle_domain_sync_state,
+	},
+};
 
-	return dev;
+static int __init psci_idle_init_domains(void)
+{
+	return platform_driver_register(&psci_cpuidle_domain_driver);
 }
+subsys_initcall(psci_idle_init_domains);
diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c
index edd7a54ef0d3..57bc3e3ae391 100644
--- a/drivers/cpuidle/cpuidle-psci.c
+++ b/drivers/cpuidle/cpuidle-psci.c
@@ -9,6 +9,7 @@
 #define pr_fmt(fmt) "CPUidle PSCI: " fmt
 
 #include <linux/cpuhotplug.h>
+#include <linux/cpu_cooling.h>
 #include <linux/cpuidle.h>
 #include <linux/cpumask.h>
 #include <linux/cpu_pm.h>
@@ -16,9 +17,13 @@
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
+#include <linux/platform_device.h>
 #include <linux/psci.h>
+#include <linux/pm_domain.h>
 #include <linux/pm_runtime.h>
 #include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/cpuidle.h>
 
@@ -32,7 +37,7 @@ struct psci_cpuidle_data {
 
 static DEFINE_PER_CPU_READ_MOSTLY(struct psci_cpuidle_data, psci_cpuidle_data);
 static DEFINE_PER_CPU(u32, domain_state);
-static bool psci_cpuidle_use_cpuhp __initdata;
+static bool psci_cpuidle_use_cpuhp;
 
 void psci_set_domain_state(u32 state)
 {
@@ -49,8 +54,9 @@ static inline int psci_enter_state(int idx, u32 state)
 	return CPU_PM_CPU_IDLE_ENTER_PARAM(psci_cpu_suspend_enter, idx, state);
 }
 
-static int psci_enter_domain_idle_state(struct cpuidle_device *dev,
-					struct cpuidle_driver *drv, int idx)
+static int __psci_enter_domain_idle_state(struct cpuidle_device *dev,
+					  struct cpuidle_driver *drv, int idx,
+					  bool s2idle)
 {
 	struct psci_cpuidle_data *data = this_cpu_ptr(&psci_cpuidle_data);
 	u32 *states = data->psci_states;
@@ -58,22 +64,51 @@ static int psci_enter_domain_idle_state(struct cpuidle_device *dev,
 	u32 state;
 	int ret;
 
+	ret = cpu_pm_enter();
+	if (ret)
+		return -1;
+
 	/* Do runtime PM to manage a hierarchical CPU toplogy. */
-	pm_runtime_put_sync_suspend(pd_dev);
+	ct_irq_enter_irqson();
+	if (s2idle)
+		dev_pm_genpd_suspend(pd_dev);
+	else
+		pm_runtime_put_sync_suspend(pd_dev);
+	ct_irq_exit_irqson();
 
 	state = psci_get_domain_state();
 	if (!state)
 		state = states[idx];
 
-	ret = psci_enter_state(idx, state);
+	ret = psci_cpu_suspend_enter(state) ? -1 : idx;
+
+	ct_irq_enter_irqson();
+	if (s2idle)
+		dev_pm_genpd_resume(pd_dev);
+	else
+		pm_runtime_get_sync(pd_dev);
+	ct_irq_exit_irqson();
 
-	pm_runtime_get_sync(pd_dev);
+	cpu_pm_exit();
 
 	/* Clear the domain state to start fresh when back from idle. */
 	psci_set_domain_state(0);
 	return ret;
 }
 
+static int psci_enter_domain_idle_state(struct cpuidle_device *dev,
+					struct cpuidle_driver *drv, int idx)
+{
+	return __psci_enter_domain_idle_state(dev, drv, idx, false);
+}
+
+static int psci_enter_s2idle_domain_idle_state(struct cpuidle_device *dev,
+					       struct cpuidle_driver *drv,
+					       int idx)
+{
+	return __psci_enter_domain_idle_state(dev, drv, idx, true);
+}
+
 static int psci_idle_cpuhp_up(unsigned int cpu)
 {
 	struct device *pd_dev = __this_cpu_read(psci_cpuidle_data.dev);
@@ -97,13 +132,58 @@ static int psci_idle_cpuhp_down(unsigned int cpu)
 	return 0;
 }
 
-static void __init psci_idle_init_cpuhp(void)
+static void psci_idle_syscore_switch(bool suspend)
+{
+	bool cleared = false;
+	struct device *dev;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		dev = per_cpu_ptr(&psci_cpuidle_data, cpu)->dev;
+
+		if (dev && suspend) {
+			dev_pm_genpd_suspend(dev);
+		} else if (dev) {
+			dev_pm_genpd_resume(dev);
+
+			/* Account for userspace having offlined a CPU. */
+			if (pm_runtime_status_suspended(dev))
+				pm_runtime_set_active(dev);
+
+			/* Clear domain state to re-start fresh. */
+			if (!cleared) {
+				psci_set_domain_state(0);
+				cleared = true;
+			}
+		}
+	}
+}
+
+static int psci_idle_syscore_suspend(void)
+{
+	psci_idle_syscore_switch(true);
+	return 0;
+}
+
+static void psci_idle_syscore_resume(void)
+{
+	psci_idle_syscore_switch(false);
+}
+
+static struct syscore_ops psci_idle_syscore_ops = {
+	.suspend = psci_idle_syscore_suspend,
+	.resume = psci_idle_syscore_resume,
+};
+
+static void psci_idle_init_cpuhp(void)
 {
 	int err;
 
 	if (!psci_cpuidle_use_cpuhp)
 		return;
 
+	register_syscore_ops(&psci_idle_syscore_ops);
+
 	err = cpuhp_setup_state_nocalls(CPUHP_AP_CPU_PM_STARTING,
 					"cpuidle/psci:online",
 					psci_idle_cpuhp_up,
@@ -120,30 +200,13 @@ static int psci_enter_idle_state(struct cpuidle_device *dev,
 	return psci_enter_state(idx, state[idx]);
 }
 
-static struct cpuidle_driver psci_idle_driver __initdata = {
-	.name = "psci_idle",
-	.owner = THIS_MODULE,
-	/*
-	 * PSCI idle states relies on architectural WFI to
-	 * be represented as state index 0.
-	 */
-	.states[0] = {
-		.enter                  = psci_enter_idle_state,
-		.exit_latency           = 1,
-		.target_residency       = 1,
-		.power_usage		= UINT_MAX,
-		.name                   = "WFI",
-		.desc                   = "ARM WFI",
-	}
-};
-
-static const struct of_device_id psci_idle_state_match[] __initconst = {
+static const struct of_device_id psci_idle_state_match[] = {
 	{ .compatible = "arm,idle-state",
 	  .data = psci_enter_idle_state },
 	{ },
 };
 
-int __init psci_dt_parse_state_node(struct device_node *np, u32 *state)
+int psci_dt_parse_state_node(struct device_node *np, u32 *state)
 {
 	int err = of_property_read_u32(np, "arm,psci-suspend-param", state);
 
@@ -160,9 +223,33 @@ int __init psci_dt_parse_state_node(struct device_node *np, u32 *state)
 	return 0;
 }
 
-static int __init psci_dt_cpu_init_idle(struct cpuidle_driver *drv,
-					struct device_node *cpu_node,
-					unsigned int state_count, int cpu)
+static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv,
+				     struct psci_cpuidle_data *data,
+				     unsigned int state_count, int cpu)
+{
+	/* Currently limit the hierarchical topology to be used in OSI mode. */
+	if (!psci_has_osi_support())
+		return 0;
+
+	data->dev = psci_dt_attach_cpu(cpu);
+	if (IS_ERR_OR_NULL(data->dev))
+		return PTR_ERR_OR_ZERO(data->dev);
+
+	/*
+	 * Using the deepest state for the CPU to trigger a potential selection
+	 * of a shared state for the domain, assumes the domain states are all
+	 * deeper states.
+	 */
+	drv->states[state_count - 1].enter = psci_enter_domain_idle_state;
+	drv->states[state_count - 1].enter_s2idle = psci_enter_s2idle_domain_idle_state;
+	psci_cpuidle_use_cpuhp = true;
+
+	return 0;
+}
+
+static int psci_dt_cpu_init_idle(struct device *dev, struct cpuidle_driver *drv,
+				 struct device_node *cpu_node,
+				 unsigned int state_count, int cpu)
 {
 	int i, ret = 0;
 	u32 *psci_states;
@@ -170,7 +257,8 @@ static int __init psci_dt_cpu_init_idle(struct cpuidle_driver *drv,
 	struct psci_cpuidle_data *data = per_cpu_ptr(&psci_cpuidle_data, cpu);
 
 	state_count++; /* Add WFI state too */
-	psci_states = kcalloc(state_count, sizeof(*psci_states), GFP_KERNEL);
+	psci_states = devm_kcalloc(dev, state_count, sizeof(*psci_states),
+				   GFP_KERNEL);
 	if (!psci_states)
 		return -ENOMEM;
 
@@ -183,47 +271,26 @@ static int __init psci_dt_cpu_init_idle(struct cpuidle_driver *drv,
 		of_node_put(state_node);
 
 		if (ret)
-			goto free_mem;
+			return ret;
 
 		pr_debug("psci-power-state %#x index %d\n", psci_states[i], i);
 	}
 
-	if (i != state_count) {
-		ret = -ENODEV;
-		goto free_mem;
-	}
-
-	/* Currently limit the hierarchical topology to be used in OSI mode. */
-	if (psci_has_osi_support()) {
-		data->dev = psci_dt_attach_cpu(cpu);
-		if (IS_ERR(data->dev)) {
-			ret = PTR_ERR(data->dev);
-			goto free_mem;
-		}
+	if (i != state_count)
+		return -ENODEV;
 
-		/*
-		 * Using the deepest state for the CPU to trigger a potential
-		 * selection of a shared state for the domain, assumes the
-		 * domain states are all deeper states.
-		 */
-		if (data->dev) {
-			drv->states[state_count - 1].enter =
-				psci_enter_domain_idle_state;
-			psci_cpuidle_use_cpuhp = true;
-		}
-	}
+	/* Initialize optional data, used for the hierarchical topology. */
+	ret = psci_dt_cpu_init_topology(drv, data, state_count, cpu);
+	if (ret < 0)
+		return ret;
 
 	/* Idle states parsed correctly, store them in the per-cpu struct. */
 	data->psci_states = psci_states;
 	return 0;
-
-free_mem:
-	kfree(psci_states);
-	return ret;
 }
 
-static __init int psci_cpu_init_idle(struct cpuidle_driver *drv,
-				     unsigned int cpu, unsigned int state_count)
+static int psci_cpu_init_idle(struct device *dev, struct cpuidle_driver *drv,
+			      unsigned int cpu, unsigned int state_count)
 {
 	struct device_node *cpu_node;
 	int ret;
@@ -239,14 +306,22 @@ static __init int psci_cpu_init_idle(struct cpuidle_driver *drv,
 	if (!cpu_node)
 		return -ENODEV;
 
-	ret = psci_dt_cpu_init_idle(drv, cpu_node, state_count, cpu);
+	ret = psci_dt_cpu_init_idle(dev, drv, cpu_node, state_count, cpu);
 
 	of_node_put(cpu_node);
 
 	return ret;
 }
 
-static int __init psci_idle_init_cpu(int cpu)
+static void psci_cpu_deinit_idle(int cpu)
+{
+	struct psci_cpuidle_data *data = per_cpu_ptr(&psci_cpuidle_data, cpu);
+
+	psci_dt_detach_cpu(data->dev);
+	psci_cpuidle_use_cpuhp = false;
+}
+
+static int psci_idle_init_cpu(struct device *dev, int cpu)
 {
 	struct cpuidle_driver *drv;
 	struct device_node *cpu_node;
@@ -269,17 +344,26 @@ static int __init psci_idle_init_cpu(int cpu)
 	if (ret)
 		return ret;
 
-	drv = kmemdup(&psci_idle_driver, sizeof(*drv), GFP_KERNEL);
+	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
 	if (!drv)
 		return -ENOMEM;
 
+	drv->name = "psci_idle";
+	drv->owner = THIS_MODULE;
 	drv->cpumask = (struct cpumask *)cpumask_of(cpu);
 
 	/*
-	 * Initialize idle states data, starting at index 1, since
-	 * by default idle state 0 is the quiescent state reached
-	 * by the cpu by executing the wfi instruction.
-	 *
+	 * PSCI idle states rely on architectural WFI to be represented as
+	 * state index 0.
+	 */
+	drv->states[0].enter = psci_enter_idle_state;
+	drv->states[0].exit_latency = 1;
+	drv->states[0].target_residency = 1;
+	drv->states[0].power_usage = UINT_MAX;
+	strcpy(drv->states[0].name, "WFI");
+	strcpy(drv->states[0].desc, "ARM WFI");
+
+	/*
 	 * If no DT idle states are detected (ret == 0) let the driver
 	 * initialization fail accordingly since there is no reason to
 	 * initialize the idle driver if only wfi is supported, the
@@ -287,46 +371,45 @@ static int __init psci_idle_init_cpu(int cpu)
 	 * on idle entry.
 	 */
 	ret = dt_init_idle_driver(drv, psci_idle_state_match, 1);
-	if (ret <= 0) {
-		ret = ret ? : -ENODEV;
-		goto out_kfree_drv;
-	}
+	if (ret <= 0)
+		return ret ? : -ENODEV;
 
 	/*
 	 * Initialize PSCI idle states.
 	 */
-	ret = psci_cpu_init_idle(drv, cpu, ret);
+	ret = psci_cpu_init_idle(dev, drv, cpu, ret);
 	if (ret) {
 		pr_err("CPU %d failed to PSCI idle\n", cpu);
-		goto out_kfree_drv;
+		return ret;
 	}
 
 	ret = cpuidle_register(drv, NULL);
 	if (ret)
-		goto out_kfree_drv;
+		goto deinit;
 
-	return 0;
+	cpuidle_cooling_register(drv);
 
-out_kfree_drv:
-	kfree(drv);
+	return 0;
+deinit:
+	psci_cpu_deinit_idle(cpu);
 	return ret;
 }
 
 /*
- * psci_idle_init - Initializes PSCI cpuidle driver
+ * psci_cpuidle_probe - Initializes PSCI cpuidle driver
  *
  * Initializes PSCI cpuidle driver for all CPUs, if any CPU fails
  * to register cpuidle driver then rollback to cancel all CPUs
  * registration.
  */
-static int __init psci_idle_init(void)
+static int psci_cpuidle_probe(struct platform_device *pdev)
 {
 	int cpu, ret;
 	struct cpuidle_driver *drv;
 	struct cpuidle_device *dev;
 
 	for_each_possible_cpu(cpu) {
-		ret = psci_idle_init_cpu(cpu);
+		ret = psci_idle_init_cpu(&pdev->dev, cpu);
 		if (ret)
 			goto out_fail;
 	}
@@ -339,9 +422,34 @@ out_fail:
 		dev = per_cpu(cpuidle_devices, cpu);
 		drv = cpuidle_get_cpu_driver(dev);
 		cpuidle_unregister(drv);
-		kfree(drv);
+		psci_cpu_deinit_idle(cpu);
 	}
 
 	return ret;
 }
+
+static struct platform_driver psci_cpuidle_driver = {
+	.probe = psci_cpuidle_probe,
+	.driver = {
+		.name = "psci-cpuidle",
+	},
+};
+
+static int __init psci_idle_init(void)
+{
+	struct platform_device *pdev;
+	int ret;
+
+	ret = platform_driver_register(&psci_cpuidle_driver);
+	if (ret)
+		return ret;
+
+	pdev = platform_device_register_simple("psci-cpuidle", -1, NULL, 0);
+	if (IS_ERR(pdev)) {
+		platform_driver_unregister(&psci_cpuidle_driver);
+		return PTR_ERR(pdev);
+	}
+
+	return 0;
+}
 device_initcall(psci_idle_init);
diff --git a/drivers/cpuidle/cpuidle-psci.h b/drivers/cpuidle/cpuidle-psci.h
index 7299a04dd467..4e132640ed64 100644
--- a/drivers/cpuidle/cpuidle-psci.h
+++ b/drivers/cpuidle/cpuidle-psci.h
@@ -3,15 +3,29 @@
 #ifndef __CPUIDLE_PSCI_H
 #define __CPUIDLE_PSCI_H
 
+struct device;
 struct device_node;
 
 void psci_set_domain_state(u32 state);
-int __init psci_dt_parse_state_node(struct device_node *np, u32 *state);
+int psci_dt_parse_state_node(struct device_node *np, u32 *state);
+
+#ifdef CONFIG_ARM_PSCI_CPUIDLE_DOMAIN
+
+#include "dt_idle_genpd.h"
+
+static inline struct device *psci_dt_attach_cpu(int cpu)
+{
+	return dt_idle_attach_cpu(cpu, "psci");
+}
+
+static inline void psci_dt_detach_cpu(struct device *dev)
+{
+	dt_idle_detach_cpu(dev);
+}
 
-#ifdef CONFIG_PM_GENERIC_DOMAINS_OF
-struct device __init *psci_dt_attach_cpu(int cpu);
 #else
-static inline struct device __init *psci_dt_attach_cpu(int cpu) { return NULL; }
+static inline struct device *psci_dt_attach_cpu(int cpu) { return NULL; }
+static inline void psci_dt_detach_cpu(struct device *dev) { }
 #endif
 
 #endif /* __CPUIDLE_PSCI_H */
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 74c247972bb3..7e7ab5597d7a 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -19,9 +19,11 @@
 #include <asm/machdep.h>
 #include <asm/firmware.h>
 #include <asm/runlatch.h>
+#include <asm/idle.h>
 #include <asm/plpar_wrappers.h>
+#include <asm/rtas.h>
 
-struct cpuidle_driver pseries_idle_driver = {
+static struct cpuidle_driver pseries_idle_driver = {
 	.name             = "pseries_idle",
 	.owner            = THIS_MODULE,
 };
@@ -31,39 +33,15 @@ static struct cpuidle_state *cpuidle_state_table __read_mostly;
 static u64 snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
-static inline void idle_loop_prolog(unsigned long *in_purr)
-{
-	ppc64_runlatch_off();
-	*in_purr = mfspr(SPRN_PURR);
-	/*
-	 * Indicate to the HV that we are idle. Now would be
-	 * a good time to find other work to dispatch.
-	 */
-	get_lppaca()->idle = 1;
-}
-
-static inline void idle_loop_epilog(unsigned long in_purr)
-{
-	u64 wait_cycles;
-
-	wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles);
-	wait_cycles += mfspr(SPRN_PURR) - in_purr;
-	get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
-	get_lppaca()->idle = 0;
-
-	ppc64_runlatch_on();
-}
-
 static int snooze_loop(struct cpuidle_device *dev,
 			struct cpuidle_driver *drv,
 			int index)
 {
-	unsigned long in_purr;
 	u64 snooze_exit_time;
 
 	set_thread_flag(TIF_POLLING_NRFLAG);
 
-	idle_loop_prolog(&in_purr);
+	pseries_idle_prolog();
 	local_irq_enable();
 	snooze_exit_time = get_tb() + snooze_timeout;
 
@@ -87,7 +65,7 @@ static int snooze_loop(struct cpuidle_device *dev,
 
 	local_irq_disable();
 
-	idle_loop_epilog(in_purr);
+	pseries_idle_epilog();
 
 	return index;
 }
@@ -109,22 +87,152 @@ static void check_and_cede_processor(void)
 	}
 }
 
+/*
+ * XCEDE: Extended CEDE states discovered through the
+ *        "ibm,get-system-parameter" RTAS call with the token
+ *        CEDE_LATENCY_TOKEN
+ */
+
+/*
+ * Section 7.3.16 System Parameters Option of PAPR version 2.8.1 has a
+ * table with all the parameters to ibm,get-system-parameter.
+ * CEDE_LATENCY_TOKEN corresponds to the token value for Cede Latency
+ * Settings Information.
+ */
+#define CEDE_LATENCY_TOKEN	45
+
+/*
+ * If the platform supports the cede latency settings information system
+ * parameter it must provide the following information in the NULL terminated
+ * parameter string:
+ *
+ * a. The first byte is the length “N” of each cede latency setting record minus
+ *    one (zero indicates a length of 1 byte).
+ *
+ * b. For each supported cede latency setting a cede latency setting record
+ *    consisting of the first “N” bytes as per the following table.
+ *
+ *    -----------------------------
+ *    | Field           | Field   |
+ *    | Name            | Length  |
+ *    -----------------------------
+ *    | Cede Latency    | 1 Byte  |
+ *    | Specifier Value |         |
+ *    -----------------------------
+ *    | Maximum wakeup  |         |
+ *    | latency in      | 8 Bytes |
+ *    | tb-ticks        |         |
+ *    -----------------------------
+ *    | Responsive to   |         |
+ *    | external        | 1 Byte  |
+ *    | interrupts      |         |
+ *    -----------------------------
+ *
+ * This version has cede latency record size = 10.
+ *
+ * The structure xcede_latency_payload represents a) and b) with
+ * xcede_latency_record representing the table in b).
+ *
+ * xcede_latency_parameter is what gets returned by
+ * ibm,get-system-parameter RTAS call when made with
+ * CEDE_LATENCY_TOKEN.
+ *
+ * These structures are only used to represent the data obtained by the RTAS
+ * call. The data is in big-endian.
+ */
+struct xcede_latency_record {
+	u8	hint;
+	__be64	latency_ticks;
+	u8	wake_on_irqs;
+} __packed;
+
+// Make space for 16 records, which "should be enough".
+struct xcede_latency_payload {
+	u8     record_size;
+	struct xcede_latency_record records[16];
+} __packed;
+
+struct xcede_latency_parameter {
+	__be16  payload_size;
+	struct xcede_latency_payload payload;
+	u8 null_char;
+} __packed;
+
+static unsigned int nr_xcede_records;
+static struct xcede_latency_parameter xcede_latency_parameter __initdata;
+
+static int __init parse_cede_parameters(void)
+{
+	struct xcede_latency_payload *payload;
+	u8 xcede_record_size;
+	u16 payload_size;
+	int ret, i;
+
+	ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+			NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter),
+			sizeof(xcede_latency_parameter));
+	if (ret) {
+		pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
+		return ret;
+	}
+
+	payload_size = be16_to_cpu(xcede_latency_parameter.payload_size);
+	payload = &xcede_latency_parameter.payload;
+
+	xcede_record_size = payload->record_size + 1;
+
+	if (xcede_record_size != sizeof(struct xcede_latency_record)) {
+		pr_err("xcede: Expected record-size %lu. Observed size %u.\n",
+		       sizeof(struct xcede_latency_record), xcede_record_size);
+		return -EINVAL;
+	}
+
+	pr_info("xcede: xcede_record_size = %d\n", xcede_record_size);
+
+	/*
+	 * payload_size includes the record-size byte and the last NULL byte;
+	 * the remaining bytes are the array of cede_latency settings. The size
+	 * is firmware provided, so guard the subtraction against underflow and
+	 * never trust it for more records than the buffer has room for.
+	 */
+	nr_xcede_records = payload_size > 2 ? (payload_size - 2) / xcede_record_size : 0;
+	if (nr_xcede_records > ARRAY_SIZE(payload->records))
+		nr_xcede_records = ARRAY_SIZE(payload->records);
+
+	for (i = 0; i < nr_xcede_records; i++) {
+		struct xcede_latency_record *record = &payload->records[i];
+		u64 latency_ticks = be64_to_cpu(record->latency_ticks);
+		u8 wake_on_irqs = record->wake_on_irqs;
+		u8 hint = record->hint;
+
+		pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n",
+			i, hint, latency_ticks, wake_on_irqs);
+	}
+
+	return 0;
+}
+
+#define NR_DEDICATED_STATES	2 /* snooze, CEDE */
+static u8 cede_latency_hint[NR_DEDICATED_STATES];
+
 static int dedicated_cede_loop(struct cpuidle_device *dev,
 				struct cpuidle_driver *drv,
 				int index)
 {
-	unsigned long in_purr;
+	u8 old_latency_hint;
 
-	idle_loop_prolog(&in_purr);
+	pseries_idle_prolog();
 	get_lppaca()->donate_dedicated_cpu = 1;
+	old_latency_hint = get_lppaca()->cede_latency_hint;
+	get_lppaca()->cede_latency_hint = cede_latency_hint[index];
 
 	HMT_medium();
 	check_and_cede_processor();
 
 	local_irq_disable();
 	get_lppaca()->donate_dedicated_cpu = 0;
+	get_lppaca()->cede_latency_hint = old_latency_hint;
 
-	idle_loop_epilog(in_purr);
+	pseries_idle_epilog();
 
 	return index;
 }
@@ -133,9 +241,8 @@ static int shared_cede_loop(struct cpuidle_device *dev,
 			struct cpuidle_driver *drv,
 			int index)
 {
-	unsigned long in_purr;
 
-	idle_loop_prolog(&in_purr);
+	pseries_idle_prolog();
 
 	/*
 	 * Yield the processor to the hypervisor.  We return if
@@ -147,7 +254,7 @@ static int shared_cede_loop(struct cpuidle_device *dev,
 	check_and_cede_processor();
 
 	local_irq_disable();
-	idle_loop_epilog(in_purr);
+	pseries_idle_epilog();
 
 	return index;
 }
@@ -155,7 +262,7 @@ static int shared_cede_loop(struct cpuidle_device *dev,
 /*
  * States for dedicated partition case.
  */
-static struct cpuidle_state dedicated_states[] = {
+static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
 	{ /* Snooze */
 		.name = "snooze",
 		.desc = "snooze",
@@ -236,11 +343,67 @@ static int pseries_cpuidle_driver_init(void)
 	return 0;
 }
 
+static void __init fixup_cede0_latency(void)
+{
+	struct xcede_latency_payload *payload;
+	u64 min_xcede_latency_us = UINT_MAX;
+	int i;
+
+	if (parse_cede_parameters())
+		return;
+
+	pr_info("cpuidle: Skipping the %d Extended CEDE idle states\n",
+		nr_xcede_records);
+
+	payload = &xcede_latency_parameter.payload;
+
+	/*
+	 * The CEDE idle state maps to CEDE(0). While the hypervisor
+	 * does not advertise CEDE(0) exit latency values, it does
+	 * advertise the latency values of the extended CEDE states.
+	 * We use the lowest advertised exit latency value as a proxy
+	 * for the exit latency of CEDE(0).
+	 */
+	for (i = 0; i < nr_xcede_records; i++) {
+		struct xcede_latency_record *record = &payload->records[i];
+		u8 hint = record->hint;
+		u64 latency_tb = be64_to_cpu(record->latency_ticks);
+		u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);
+
+		/*
+		 * We expect the exit latency of an extended CEDE
+		 * state to be non-zero, since it takes at least
+		 * a few nanoseconds to wakeup the idle CPU and
+		 * dispatch the virtual processor into the Linux
+		 * Guest.
+		 *
+		 * So we consider only non-zero value for performing
+		 * the fixup of CEDE(0) latency.
+		 */
+		if (latency_us == 0) {
+			pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n",
+				i, hint);
+			continue;
+		}
+
+		if (latency_us < min_xcede_latency_us)
+			min_xcede_latency_us = latency_us;
+	}
+
+	if (min_xcede_latency_us != UINT_MAX) {
+		dedicated_states[1].exit_latency = min_xcede_latency_us;
+		dedicated_states[1].target_residency = 10 * (min_xcede_latency_us);
+		pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
+			min_xcede_latency_us);
+	}
+
+}
+
 /*
  * pseries_idle_probe()
  * Choose state table for shared versus dedicated partition
  */
-static int pseries_idle_probe(void)
+static int __init pseries_idle_probe(void)
 {
 
 	if (cpuidle_disable != IDLE_NO_OVERRIDE)
@@ -257,8 +420,23 @@ static int pseries_idle_probe(void)
 			cpuidle_state_table = shared_states;
 			max_idle_state = ARRAY_SIZE(shared_states);
 		} else {
+			/*
+			 * Use firmware provided latency values
+			 * starting with POWER10 platforms. In the
+			 * case that we are running on a POWER10
+			 * platform but in an earlier compat mode, we
+			 * can still use the firmware provided values.
+			 *
+			 * However, on platforms prior to POWER10, we
+			 * cannot rely on the accuracy of the firmware
+			 * provided latency values. On such platforms,
+			 * go with the conservative default estimate
+			 * of 10us.
+			 */
+			if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
+				fixup_cede0_latency();
 			cpuidle_state_table = dedicated_states;
-			max_idle_state = ARRAY_SIZE(dedicated_states);
+			max_idle_state = NR_DEDICATED_STATES;
 		}
 	} else
 		return -ENODEV;
diff --git a/drivers/cpuidle/cpuidle-qcom-spm.c b/drivers/cpuidle/cpuidle-qcom-spm.c
new file mode 100644
index 000000000000..beedf22cbe78
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-qcom-spm.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2011-2014, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2014,2015, Linaro Ltd.
+ *
+ * SAW power controller driver
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/err.h>
+#include <linux/platform_device.h>
+#include <linux/cpuidle.h>
+#include <linux/cpu_pm.h>
+#include <linux/qcom_scm.h>
+#include <soc/qcom/spm.h>
+
+#include <asm/proc-fns.h>
+#include <asm/suspend.h>
+
+#include "dt_idle_states.h"
+
+struct cpuidle_qcom_spm_data {
+	struct cpuidle_driver cpuidle_driver;
+	struct spm_driver_data *spm;
+};
+
+static int qcom_pm_collapse(unsigned long int unused)
+{
+	qcom_scm_cpu_power_down(QCOM_SCM_CPU_PWR_DOWN_L2_ON);
+
+	/*
+	 * Returns here only if there was a pending interrupt and we did not
+	 * power down as a result.
+	 */
+	return -1;
+}
+
+static int qcom_cpu_spc(struct spm_driver_data *drv)
+{
+	int ret;
+
+	spm_set_low_power_mode(drv, PM_SLEEP_MODE_SPC);
+	ret = cpu_suspend(0, qcom_pm_collapse);
+	/*
+	 * ARM common code executes WFI without calling into our driver and
+	 * if the SPM mode is not reset, then we may accidentally power down the
+	 * cpu when we intended only to gate the cpu clock.
+	 * Ensure the state is set to standby before returning.
+	 */
+	spm_set_low_power_mode(drv, PM_SLEEP_MODE_STBY);
+
+	return ret;
+}
+
+static int spm_enter_idle_state(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv, int idx)
+{
+	struct cpuidle_qcom_spm_data *data = container_of(drv, struct cpuidle_qcom_spm_data,
+							  cpuidle_driver);
+
+	return CPU_PM_CPU_IDLE_ENTER_PARAM(qcom_cpu_spc, idx, data->spm);
+}
+
+static struct cpuidle_driver qcom_spm_idle_driver = {
+	.name = "qcom_spm",
+	.owner = THIS_MODULE,
+	.states[0] = {
+		.enter			= spm_enter_idle_state,
+		.exit_latency		= 1,
+		.target_residency	= 1,
+		.power_usage		= UINT_MAX,
+		.name			= "WFI",
+		.desc			= "ARM WFI",
+	}
+};
+
+static const struct of_device_id qcom_idle_state_match[] = {
+	{ .compatible = "qcom,idle-state-spc", .data = spm_enter_idle_state },
+	{ },
+};
+
+static int spm_cpuidle_register(struct device *cpuidle_dev, int cpu)
+{
+	struct platform_device *pdev = NULL;
+	struct device_node *cpu_node, *saw_node;
+	struct cpuidle_qcom_spm_data *data = NULL;
+	int ret;
+
+	cpu_node = of_cpu_device_node_get(cpu);
+	if (!cpu_node)
+		return -ENODEV;
+
+	saw_node = of_parse_phandle(cpu_node, "qcom,saw", 0);
+	of_node_put(cpu_node);	/* only needed to resolve the SAW phandle */
+	if (!saw_node)
+		return -ENODEV;
+
+	pdev = of_find_device_by_node(saw_node);
+	of_node_put(saw_node);
+	if (!pdev)
+		return -ENODEV;
+
+	data = devm_kzalloc(cpuidle_dev, sizeof(*data), GFP_KERNEL);
+	if (data)
+		data->spm = dev_get_drvdata(&pdev->dev);
+	put_device(&pdev->dev);	/* drop the of_find_device_by_node() reference */
+	if (!data)
+		return -ENOMEM;
+	if (!data->spm)
+		return -EINVAL;
+
+	data->cpuidle_driver = qcom_spm_idle_driver;
+	data->cpuidle_driver.cpumask = (struct cpumask *)cpumask_of(cpu);
+
+	ret = dt_init_idle_driver(&data->cpuidle_driver, qcom_idle_state_match, 1);
+	if (ret <= 0)
+		return ret ? : -ENODEV;
+
+	return cpuidle_register(&data->cpuidle_driver, NULL);
+}
+
+static int spm_cpuidle_drv_probe(struct platform_device *pdev)
+{
+	int cpu, ret;
+
+	if (!qcom_scm_is_available())
+		return -EPROBE_DEFER;
+
+	ret = qcom_scm_set_warm_boot_addr(cpu_resume_arm);
+	if (ret)
+		return dev_err_probe(&pdev->dev, ret, "set warm boot addr failed\n");
+
+	for_each_possible_cpu(cpu) {
+		ret = spm_cpuidle_register(&pdev->dev, cpu);
+		if (ret && ret != -ENODEV) {
+			dev_err(&pdev->dev,
+				"Cannot register for CPU%d: %d\n", cpu, ret);
+		}
+	}
+
+	return 0;
+}
+
+static struct platform_driver spm_cpuidle_driver = {
+	.probe = spm_cpuidle_drv_probe,
+	.driver = {
+		.name = "qcom-spm-cpuidle",
+		.suppress_bind_attrs = true,
+	},
+};
+
+static bool __init qcom_spm_find_any_cpu(void)
+{
+	struct device_node *cpu_node, *saw_node;
+
+	for_each_of_cpu_node(cpu_node) {
+		saw_node = of_parse_phandle(cpu_node, "qcom,saw", 0);
+		if (of_device_is_available(saw_node)) {
+			of_node_put(saw_node);
+			of_node_put(cpu_node);
+			return true;
+		}
+		of_node_put(saw_node);
+	}
+	return false;
+}
+
+static int __init qcom_spm_cpuidle_init(void)
+{
+	struct platform_device *pdev;
+	int ret;
+
+	ret = platform_driver_register(&spm_cpuidle_driver);
+	if (ret)
+		return ret;
+
+	/* Make sure there is actually any CPU managed by the SPM */
+	if (!qcom_spm_find_any_cpu())
+		return 0;
+
+	pdev = platform_device_register_simple("qcom-spm-cpuidle",
+					       -1, NULL, 0);
+	if (IS_ERR(pdev)) {
+		platform_driver_unregister(&spm_cpuidle_driver);
+		return PTR_ERR(pdev);
+	}
+
+	return 0;
+}
+device_initcall(qcom_spm_cpuidle_init);
diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c
new file mode 100644
index 000000000000..05fe2902df9a
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-riscv-sbi.c
@@ -0,0 +1,633 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RISC-V SBI CPU idle driver.
+ *
+ * Copyright (c) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "cpuidle-riscv-sbi: " fmt
+
+#include <linux/cpuidle.h>
+#include <linux/cpumask.h>
+#include <linux/cpu_pm.h>
+#include <linux/cpu_cooling.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/slab.h>
+#include <linux/platform_device.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
+#include <asm/cpuidle.h>
+#include <asm/sbi.h>
+#include <asm/smp.h>
+#include <asm/suspend.h>
+
+#include "dt_idle_states.h"
+#include "dt_idle_genpd.h"
+
+struct sbi_cpuidle_data {
+	u32 *states;
+	struct device *dev;
+};
+
+struct sbi_domain_state {
+	bool available;
+	u32 state;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct sbi_cpuidle_data, sbi_cpuidle_data);
+static DEFINE_PER_CPU(struct sbi_domain_state, domain_state);
+static bool sbi_cpuidle_use_osi;
+static bool sbi_cpuidle_use_cpuhp;
+static bool sbi_cpuidle_pd_allow_domain_state;
+
+static inline void sbi_set_domain_state(u32 state)
+{
+	struct sbi_domain_state *data = this_cpu_ptr(&domain_state);
+
+	data->available = true;
+	data->state = state;
+}
+
+static inline u32 sbi_get_domain_state(void)
+{
+	struct sbi_domain_state *data = this_cpu_ptr(&domain_state);
+
+	return data->state;
+}
+
+static inline void sbi_clear_domain_state(void)
+{
+	struct sbi_domain_state *data = this_cpu_ptr(&domain_state);
+
+	data->available = false;
+}
+
+static inline bool sbi_is_domain_state_available(void)
+{
+	struct sbi_domain_state *data = this_cpu_ptr(&domain_state);
+
+	return data->available;
+}
+
+static int sbi_suspend_finisher(unsigned long suspend_type,
+				unsigned long resume_addr,
+				unsigned long opaque)
+{
+	struct sbiret ret;
+
+	ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND,
+			suspend_type, resume_addr, opaque, 0, 0, 0);
+
+	return (ret.error) ? sbi_err_map_linux_errno(ret.error) : 0;
+}
+
+static int sbi_suspend(u32 state)
+{
+	if (state & SBI_HSM_SUSP_NON_RET_BIT)
+		return cpu_suspend(state, sbi_suspend_finisher);
+	else
+		return sbi_suspend_finisher(state, 0, 0);
+}
+
+static int sbi_cpuidle_enter_state(struct cpuidle_device *dev,
+				   struct cpuidle_driver *drv, int idx)
+{
+	u32 *states = __this_cpu_read(sbi_cpuidle_data.states);
+	u32 state = states[idx];
+
+	if (state & SBI_HSM_SUSP_NON_RET_BIT)
+		return CPU_PM_CPU_IDLE_ENTER_PARAM(sbi_suspend, idx, state);
+	else
+		return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(sbi_suspend,
+							     idx, state);
+}
+
+static int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
+					  struct cpuidle_driver *drv, int idx,
+					  bool s2idle)
+{
+	struct sbi_cpuidle_data *data = this_cpu_ptr(&sbi_cpuidle_data);
+	u32 *states = data->states;
+	struct device *pd_dev = data->dev;
+	u32 state;
+	int ret;
+
+	ret = cpu_pm_enter();
+	if (ret)
+		return -1;
+
+	/* Do runtime PM to manage a hierarchical CPU topology. */
+	ct_irq_enter_irqson();
+	if (s2idle)
+		dev_pm_genpd_suspend(pd_dev);
+	else
+		pm_runtime_put_sync_suspend(pd_dev);
+	ct_irq_exit_irqson();
+
+	if (sbi_is_domain_state_available())
+		state = sbi_get_domain_state();
+	else
+		state = states[idx];
+
+	ret = sbi_suspend(state) ? -1 : idx;
+
+	ct_irq_enter_irqson();
+	if (s2idle)
+		dev_pm_genpd_resume(pd_dev);
+	else
+		pm_runtime_get_sync(pd_dev);
+	ct_irq_exit_irqson();
+
+	cpu_pm_exit();
+
+	/* Clear the domain state to start fresh when back from idle. */
+	sbi_clear_domain_state();
+	return ret;
+}
+
+static int sbi_enter_domain_idle_state(struct cpuidle_device *dev,
+				       struct cpuidle_driver *drv, int idx)
+{
+	return __sbi_enter_domain_idle_state(dev, drv, idx, false);
+}
+
+static int sbi_enter_s2idle_domain_idle_state(struct cpuidle_device *dev,
+					      struct cpuidle_driver *drv,
+					      int idx)
+{
+	return __sbi_enter_domain_idle_state(dev, drv, idx, true);
+}
+
+static int sbi_cpuidle_cpuhp_up(unsigned int cpu)
+{
+	struct device *pd_dev = __this_cpu_read(sbi_cpuidle_data.dev);
+
+	if (pd_dev)
+		pm_runtime_get_sync(pd_dev);
+
+	return 0;
+}
+
+static int sbi_cpuidle_cpuhp_down(unsigned int cpu)
+{
+	struct device *pd_dev = __this_cpu_read(sbi_cpuidle_data.dev);
+
+	if (pd_dev) {
+		pm_runtime_put_sync(pd_dev);
+		/* Clear domain state to start fresh at next online. */
+		sbi_clear_domain_state();
+	}
+
+	return 0;
+}
+
+static void sbi_idle_init_cpuhp(void)
+{
+	int err;
+
+	if (!sbi_cpuidle_use_cpuhp)
+		return;
+
+	err = cpuhp_setup_state_nocalls(CPUHP_AP_CPU_PM_STARTING,
+					"cpuidle/sbi:online",
+					sbi_cpuidle_cpuhp_up,
+					sbi_cpuidle_cpuhp_down);
+	if (err)
+		pr_warn("Failed %d while setup cpuhp state\n", err);
+}
+
+static const struct of_device_id sbi_cpuidle_state_match[] = {
+	{ .compatible = "riscv,idle-state",
+	  .data = sbi_cpuidle_enter_state },
+	{ },
+};
+
+static bool sbi_suspend_state_is_valid(u32 state)
+{
+	if (state > SBI_HSM_SUSPEND_RET_DEFAULT &&
+	    state < SBI_HSM_SUSPEND_RET_PLATFORM)
+		return false;
+	if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT &&
+	    state < SBI_HSM_SUSPEND_NON_RET_PLATFORM)
+		return false;
+	return true;
+}
+
+static int sbi_dt_parse_state_node(struct device_node *np, u32 *state)
+{
+	int err = of_property_read_u32(np, "riscv,sbi-suspend-param", state);
+
+	if (err) {
+		pr_warn("%pOF missing riscv,sbi-suspend-param property\n", np);
+		return err;
+	}
+
+	if (!sbi_suspend_state_is_valid(*state)) {
+		pr_warn("Invalid SBI suspend state %#x\n", *state);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int sbi_dt_cpu_init_topology(struct cpuidle_driver *drv,
+				     struct sbi_cpuidle_data *data,
+				     unsigned int state_count, int cpu)
+{
+	/* Currently limit the hierarchical topology to be used in OSI mode. */
+	if (!sbi_cpuidle_use_osi)
+		return 0;
+
+	data->dev = dt_idle_attach_cpu(cpu, "sbi");
+	if (IS_ERR_OR_NULL(data->dev))
+		return PTR_ERR_OR_ZERO(data->dev);
+
+	/*
+	 * Using the deepest state for the CPU to trigger a potential selection
+	 * of a shared state for the domain, assumes the domain states are all
+	 * deeper states.
+	 */
+	drv->states[state_count - 1].enter = sbi_enter_domain_idle_state;
+	drv->states[state_count - 1].enter_s2idle =
+					sbi_enter_s2idle_domain_idle_state;
+	sbi_cpuidle_use_cpuhp = true;
+
+	return 0;
+}
+
+static int sbi_cpuidle_dt_init_states(struct device *dev,
+					struct cpuidle_driver *drv,
+					unsigned int cpu,
+					unsigned int state_count)
+{
+	struct sbi_cpuidle_data *data = per_cpu_ptr(&sbi_cpuidle_data, cpu);
+	struct device_node *state_node;
+	struct device_node *cpu_node;
+	u32 *states;
+	int i, ret;
+
+	cpu_node = of_cpu_device_node_get(cpu);
+	if (!cpu_node)
+		return -ENODEV;
+
+	states = devm_kcalloc(dev, state_count, sizeof(*states), GFP_KERNEL);
+	if (!states) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	/* Parse SBI specific details from state DT nodes */
+	for (i = 1; i < state_count; i++) {
+		state_node = of_get_cpu_state_node(cpu_node, i - 1);
+		if (!state_node)
+			break;
+
+		ret = sbi_dt_parse_state_node(state_node, &states[i]);
+		of_node_put(state_node);
+
+		if (ret)
+			goto fail;	/* cpu_node must still be released */
+
+		pr_debug("sbi-state %#x index %d\n", states[i], i);
+	}
+	if (i != state_count) {
+		ret = -ENODEV;
+		goto fail;
+	}
+
+	/* Initialize optional data, used for the hierarchical topology. */
+	ret = sbi_dt_cpu_init_topology(drv, data, state_count, cpu);
+	if (ret < 0)
+		goto fail;	/* cpu_node must still be released */
+
+	/* Store states in the per-cpu struct. */
+	data->states = states;
+
+fail:
+	of_node_put(cpu_node);
+
+	return ret;
+}
+
+static void sbi_cpuidle_deinit_cpu(int cpu)
+{
+	struct sbi_cpuidle_data *data = per_cpu_ptr(&sbi_cpuidle_data, cpu);
+
+	dt_idle_detach_cpu(data->dev);
+	sbi_cpuidle_use_cpuhp = false;
+}
+
+static int sbi_cpuidle_init_cpu(struct device *dev, int cpu)
+{
+	struct cpuidle_driver *drv;
+	unsigned int state_count = 0;
+	int ret = 0;
+
+	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
+	if (!drv)
+		return -ENOMEM;
+
+	drv->name = "sbi_cpuidle";
+	drv->owner = THIS_MODULE;
+	drv->cpumask = (struct cpumask *)cpumask_of(cpu);
+
+	/* RISC-V architectural WFI to be represented as state index 0. */
+	drv->states[0].enter = sbi_cpuidle_enter_state;
+	drv->states[0].exit_latency = 1;
+	drv->states[0].target_residency = 1;
+	drv->states[0].power_usage = UINT_MAX;
+	strcpy(drv->states[0].name, "WFI");
+	strcpy(drv->states[0].desc, "RISC-V WFI");
+
+	/*
+	 * If no DT idle states are detected (ret == 0) let the driver
+	 * initialization fail accordingly since there is no reason to
+	 * initialize the idle driver if only wfi is supported, the
+	 * default architectural back-end already executes wfi
+	 * on idle entry.
+	 */
+	ret = dt_init_idle_driver(drv, sbi_cpuidle_state_match, 1);
+	if (ret <= 0) {
+		pr_debug("HART%ld: failed to parse DT idle states\n",
+			 cpuid_to_hartid_map(cpu));
+		return ret ? : -ENODEV;
+	}
+	state_count = ret + 1; /* Include WFI state as well */
+
+	/* Initialize idle states from DT. */
+	ret = sbi_cpuidle_dt_init_states(dev, drv, cpu, state_count);
+	if (ret) {
+		pr_err("HART%ld: failed to init idle states\n",
+		       cpuid_to_hartid_map(cpu));
+		return ret;
+	}
+
+	ret = cpuidle_register(drv, NULL);
+	if (ret)
+		goto deinit;
+
+	cpuidle_cooling_register(drv);
+
+	return 0;
+deinit:
+	sbi_cpuidle_deinit_cpu(cpu);
+	return ret;
+}
+
+static void sbi_cpuidle_domain_sync_state(struct device *dev)
+{
+	/*
+	 * All devices have now been attached/probed to the PM domain
+	 * topology, hence it's fine to allow domain states to be picked.
+	 */
+	sbi_cpuidle_pd_allow_domain_state = true;
+}
+
+#ifdef CONFIG_DT_IDLE_GENPD
+
+static int sbi_cpuidle_pd_power_off(struct generic_pm_domain *pd)
+{
+	struct genpd_power_state *state = &pd->states[pd->state_idx];
+	u32 *pd_state;
+
+	if (!state->data)
+		return 0;
+
+	if (!sbi_cpuidle_pd_allow_domain_state)
+		return -EBUSY;
+
+	/* OSI mode is enabled, set the corresponding domain state. */
+	pd_state = state->data;
+	sbi_set_domain_state(*pd_state);
+
+	return 0;
+}
+
+struct sbi_pd_provider {
+	struct list_head link;
+	struct device_node *node;
+};
+
+static LIST_HEAD(sbi_pd_providers);
+
+static int sbi_pd_init(struct device_node *np)
+{
+	struct generic_pm_domain *pd;
+	struct sbi_pd_provider *pd_provider;
+	struct dev_power_governor *pd_gov;
+	int ret = -ENOMEM;
+
+	pd = dt_idle_pd_alloc(np, sbi_dt_parse_state_node);
+	if (!pd)
+		goto out;
+
+	pd_provider = kzalloc(sizeof(*pd_provider), GFP_KERNEL);
+	if (!pd_provider)
+		goto free_pd;
+
+	pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN;
+
+	/* Allow power off when OSI is available. */
+	if (sbi_cpuidle_use_osi)
+		pd->power_off = sbi_cpuidle_pd_power_off;
+	else
+		pd->flags |= GENPD_FLAG_ALWAYS_ON;
+
+	/* Use governor for CPU PM domains if it has some states to manage. */
+	pd_gov = pd->states ? &pm_domain_cpu_gov : NULL;
+
+	ret = pm_genpd_init(pd, pd_gov, false);
+	if (ret)
+		goto free_pd_prov;
+
+	ret = of_genpd_add_provider_simple(np, pd);
+	if (ret)
+		goto remove_pd;
+
+	pd_provider->node = of_node_get(np);
+	list_add(&pd_provider->link, &sbi_pd_providers);
+
+	pr_debug("init PM domain %s\n", pd->name);
+	return 0;
+
+remove_pd:
+	pm_genpd_remove(pd);
+free_pd_prov:
+	kfree(pd_provider);
+free_pd:
+	dt_idle_pd_free(pd);
+out:
+	pr_err("failed to init PM domain ret=%d %pOF\n", ret, np);
+	return ret;
+}
+
+static void sbi_pd_remove(void)
+{
+	struct sbi_pd_provider *pd_provider, *it;
+	struct generic_pm_domain *genpd;
+
+	list_for_each_entry_safe(pd_provider, it, &sbi_pd_providers, link) {
+		of_genpd_del_provider(pd_provider->node);
+
+		genpd = of_genpd_remove_last(pd_provider->node);
+		if (!IS_ERR(genpd))
+			kfree(genpd);
+
+		of_node_put(pd_provider->node);
+		list_del(&pd_provider->link);
+		kfree(pd_provider);
+	}
+}
+
+static int sbi_genpd_probe(struct device_node *np)
+{
+	struct device_node *node;
+	int ret = 0, pd_count = 0;
+
+	if (!np)
+		return -ENODEV;
+
+	/*
+	 * Parse child nodes for the "#power-domain-cells" property and
+	 * initialize a genpd/genpd-of-provider pair when it's found.
+	 */
+	for_each_child_of_node(np, node) {
+		if (!of_find_property(node, "#power-domain-cells", NULL))
+			continue;
+
+		ret = sbi_pd_init(node);
+		if (ret)
+			goto put_node;
+
+		pd_count++;
+	}
+
+	/* Bail out if not using the hierarchical CPU topology. */
+	if (!pd_count)
+		goto no_pd;
+
+	/* Link genpd masters/subdomains to model the CPU topology. */
+	ret = dt_idle_pd_init_topology(np);
+	if (ret)
+		goto remove_pd;
+
+	return 0;
+
+put_node:
+	of_node_put(node);
+remove_pd:
+	sbi_pd_remove();
+	pr_err("failed to create CPU PM domains ret=%d\n", ret);
+no_pd:
+	return ret;
+}
+
+#else
+
+static inline int sbi_genpd_probe(struct device_node *np)
+{
+	return 0;
+}
+
+#endif
+
+static int sbi_cpuidle_probe(struct platform_device *pdev)
+{
+	int cpu, ret;
+	struct cpuidle_driver *drv;
+	struct cpuidle_device *dev;
+	struct device_node *np, *pds_node;
+
+	/* Detect OSI support based on CPU DT nodes */
+	sbi_cpuidle_use_osi = true;
+	for_each_possible_cpu(cpu) {
+		np = of_cpu_device_node_get(cpu);
+		if (!np ||
+		    !of_find_property(np, "power-domains", NULL) ||
+		    !of_find_property(np, "power-domain-names", NULL))
+			sbi_cpuidle_use_osi = false;
+		/* Drop the reference taken by of_cpu_device_node_get(). */
+		of_node_put(np);
+		if (!sbi_cpuidle_use_osi)
+			break;
+	}
+
+	/* Populate generic power domains from DT nodes */
+	pds_node = of_find_node_by_path("/cpus/power-domains");
+	if (pds_node) {
+		ret = sbi_genpd_probe(pds_node);
+		of_node_put(pds_node);
+		if (ret)
+			return ret;
+	}
+
+	/* Initialize CPU idle driver for each CPU */
+	for_each_possible_cpu(cpu) {
+		ret = sbi_cpuidle_init_cpu(&pdev->dev, cpu);
+		if (ret) {
+			pr_debug("HART%ld: idle driver init failed\n",
+				 cpuid_to_hartid_map(cpu));
+			goto out_fail;
+		}
+	}
+
+	/* Setup CPU hotplug notifiers */
+	sbi_idle_init_cpuhp();
+
+	pr_info("idle driver registered for all CPUs\n");
+
+	return 0;
+
+out_fail:
+	while (--cpu >= 0) {
+		dev = per_cpu(cpuidle_devices, cpu);
+		drv = cpuidle_get_cpu_driver(dev);
+		cpuidle_unregister(drv);
+		sbi_cpuidle_deinit_cpu(cpu);
+	}
+
+	return ret;
+}
+
+static struct platform_driver sbi_cpuidle_driver = {
+	.probe = sbi_cpuidle_probe,
+	.driver = {
+		.name = "sbi-cpuidle",
+		.sync_state = sbi_cpuidle_domain_sync_state,
+	},
+};
+
+static int __init sbi_cpuidle_init(void)
+{
+	int ret;
+	struct platform_device *pdev;
+
+	/*
+	 * The SBI HSM suspend function is only available when:
+	 * 1) SBI version is 0.3 or higher
+	 * 2) SBI HSM extension is available
+	 */
+	if ((sbi_spec_version < sbi_mk_version(0, 3)) ||
+	    sbi_probe_extension(SBI_EXT_HSM) <= 0) {
+		pr_info("HSM suspend not available\n");
+		return 0;
+	}
+
+	ret = platform_driver_register(&sbi_cpuidle_driver);
+	if (ret)
+		return ret;
+
+	pdev = platform_device_register_simple("sbi-cpuidle",
+						-1, NULL, 0);
+	if (IS_ERR(pdev)) {
+		platform_driver_unregister(&sbi_cpuidle_driver);
+		return PTR_ERR(pdev);
+	}
+
+	return 0;
+}
+device_initcall(sbi_cpuidle_init);
diff --git a/drivers/cpuidle/cpuidle-tegra.c b/drivers/cpuidle/cpuidle-tegra.c
new file mode 100644
index 000000000000..9845629aeb6d
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-tegra.c
@@ -0,0 +1,391 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU idle driver for Tegra CPUs
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ * Copyright (c) 2011 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *         Gary King <gking@nvidia.com>
+ *
+ * Rework for 3.3 by Peter De Schrijver <pdeschrijver@nvidia.com>
+ *
+ * Tegra20/124 driver unification by Dmitry Osipenko <digetx@gmail.com>
+ */
+
+#define pr_fmt(fmt)	"tegra-cpuidle: " fmt
+
+#include <linux/atomic.h>
+#include <linux/cpuidle.h>
+#include <linux/cpumask.h>
+#include <linux/cpu_pm.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/platform_device.h>
+#include <linux/types.h>
+
+#include <linux/clk/tegra.h>
+#include <linux/firmware/trusted_foundations.h>
+
+#include <soc/tegra/cpuidle.h>
+#include <soc/tegra/flowctrl.h>
+#include <soc/tegra/fuse.h>
+#include <soc/tegra/irq.h>
+#include <soc/tegra/pm.h>
+#include <soc/tegra/pmc.h>
+
+#include <asm/cpuidle.h>
+#include <asm/firmware.h>
+#include <asm/smp_plat.h>
+#include <asm/suspend.h>
+
+enum tegra_state {
+	TEGRA_C1,
+	TEGRA_C7,
+	TEGRA_CC6,
+	TEGRA_STATE_COUNT,
+};
+
+static atomic_t tegra_idle_barrier;
+static atomic_t tegra_abort_flag;
+
+static void tegra_cpuidle_report_cpus_state(void)
+{
+	unsigned long cpu, lcpu, csr;
+
+	for_each_cpu(lcpu, cpu_possible_mask) {
+		cpu = cpu_logical_map(lcpu);
+		csr = flowctrl_read_cpu_csr(cpu);
+
+		pr_err("cpu%lu: online=%d flowctrl_csr=0x%08lx\n",
+		       cpu, cpu_online(lcpu), csr);
+	}
+}
+
+static int tegra_cpuidle_wait_for_secondary_cpus_parking(void)
+{
+	unsigned int retries = 3;
+
+	while (retries--) {
+		unsigned int delay_us = 10;
+		unsigned int timeout_us = 500 * 1000 / delay_us;
+
+		/*
+		 * The primary CPU0 core shall wait for the secondaries
+		 * shutdown in order to power-off CPU's cluster safely.
+		 * The timeout value depends on the current CPU frequency,
+		 * it takes about 40-150us in average and over 1000us in
+		 * a worst case scenario.
+		 */
+		do {
+			if (tegra_cpu_rail_off_ready())
+				return 0;
+
+			udelay(delay_us);
+
+		} while (timeout_us--);
+
+		pr_err("secondary CPU taking too long to park\n");
+
+		tegra_cpuidle_report_cpus_state();
+	}
+
+	pr_err("timed out waiting secondaries to park\n");
+
+	return -ETIMEDOUT;
+}
+
+static void tegra_cpuidle_unpark_secondary_cpus(void)
+{
+	unsigned int cpu, lcpu;
+
+	for_each_cpu(lcpu, cpu_online_mask) {
+		cpu = cpu_logical_map(lcpu);
+
+		if (cpu > 0) {
+			tegra_enable_cpu_clock(cpu);
+			tegra_cpu_out_of_reset(cpu);
+			flowctrl_write_cpu_halt(cpu, 0);
+		}
+	}
+}
+
+static int tegra_cpuidle_cc6_enter(unsigned int cpu)
+{
+	int ret;
+
+	if (cpu > 0) {
+		ret = cpu_suspend(cpu, tegra_pm_park_secondary_cpu);
+	} else {
+		ret = tegra_cpuidle_wait_for_secondary_cpus_parking();
+		if (!ret)
+			ret = tegra_pm_enter_lp2();
+
+		tegra_cpuidle_unpark_secondary_cpus();
+	}
+
+	return ret;
+}
+
+static int tegra_cpuidle_c7_enter(void)
+{
+	int err;
+
+	err = call_firmware_op(prepare_idle, TF_PM_MODE_LP2_NOFLUSH_L2);
+	if (err && err != -ENOSYS)
+		return err;
+
+	return cpu_suspend(0, tegra30_pm_secondary_cpu_suspend);
+}
+
+static int tegra_cpuidle_coupled_barrier(struct cpuidle_device *dev)
+{
+	if (tegra_pending_sgi()) {
+		/*
+		 * CPU got local interrupt that will be lost after GIC's
+		 * shutdown because GIC driver doesn't save/restore the
+		 * pending SGI state across CPU cluster PM.  Abort and retry
+		 * next time.
+		 */
+		atomic_set(&tegra_abort_flag, 1);
+	}
+
+	cpuidle_coupled_parallel_barrier(dev, &tegra_idle_barrier);
+
+	if (atomic_read(&tegra_abort_flag)) {
+		cpuidle_coupled_parallel_barrier(dev, &tegra_idle_barrier);
+		atomic_set(&tegra_abort_flag, 0);
+		return -EINTR;
+	}
+
+	return 0;
+}
+
+static int tegra_cpuidle_state_enter(struct cpuidle_device *dev,
+				     int index, unsigned int cpu)
+{
+	int err;
+
+	/*
+	 * CC6 state is the "CPU cluster power-off" state.  In order to
+	 * enter this state, at first the secondary CPU cores need to be
+	 * parked into offline mode, then the last CPU should clean out
+	 * remaining dirty cache lines into DRAM and trigger Flow Controller
+	 * logic that turns off the cluster's power domain (which includes
+	 * CPU cores, GIC and L2 cache).
+	 */
+	if (index == TEGRA_CC6) {
+		err = tegra_cpuidle_coupled_barrier(dev);
+		if (err)
+			return err;
+	}
+
+	local_fiq_disable();
+	RCU_NONIDLE(tegra_pm_set_cpu_in_lp2());
+	cpu_pm_enter();
+
+	switch (index) {
+	case TEGRA_C7:
+		err = tegra_cpuidle_c7_enter();
+		break;
+
+	case TEGRA_CC6:
+		err = tegra_cpuidle_cc6_enter(cpu);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	cpu_pm_exit();
+	RCU_NONIDLE(tegra_pm_clear_cpu_in_lp2());
+	local_fiq_enable();
+
+	return err ?: index;
+}
+
+static int tegra_cpuidle_adjust_state_index(int index, unsigned int cpu)
+{
+	/*
+	 * On Tegra30 CPU0 can't be power-gated separately from secondary
+	 * cores because it gates the whole CPU cluster.
+	 */
+	if (cpu > 0 || index != TEGRA_C7 || tegra_get_chip_id() != TEGRA30)
+		return index;
+
+	/* put CPU0 into C1 if C7 is requested and secondaries are online */
+	if (!IS_ENABLED(CONFIG_PM_SLEEP) || num_online_cpus() > 1)
+		index = TEGRA_C1;
+	else
+		index = TEGRA_CC6;
+
+	return index;
+}
+
+static int tegra_cpuidle_enter(struct cpuidle_device *dev,
+			       struct cpuidle_driver *drv,
+			       int index)
+{
+	unsigned int cpu = cpu_logical_map(dev->cpu);
+	int ret;
+
+	index = tegra_cpuidle_adjust_state_index(index, cpu);
+	if (dev->states_usage[index].disable)
+		return -1;
+
+	if (index == TEGRA_C1)
+		ret = arm_cpuidle_simple_enter(dev, drv, index);
+	else
+		ret = tegra_cpuidle_state_enter(dev, index, cpu);
+
+	if (ret < 0) {
+		if (ret != -EINTR || index != TEGRA_CC6)
+			pr_err_once("failed to enter state %d err: %d\n",
+				    index, ret);
+		index = -1;
+	} else {
+		index = ret;
+	}
+
+	return index;
+}
+
+static int tegra114_enter_s2idle(struct cpuidle_device *dev,
+				 struct cpuidle_driver *drv,
+				 int index)
+{
+	tegra_cpuidle_enter(dev, drv, index);
+
+	return 0;
+}
+
+/*
+ * The previous versions of Tegra CPUIDLE driver used a different "legacy"
+ * terminology for naming of the idling states, while this driver uses the
+ * new terminology.
+ *
+ * Mapping of the old terms into the new ones:
+ *
+ * Old | New
+ * ---------
+ * LP3 | C1	(CPU core clock gating)
+ * LP2 | C7	(CPU core power gating)
+ * LP2 | CC6	(CPU cluster power gating)
+ *
+ * Note that the older CPUIDLE driver versions didn't explicitly
+ * differentiate the LP2 states because these states either used the same
+ * code path or because CC6 wasn't supported.
+ */
+static struct cpuidle_driver tegra_idle_driver = {
+	.name = "tegra_idle",
+	.states = {
+		[TEGRA_C1] = ARM_CPUIDLE_WFI_STATE_PWR(600),
+		[TEGRA_C7] = {
+			.enter			= tegra_cpuidle_enter,
+			.exit_latency		= 2000,
+			.target_residency	= 2200,
+			.power_usage		= 100,
+			.flags			= CPUIDLE_FLAG_TIMER_STOP,
+			.name			= "C7",
+			.desc			= "CPU core powered off",
+		},
+		[TEGRA_CC6] = {
+			.enter			= tegra_cpuidle_enter,
+			.exit_latency		= 5000,
+			.target_residency	= 10000,
+			.power_usage		= 0,
+			.flags			= CPUIDLE_FLAG_TIMER_STOP |
+						  CPUIDLE_FLAG_COUPLED,
+			.name			= "CC6",
+			.desc			= "CPU cluster powered off",
+		},
+	},
+	.state_count = TEGRA_STATE_COUNT,
+	.safe_state_index = TEGRA_C1,
+};
+
+static inline void tegra_cpuidle_disable_state(enum tegra_state state)
+{
+	cpuidle_driver_state_disabled(&tegra_idle_driver, state, true);
+}
+
+/*
+ * Tegra20 HW appears to have a bug such that PCIe device interrupts, whether
+ * they are legacy IRQs or MSI, are lost when CC6 is enabled.  To work around
+ * this, simply disable CC6 if the PCI driver and DT node are both enabled.
+ */
+void tegra_cpuidle_pcie_irqs_in_use(void)
+{
+	struct cpuidle_state *state_cc6 = &tegra_idle_driver.states[TEGRA_CC6];
+
+	if ((state_cc6->flags & CPUIDLE_FLAG_UNUSABLE) ||
+	    tegra_get_chip_id() != TEGRA20)
+		return;
+
+	pr_info("disabling CC6 state, since PCIe IRQs are in use\n");
+	tegra_cpuidle_disable_state(TEGRA_CC6);
+}
+
+static void tegra_cpuidle_setup_tegra114_c7_state(void)
+{
+	struct cpuidle_state *s = &tegra_idle_driver.states[TEGRA_C7];
+
+	s->enter_s2idle = tegra114_enter_s2idle;
+	s->target_residency = 1000;
+	s->exit_latency = 500;
+}
+
+static int tegra_cpuidle_probe(struct platform_device *pdev)
+{
+	if (tegra_pmc_get_suspend_mode() == TEGRA_SUSPEND_NOT_READY)
+		return -EPROBE_DEFER;
+
+	/* LP2 could be disabled in device-tree */
+	if (tegra_pmc_get_suspend_mode() < TEGRA_SUSPEND_LP2)
+		tegra_cpuidle_disable_state(TEGRA_CC6);
+
+	/*
+	 * Required suspend-resume functionality, which is provided by the
+	 * Tegra-arch core and PMC driver, is unavailable if PM-sleep option
+	 * is disabled.
+	 */
+	if (!IS_ENABLED(CONFIG_PM_SLEEP)) {
+		tegra_cpuidle_disable_state(TEGRA_C7);
+		tegra_cpuidle_disable_state(TEGRA_CC6);
+	}
+
+	/*
+	 * Generic WFI state (also known as C1 or LP3) and the coupled CPU
+	 * cluster power-off (CC6 or LP2) states are common for all Tegra SoCs.
+	 */
+	switch (tegra_get_chip_id()) {
+	case TEGRA20:
+		/* Tegra20 isn't capable of powering off individual CPU cores */
+		tegra_cpuidle_disable_state(TEGRA_C7);
+		break;
+
+	case TEGRA30:
+		break;
+
+	case TEGRA114:
+	case TEGRA124:
+		tegra_cpuidle_setup_tegra114_c7_state();
+
+		/* coupled CC6 (LP2) state isn't implemented yet */
+		tegra_cpuidle_disable_state(TEGRA_CC6);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return cpuidle_register(&tegra_idle_driver, cpu_possible_mask);
+}
+
+static struct platform_driver tegra_cpuidle_driver = {
+	.probe = tegra_cpuidle_probe,
+	.driver = {
+		.name = "tegra-cpuidle",
+	},
+};
+builtin_platform_driver(tegra_cpuidle_driver);
diff --git a/drivers/cpuidle/cpuidle-ux500.c b/drivers/cpuidle/cpuidle-ux500.c
index a2d34be17a09..f7d778580e9b 100644
--- a/drivers/cpuidle/cpuidle-ux500.c
+++ b/drivers/cpuidle/cpuidle-ux500.c
@@ -117,7 +117,7 @@ static int dbx500_cpuidle_probe(struct platform_device *pdev)
 
 static struct platform_driver dbx500_cpuidle_plat_driver = {
 	.driver = {
-		.name = "cpuidle-dbx500",
+		.name = "db8500-cpuidle",
 	},
 	.probe = dbx500_cpuidle_probe,
 };
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index de81298051b3..6eceb1988243 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -8,6 +8,7 @@
  * This code is licenced under the GPL.
  */
 
+#include "linux/percpu-defs.h"
 #include <linux/clockchips.h>
 #include <linux/kernel.h>
 #include <linux/mutex.h>
@@ -22,6 +23,8 @@
 #include <linux/module.h>
 #include <linux/suspend.h>
 #include <linux/tick.h>
+#include <linux/mmu_context.h>
+#include <linux/context_tracking.h>
 #include <trace/events/power.h>
 
 #include "cpuidle.h"
@@ -137,29 +140,25 @@ static void enter_s2idle_proper(struct cpuidle_driver *drv,
 				struct cpuidle_device *dev, int index)
 {
 	ktime_t time_start, time_end;
+	struct cpuidle_state *target_state = &drv->states[index];
 
 	time_start = ns_to_ktime(local_clock());
 
-	/*
-	 * trace_suspend_resume() called by tick_freeze() for the last CPU
-	 * executing it contains RCU usage regarded as invalid in the idle
-	 * context, so tell RCU about that.
-	 */
-	RCU_NONIDLE(tick_freeze());
+	tick_freeze();
 	/*
 	 * The state used here cannot be a "coupled" one, because the "coupled"
 	 * cpuidle mechanism enables interrupts and doing that with timekeeping
 	 * suspended is generally unsafe.
 	 */
 	stop_critical_timings();
-	drv->states[index].enter_s2idle(dev, drv, index);
-	WARN_ON(!irqs_disabled());
-	/*
-	 * timekeeping_resume() that will be called by tick_unfreeze() for the
-	 * first CPU executing it calls functions containing RCU read-side
-	 * critical sections, so tell RCU about that.
-	 */
-	RCU_NONIDLE(tick_unfreeze());
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
+		ct_idle_enter();
+	target_state->enter_s2idle(dev, drv, index);
+	if (WARN_ON_ONCE(!irqs_disabled()))
+		local_irq_disable();
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
+		ct_idle_exit();
+	tick_unfreeze();
 	start_critical_timings();
 
 	time_end = ns_to_ktime(local_clock());
@@ -186,9 +185,10 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	 * be frozen safely.
 	 */
 	index = find_deepest_state(drv, dev, U64_MAX, 0, true);
-	if (index > 0)
+	if (index > 0) {
 		enter_s2idle_proper(drv, dev, index);
-
+		local_irq_enable();
+	}
 	return index;
 }
 #endif /* CONFIG_SUSPEND */
@@ -224,19 +224,26 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 		broadcast = false;
 	}
 
+	if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
+		leave_mm(dev->cpu);
+
 	/* Take note of the planned idle state. */
 	sched_idle_set_state(target_state);
 
-	trace_cpu_idle_rcuidle(index, dev->cpu);
+	trace_cpu_idle(index, dev->cpu);
 	time_start = ns_to_ktime(local_clock());
 
 	stop_critical_timings();
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
+		ct_idle_enter();
 	entered_state = target_state->enter(dev, drv, index);
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
+		ct_idle_exit();
 	start_critical_timings();
 
 	sched_clock_idle_wakeup_event();
 	time_end = ns_to_ktime(local_clock());
-	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
+	trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu);
 
 	/* The cpu is no longer idle or about to enter idle. */
 	sched_idle_set_state(NULL);
@@ -273,6 +280,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 
 				/* Shallower states are enabled, so update. */
 				dev->states_usage[entered_state].above++;
+				trace_cpu_idle_miss(dev->cpu, entered_state, false);
 				break;
 			}
 		} else if (diff > delay) {
@@ -284,14 +292,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 				 * Update if a deeper state would have been a
 				 * better match for the observed idle duration.
 				 */
-				if (diff - delay >= drv->states[i].target_residency_ns)
+				if (diff - delay >= drv->states[i].target_residency_ns) {
 					dev->states_usage[entered_state].below++;
+					trace_cpu_idle_miss(dev->cpu, entered_state, true);
+				}
 
 				break;
 			}
 		}
 	} else {
 		dev->last_residency_ns = 0;
+		dev->states_usage[index].rejected++;
 	}
 
 	return entered_state;
@@ -362,6 +373,19 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index)
 		cpuidle_curr_governor->reflect(dev, index);
 }
 
+/*
+ * Min polling interval of 10usec is a guess. It is assuming that
+ * for most users, the time for a single ping-pong workload like
+ * perf bench pipe would generally complete within 10usec but
+ * this is hardware dependent. Actual time can be estimated with
+ *
+ * perf bench sched pipe -l 10000
+ *
+ * Run multiple times to avoid cpufreq effects.
+ */
+#define CPUIDLE_POLL_MIN 10000
+#define CPUIDLE_POLL_MAX (TICK_NSEC / 16)
+
 /**
  * cpuidle_poll_time - return amount of time to poll for,
  * governors can override dev->poll_limit_ns if necessary
@@ -376,15 +400,23 @@ u64 cpuidle_poll_time(struct cpuidle_driver *drv,
 	int i;
 	u64 limit_ns;
 
+	BUILD_BUG_ON(CPUIDLE_POLL_MIN > CPUIDLE_POLL_MAX);
+
 	if (dev->poll_limit_ns)
 		return dev->poll_limit_ns;
 
-	limit_ns = TICK_NSEC;
+	limit_ns = CPUIDLE_POLL_MAX;
 	for (i = 1; i < drv->state_count; i++) {
+		u64 state_limit;
+
 		if (dev->states_usage[i].disable)
 			continue;
 
-		limit_ns = drv->states[i].target_residency_ns;
+		state_limit = drv->states[i].target_residency_ns;
+		if (state_limit < CPUIDLE_POLL_MIN)
+			continue;
+
+		limit_ns = min_t(u64, state_limit, CPUIDLE_POLL_MAX);
 		break;
 	}
 
@@ -736,53 +768,15 @@ int cpuidle_register(struct cpuidle_driver *drv,
 }
 EXPORT_SYMBOL_GPL(cpuidle_register);
 
-#ifdef CONFIG_SMP
-
-/*
- * This function gets called when a part of the kernel has a new latency
- * requirement.  This means we need to get all processors out of their C-state,
- * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that
- * wakes them all right up.
- */
-static int cpuidle_latency_notify(struct notifier_block *b,
-		unsigned long l, void *v)
-{
-	wake_up_all_idle_cpus();
-	return NOTIFY_OK;
-}
-
-static struct notifier_block cpuidle_latency_notifier = {
-	.notifier_call = cpuidle_latency_notify,
-};
-
-static inline void latency_notifier_init(struct notifier_block *n)
-{
-	pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, n);
-}
-
-#else /* CONFIG_SMP */
-
-#define latency_notifier_init(x) do { } while (0)
-
-#endif /* CONFIG_SMP */
-
 /**
  * cpuidle_init - core initializer
  */
 static int __init cpuidle_init(void)
 {
-	int ret;
-
 	if (cpuidle_disabled())
 		return -ENODEV;
 
-	ret = cpuidle_add_interface(cpu_subsys.dev_root);
-	if (ret)
-		return ret;
-
-	latency_notifier_init(&cpuidle_latency_notifier);
-
-	return 0;
+	return cpuidle_add_interface(cpu_subsys.dev_root);
 }
 
 module_param(off, int, 0444);
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index 4070e573bf43..f70aa17e2a8e 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -181,9 +181,13 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv)
 		 */
 		if (s->target_residency > 0)
 			s->target_residency_ns = s->target_residency * NSEC_PER_USEC;
+		else if (s->target_residency_ns < 0)
+			s->target_residency_ns = 0;
 
 		if (s->exit_latency > 0)
 			s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC;
+		else if (s->exit_latency_ns < 0)
+			s->exit_latency_ns =  0;
 	}
 }
 
diff --git a/drivers/cpuidle/dt_idle_genpd.c b/drivers/cpuidle/dt_idle_genpd.c
new file mode 100644
index 000000000000..b37165514d4e
--- /dev/null
+++ b/drivers/cpuidle/dt_idle_genpd.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PM domains for CPUs via genpd.
+ *
+ * Copyright (C) 2019 Linaro Ltd.
+ * Author: Ulf Hansson <ulf.hansson@linaro.org>
+ *
+ * Copyright (c) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "dt-idle-genpd: " fmt
+
+#include <linux/cpu.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "dt_idle_genpd.h"
+
+static int pd_parse_state_nodes(
+			int (*parse_state)(struct device_node *, u32 *),
+			struct genpd_power_state *states, int state_count)
+{
+	int i, ret;
+	u32 state, *state_buf;
+
+	for (i = 0; i < state_count; i++) {
+		ret = parse_state(to_of_node(states[i].fwnode), &state);
+		if (ret)
+			goto free_state;
+
+		state_buf = kmalloc(sizeof(u32), GFP_KERNEL);
+		if (!state_buf) {
+			ret = -ENOMEM;
+			goto free_state;
+		}
+		*state_buf = state;
+		states[i].data = state_buf;
+	}
+
+	return 0;
+
+free_state:
+	i--;
+	for (; i >= 0; i--)
+		kfree(states[i].data);
+	return ret;
+}
+
+static int pd_parse_states(struct device_node *np,
+			   int (*parse_state)(struct device_node *, u32 *),
+			   struct genpd_power_state **states,
+			   int *state_count)
+{
+	int ret;
+
+	/* Parse the domain idle states. */
+	ret = of_genpd_parse_idle_states(np, states, state_count);
+	if (ret)
+		return ret;
+
+	/* Fill out the dt specifics for each found state. */
+	ret = pd_parse_state_nodes(parse_state, *states, *state_count);
+	if (ret)
+		kfree(*states);
+
+	return ret;
+}
+
+static void pd_free_states(struct genpd_power_state *states,
+			    unsigned int state_count)
+{
+	int i;
+
+	for (i = 0; i < state_count; i++)
+		kfree(states[i].data);
+	kfree(states);
+}
+
+void dt_idle_pd_free(struct generic_pm_domain *pd)
+{
+	pd_free_states(pd->states, pd->state_count);
+	kfree(pd->name);
+	kfree(pd);
+}
+
+struct generic_pm_domain *dt_idle_pd_alloc(struct device_node *np,
+			int (*parse_state)(struct device_node *, u32 *))
+{
+	struct generic_pm_domain *pd;
+	struct genpd_power_state *states = NULL;
+	int ret, state_count = 0;
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		goto out;
+
+	/* Use the node base name directly so pd->name stays kfree()-able. */
+	pd->name = kasprintf(GFP_KERNEL, "%pOFP", np);
+	if (!pd->name)
+		goto free_pd;
+
+	/*
+	 * Parse the domain idle states and let genpd manage the state selection
+	 * for those being compatible with "domain-idle-state".
+	 */
+	ret = pd_parse_states(np, parse_state, &states, &state_count);
+	if (ret)
+		goto free_name;
+
+	pd->free_states = pd_free_states;
+	pd->states = states;
+	pd->state_count = state_count;
+
+	pr_debug("alloc PM domain %s\n", pd->name);
+	return pd;
+
+free_name:
+	kfree(pd->name);
+free_pd:
+	kfree(pd);
+out:
+	pr_err("failed to alloc PM domain %pOF\n", np);
+	return NULL;
+}
+
+int dt_idle_pd_init_topology(struct device_node *np)
+{
+	struct device_node *node;
+	struct of_phandle_args child, parent;
+	int ret;
+
+	for_each_child_of_node(np, node) {
+		if (of_parse_phandle_with_args(node, "power-domains",
+					"#power-domain-cells", 0, &parent))
+			continue;
+
+		child.np = node;
+		child.args_count = 0;
+		ret = of_genpd_add_subdomain(&parent, &child);
+		of_node_put(parent.np);
+		if (ret) {
+			of_node_put(node);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+struct device *dt_idle_attach_cpu(int cpu, const char *name)
+{
+	struct device *dev;
+
+	dev = dev_pm_domain_attach_by_name(get_cpu_device(cpu), name);
+	if (IS_ERR_OR_NULL(dev))
+		return dev;
+
+	pm_runtime_irq_safe(dev);
+	if (cpu_online(cpu))
+		pm_runtime_get_sync(dev);
+
+	dev_pm_syscore_device(dev, true);
+
+	return dev;
+}
+
+void dt_idle_detach_cpu(struct device *dev)
+{
+	if (IS_ERR_OR_NULL(dev))
+		return;
+
+	dev_pm_domain_detach(dev, false);
+}
diff --git a/drivers/cpuidle/dt_idle_genpd.h b/drivers/cpuidle/dt_idle_genpd.h
new file mode 100644
index 000000000000..a95483d08a02
--- /dev/null
+++ b/drivers/cpuidle/dt_idle_genpd.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __DT_IDLE_GENPD
+#define __DT_IDLE_GENPD
+
+struct device_node;
+struct generic_pm_domain;
+
+#ifdef CONFIG_DT_IDLE_GENPD
+
+void dt_idle_pd_free(struct generic_pm_domain *pd);
+
+struct generic_pm_domain *dt_idle_pd_alloc(struct device_node *np,
+			int (*parse_state)(struct device_node *, u32 *));
+
+int dt_idle_pd_init_topology(struct device_node *np);
+
+struct device *dt_idle_attach_cpu(int cpu, const char *name);
+
+void dt_idle_detach_cpu(struct device *dev);
+
+#else
+
+static inline void dt_idle_pd_free(struct generic_pm_domain *pd)
+{
+}
+
+static inline struct generic_pm_domain *dt_idle_pd_alloc(
+			struct device_node *np,
+			int (*parse_state)(struct device_node *, u32 *))
+{
+	return NULL;
+}
+
+static inline int dt_idle_pd_init_topology(struct device_node *np)
+{
+	return 0;
+}
+
+static inline struct device *dt_idle_attach_cpu(int cpu, const char *name)
+{
+	return NULL;
+}
+
+static inline void dt_idle_detach_cpu(struct device *dev)
+{
+}
+
+#endif
+
+#endif
diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index e48271e117a3..0d0f9751ff8f 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -63,12 +63,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov)
 
 	cpuidle_curr_governor = gov;
 
-	if (gov) {
-		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
-			cpuidle_enable_device(dev);
-		cpuidle_install_idle_handler();
-		printk(KERN_INFO "cpuidle: using governor %s\n", gov->name);
-	}
+	list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
+		cpuidle_enable_device(dev);
+
+	cpuidle_install_idle_handler();
+	pr_info("cpuidle: using governor %s\n", gov->name);
 
 	return 0;
 }
@@ -109,9 +108,9 @@ int cpuidle_register_governor(struct cpuidle_governor *gov)
  */
 s64 cpuidle_governor_latency_req(unsigned int cpu)
 {
-	int global_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
 	struct device *device = get_cpu_device(cpu);
 	int device_req = dev_pm_qos_raw_resume_latency(device);
+	int global_req = cpu_latency_qos_limit();
 
 	if (device_req > global_req)
 		device_req = global_req;
diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c
index cb2a96eafc02..1dff3a52917d 100644
--- a/drivers/cpuidle/governors/haltpoll.c
+++ b/drivers/cpuidle/governors/haltpoll.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/kvm_para.h>
+#include <trace/events/power.h>
 
 static unsigned int guest_halt_poll_ns __read_mostly = 200000;
 module_param(guest_halt_poll_ns, uint, 0644);
@@ -90,6 +91,7 @@ static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns)
 		if (val > guest_halt_poll_ns)
 			val = guest_halt_poll_ns;
 
+		trace_guest_halt_poll_ns_grow(val, dev->poll_limit_ns);
 		dev->poll_limit_ns = val;
 	} else if (block_ns > guest_halt_poll_ns &&
 		   guest_halt_poll_allow_shrink) {
@@ -100,6 +102,7 @@ static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns)
 			val = 0;
 		else
 			val /= shrink;
+		trace_guest_halt_poll_ns_shrink(val, dev->poll_limit_ns);
 		dev->poll_limit_ns = val;
 	}
 }
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b0a7ad566081..c4922684f305 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -34,7 +34,7 @@
  * 1) Energy break even point
  * 2) Performance impact
  * 3) Latency tolerance (from pmqos infrastructure)
- * These these three factors are treated independently.
+ * These three factors are treated independently.
  *
  * Energy break even point
  * -----------------------
@@ -117,7 +117,7 @@ struct menu_device {
 	int		interval_ptr;
 };
 
-static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters)
+static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
 {
 	int bucket = 0;
 
@@ -150,7 +150,7 @@ static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters)
  * to be, the higher this multiplier, and thus the higher
  * the barrier to go to an expensive C state.
  */
-static inline int performance_multiplier(unsigned long nr_iowaiters)
+static inline int performance_multiplier(unsigned int nr_iowaiters)
 {
 	/* for IO wait tasks (per cpu!) we add 10x each */
 	return 1 + 10 * nr_iowaiters;
@@ -270,8 +270,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	unsigned int predicted_us;
 	u64 predicted_ns;
 	u64 interactivity_req;
-	unsigned long nr_iowaiters;
-	ktime_t delta_next;
+	unsigned int nr_iowaiters;
+	ktime_t delta, delta_tick;
 	int i, idx;
 
 	if (data->needs_update) {
@@ -280,7 +280,12 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	}
 
 	/* determine the expected residency time, round up */
-	data->next_timer_ns = tick_nohz_get_sleep_length(&delta_next);
+	delta = tick_nohz_get_sleep_length(&delta_tick);
+	if (unlikely(delta < 0)) {
+		delta = 0;
+		delta_tick = 0;
+	}
+	data->next_timer_ns = delta;
 
 	nr_iowaiters = nr_iowait_cpu(dev->cpu);
 	data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
@@ -318,7 +323,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 * state selection.
 		 */
 		if (predicted_ns < TICK_NSEC)
-			predicted_ns = delta_next;
+			predicted_ns = data->next_timer_ns;
 	} else {
 		/*
 		 * Use the performance multiplier and the user-configurable
@@ -377,7 +382,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 			 * stuck in the shallow one for too long.
 			 */
 			if (drv->states[idx].target_residency_ns < TICK_NSEC &&
-			    s->target_residency_ns <= delta_next)
+			    s->target_residency_ns <= delta_tick)
 				idx = i;
 
 			return idx;
@@ -399,7 +404,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	     predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
 		*stop_tick = false;
 
-		if (idx > 0 && drv->states[idx].target_residency_ns > delta_next) {
+		if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) {
 			/*
 			 * The tick is not going to be stopped and the target
 			 * residency of the state to be returned is not within
@@ -411,7 +416,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 					continue;
 
 				idx = i;
-				if (drv->states[i].target_residency_ns <= delta_next)
+				if (drv->states[i].target_residency_ns <= delta_tick)
 					break;
 			}
 		}
diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index 6deaaf5f05b5..d9262db79cae 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -2,47 +2,103 @@
 /*
  * Timer events oriented CPU idle governor
  *
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018 - 2021 Intel Corporation
  * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ */
+
+/**
+ * DOC: teo-description
  *
  * The idea of this governor is based on the observation that on many systems
  * timer events are two or more orders of magnitude more frequent than any
- * other interrupts, so they are likely to be the most significant source of CPU
+ * other interrupts, so they are likely to be the most significant cause of CPU
  * wakeups from idle states.  Moreover, information about what happened in the
  * (relatively recent) past can be used to estimate whether or not the deepest
- * idle state with target residency within the time to the closest timer is
- * likely to be suitable for the upcoming idle time of the CPU and, if not, then
- * which of the shallower idle states to choose.
+ * idle state with target residency within the (known) time till the closest
+ * timer event, referred to as the sleep length, is likely to be suitable for
+ * the upcoming CPU idle period and, if not, then which of the shallower idle
+ * states to choose instead of it.
+ *
+ * Of course, non-timer wakeup sources are more important in some use cases
+ * which can be covered by taking a few most recent idle time intervals of the
+ * CPU into account.  However, even in that context it is not necessary to
+ * consider idle duration values greater than the sleep length, because the
+ * closest timer will ultimately wake up the CPU anyway unless it is woken up
+ * earlier.
+ *
+ * Thus this governor estimates whether or not the prospective idle duration of
+ * a CPU is likely to be significantly shorter than the sleep length and selects
+ * an idle state for it accordingly.
+ *
+ * The computations carried out by this governor are based on using bins whose
+ * boundaries are aligned with the target residency parameter values of the CPU
+ * idle states provided by the %CPUIdle driver in the ascending order.  That is,
+ * the first bin spans from 0 up to, but not including, the target residency of
+ * the second idle state (idle state 1), the second bin spans from the target
+ * residency of idle state 1 up to, but not including, the target residency of
+ * idle state 2, the third bin spans from the target residency of idle state 2
+ * up to, but not including, the target residency of idle state 3 and so on.
+ * The last bin spans from the target residency of the deepest idle state
+ * supplied by the driver to infinity.
+ *
+ * Two metrics called "hits" and "intercepts" are associated with each bin.
+ * They are updated every time before selecting an idle state for the given CPU
+ * in accordance with what happened last time.
+ *
+ * The "hits" metric reflects the relative frequency of situations in which the
+ * sleep length and the idle duration measured after CPU wakeup fall into the
+ * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
+ * length).  In turn, the "intercepts" metric reflects the relative frequency of
+ * situations in which the measured idle duration is so much shorter than the
+ * sleep length that the bin it falls into corresponds to an idle state
+ * shallower than the one whose bin is fallen into by the sleep length (these
+ * situations are referred to as "intercepts" below).
+ *
+ * In addition to the metrics described above, the governor counts recent
+ * intercepts (that is, intercepts that have occurred during the last
+ * %NR_RECENT invocations of it for the given CPU) for each bin.
  *
- * Of course, non-timer wakeup sources are more important in some use cases and
- * they can be covered by taking a few most recent idle time intervals of the
- * CPU into account.  However, even in that case it is not necessary to consider
- * idle duration values greater than the time till the closest timer, as the
- * patterns that they may belong to produce average values close enough to
- * the time till the closest timer (sleep length) anyway.
+ * In order to select an idle state for a CPU, the governor takes the following
+ * steps (modulo the possible latency constraint that must be taken into account
+ * too):
  *
- * Thus this governor estimates whether or not the upcoming idle time of the CPU
- * is likely to be significantly shorter than the sleep length and selects an
- * idle state for it in accordance with that, as follows:
+ * 1. Find the deepest CPU idle state whose target residency does not exceed
+ *    the current sleep length (the candidate idle state) and compute 3 sums as
+ *    follows:
  *
- * - Find an idle state on the basis of the sleep length and state statistics
- *   collected over time:
+ *    - The sum of the "hits" and "intercepts" metrics for the candidate state
+ *      and all of the deeper idle states (it represents the cases in which the
+ *      CPU was idle long enough to avoid being intercepted if the sleep length
+ *      had been equal to the current one).
  *
- *   o Find the deepest idle state whose target residency is less than or equal
- *     to the sleep length.
+ *    - The sum of the "intercepts" metrics for all of the idle states shallower
+ *      than the candidate one (it represents the cases in which the CPU was not
+ *      idle long enough to avoid being intercepted if the sleep length had been
+ *      equal to the current one).
  *
- *   o Select it if it matched both the sleep length and the observed idle
- *     duration in the past more often than it matched the sleep length alone
- *     (i.e. the observed idle duration was significantly shorter than the sleep
- *     length matched by it).
+ *    - The sum of the numbers of recent intercepts for all of the idle states
+ *      shallower than the candidate one.
  *
- *   o Otherwise, select the shallower state with the greatest matched "early"
- *     wakeups metric.
+ * 2. If the second sum is greater than the first one or the third sum is
+ *    greater than %NR_RECENT / 2, the CPU is likely to wake up early, so look
+ *    for an alternative idle state to select.
  *
- * - If the majority of the most recent idle duration values are below the
- *   target residency of the idle state selected so far, use those values to
- *   compute the new expected idle duration and find an idle state matching it
- *   (which has to be shallower than the one selected so far).
+ *    - Traverse the idle states shallower than the candidate one in the
+ *      descending order.
+ *
+ *    - For each of them compute the sum of the "intercepts" metrics and the sum
+ *      of the numbers of recent intercepts over all of the idle states between
+ *      it and the candidate one (including the former and excluding the
+ *      latter).
+ *
+ *    - If each of these sums that needs to be taken into account (because the
+ *      check related to it has indicated that the CPU is likely to wake up
+ *      early) is greater than a half of the corresponding sum computed in step
+ *      1 (which means that the target residency of the state in question had
+ *      not exceeded the idle duration in over a half of the relevant cases),
+ *      select the given idle state instead of the candidate one.
+ *
+ * 3. By default, select the candidate state.
  */
 
 #include <linux/cpuidle.h>
@@ -60,64 +116,51 @@
 
 /*
  * Number of the most recent idle duration values to take into consideration for
- * the detection of wakeup patterns.
+ * the detection of recent early wakeup patterns.
  */
-#define INTERVALS	8
+#define NR_RECENT	9
 
 /**
- * struct teo_idle_state - Idle state data used by the TEO cpuidle governor.
- * @early_hits: "Early" CPU wakeups "matching" this state.
- * @hits: "On time" CPU wakeups "matching" this state.
- * @misses: CPU wakeups "missing" this state.
- *
- * A CPU wakeup is "matched" by a given idle state if the idle duration measured
- * after the wakeup is between the target residency of that state and the target
- * residency of the next one (or if this is the deepest available idle state, it
- * "matches" a CPU wakeup when the measured idle duration is at least equal to
- * its target residency).
- *
- * Also, from the TEO governor perspective, a CPU wakeup from idle is "early" if
- * it occurs significantly earlier than the closest expected timer event (that
- * is, early enough to match an idle state shallower than the one matching the
- * time till the closest timer event).  Otherwise, the wakeup is "on time", or
- * it is a "hit".
- *
- * A "miss" occurs when the given state doesn't match the wakeup, but it matches
- * the time till the closest timer event used for idle state selection.
+ * struct teo_bin - Metrics used by the TEO cpuidle governor.
+ * @intercepts: The "intercepts" metric.
+ * @hits: The "hits" metric.
+ * @recent: The number of recent "intercepts".
  */
-struct teo_idle_state {
-	unsigned int early_hits;
+struct teo_bin {
+	unsigned int intercepts;
 	unsigned int hits;
-	unsigned int misses;
+	unsigned int recent;
 };
 
 /**
  * struct teo_cpu - CPU data used by the TEO cpuidle governor.
  * @time_span_ns: Time between idle state selection and post-wakeup update.
  * @sleep_length_ns: Time till the closest timer event (at the selection time).
- * @states: Idle states data corresponding to this CPU.
- * @interval_idx: Index of the most recent saved idle interval.
- * @intervals: Saved idle duration values.
+ * @state_bins: Idle state data bins for this CPU.
+ * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
+ * @next_recent_idx: Index of the next @recent_idx entry to update.
+ * @recent_idx: Indices of bins corresponding to recent "intercepts".
  */
 struct teo_cpu {
-	u64 time_span_ns;
-	u64 sleep_length_ns;
-	struct teo_idle_state states[CPUIDLE_STATE_MAX];
-	int interval_idx;
-	u64 intervals[INTERVALS];
+	s64 time_span_ns;
+	s64 sleep_length_ns;
+	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
+	unsigned int total;
+	int next_recent_idx;
+	int recent_idx[NR_RECENT];
 };
 
 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
 
 /**
- * teo_update - Update CPU data after wakeup.
+ * teo_update - Update CPU metrics after wakeup.
  * @drv: cpuidle driver containing state data.
  * @dev: Target CPU.
  */
 static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 {
 	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
-	int i, idx_hit = -1, idx_timer = -1;
+	int i, idx_timer = 0, idx_duration = 0;
 	u64 measured_ns;
 
 	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
@@ -150,56 +193,52 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 			measured_ns /= 2;
 	}
 
+	cpu_data->total = 0;
+
 	/*
-	 * Decay the "early hits" metric for all of the states and find the
-	 * states matching the sleep length and the measured idle duration.
+	 * Decay the "hits" and "intercepts" metrics for all of the bins and
+	 * find the bins that the sleep length and the measured idle duration
+	 * fall into.
 	 */
 	for (i = 0; i < drv->state_count; i++) {
-		unsigned int early_hits = cpu_data->states[i].early_hits;
+		s64 target_residency_ns = drv->states[i].target_residency_ns;
+		struct teo_bin *bin = &cpu_data->state_bins[i];
+
+		bin->hits -= bin->hits >> DECAY_SHIFT;
+		bin->intercepts -= bin->intercepts >> DECAY_SHIFT;
 
-		cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT;
+		cpu_data->total += bin->hits + bin->intercepts;
 
-		if (drv->states[i].target_residency_ns <= cpu_data->sleep_length_ns) {
+		if (target_residency_ns <= cpu_data->sleep_length_ns) {
 			idx_timer = i;
-			if (drv->states[i].target_residency_ns <= measured_ns)
-				idx_hit = i;
+			if (target_residency_ns <= measured_ns)
+				idx_duration = i;
 		}
 	}
 
-	/*
-	 * Update the "hits" and "misses" data for the state matching the sleep
-	 * length.  If it matches the measured idle duration too, this is a hit,
-	 * so increase the "hits" metric for it then.  Otherwise, this is a
-	 * miss, so increase the "misses" metric for it.  In the latter case
-	 * also increase the "early hits" metric for the state that actually
-	 * matches the measured idle duration.
-	 */
-	if (idx_timer >= 0) {
-		unsigned int hits = cpu_data->states[idx_timer].hits;
-		unsigned int misses = cpu_data->states[idx_timer].misses;
-
-		hits -= hits >> DECAY_SHIFT;
-		misses -= misses >> DECAY_SHIFT;
-
-		if (idx_timer > idx_hit) {
-			misses += PULSE;
-			if (idx_hit >= 0)
-				cpu_data->states[idx_hit].early_hits += PULSE;
-		} else {
-			hits += PULSE;
-		}
+	i = cpu_data->next_recent_idx++;
+	if (cpu_data->next_recent_idx >= NR_RECENT)
+		cpu_data->next_recent_idx = 0;
 
-		cpu_data->states[idx_timer].misses = misses;
-		cpu_data->states[idx_timer].hits = hits;
-	}
+	if (cpu_data->recent_idx[i] >= 0)
+		cpu_data->state_bins[cpu_data->recent_idx[i]].recent--;
 
 	/*
-	 * Save idle duration values corresponding to non-timer wakeups for
-	 * pattern detection.
+	 * If the measured idle duration falls into the same bin as the sleep
+	 * length, this is a "hit", so update the "hits" metric for that bin.
+	 * Otherwise, update the "intercepts" metric for the bin fallen into by
+	 * the measured idle duration.
 	 */
-	cpu_data->intervals[cpu_data->interval_idx++] = measured_ns;
-	if (cpu_data->interval_idx >= INTERVALS)
-		cpu_data->interval_idx = 0;
+	if (idx_timer == idx_duration) {
+		cpu_data->state_bins[idx_timer].hits += PULSE;
+		cpu_data->recent_idx[i] = -1;
+	} else {
+		cpu_data->state_bins[idx_duration].intercepts += PULSE;
+		cpu_data->state_bins[idx_duration].recent++;
+		cpu_data->recent_idx[i] = idx_duration;
+	}
+
+	cpu_data->total += PULSE;
 }
 
 static bool teo_time_ok(u64 interval_ns)
@@ -207,6 +246,12 @@ static bool teo_time_ok(u64 interval_ns)
 	return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC;
 }
 
+static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv)
+{
+	return (drv->states[idx].target_residency_ns +
+		drv->states[idx+1].target_residency_ns) / 2;
+}
+
 /**
  * teo_find_shallower_state - Find shallower idle state matching given duration.
  * @drv: cpuidle driver containing state data.
@@ -216,7 +261,7 @@ static bool teo_time_ok(u64 interval_ns)
  */
 static int teo_find_shallower_state(struct cpuidle_driver *drv,
 				    struct cpuidle_device *dev, int state_idx,
-				    u64 duration_ns)
+				    s64 duration_ns)
 {
 	int i;
 
@@ -242,10 +287,18 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 {
 	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
 	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
-	u64 duration_ns;
-	unsigned int hits, misses, early_hits;
-	int max_early_idx, prev_max_early_idx, constraint_idx, idx, i;
+	unsigned int idx_intercept_sum = 0;
+	unsigned int intercept_sum = 0;
+	unsigned int idx_recent_sum = 0;
+	unsigned int recent_sum = 0;
+	unsigned int idx_hit_sum = 0;
+	unsigned int hit_sum = 0;
+	int constraint_idx = 0;
+	int idx0 = 0, idx = -1;
+	bool alt_intercepts, alt_recent;
 	ktime_t delta_tick;
+	s64 duration_ns;
+	int i;
 
 	if (dev->last_state_idx >= 0) {
 		teo_update(drv, dev);
@@ -257,164 +310,150 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
 	cpu_data->sleep_length_ns = duration_ns;
 
-	hits = 0;
-	misses = 0;
-	early_hits = 0;
-	max_early_idx = -1;
-	prev_max_early_idx = -1;
-	constraint_idx = drv->state_count;
-	idx = -1;
+	/* Check if there is any choice in the first place. */
+	if (drv->state_count < 2) {
+		idx = 0;
+		goto end;
+	}
+	if (!dev->states_usage[0].disable) {
+		idx = 0;
+		if (drv->states[1].target_residency_ns > duration_ns)
+			goto end;
+	}
 
-	for (i = 0; i < drv->state_count; i++) {
+	/*
+	 * Find the deepest idle state whose target residency does not exceed
+	 * the current sleep length and the deepest idle state not deeper than
+	 * the former whose exit latency does not exceed the current latency
+	 * constraint.  Compute the sums of metrics for early wakeup pattern
+	 * detection.
+	 */
+	for (i = 1; i < drv->state_count; i++) {
+		struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
 		struct cpuidle_state *s = &drv->states[i];
 
-		if (dev->states_usage[i].disable) {
-			/*
-			 * Ignore disabled states with target residencies beyond
-			 * the anticipated idle duration.
-			 */
-			if (s->target_residency_ns > duration_ns)
-				continue;
-
-			/*
-			 * This state is disabled, so the range of idle duration
-			 * values corresponding to it is covered by the current
-			 * candidate state, but still the "hits" and "misses"
-			 * metrics of the disabled state need to be used to
-			 * decide whether or not the state covering the range in
-			 * question is good enough.
-			 */
-			hits = cpu_data->states[i].hits;
-			misses = cpu_data->states[i].misses;
-
-			if (early_hits >= cpu_data->states[i].early_hits ||
-			    idx < 0)
-				continue;
-
-			/*
-			 * If the current candidate state has been the one with
-			 * the maximum "early hits" metric so far, the "early
-			 * hits" metric of the disabled state replaces the
-			 * current "early hits" count to avoid selecting a
-			 * deeper state with lower "early hits" metric.
-			 */
-			if (max_early_idx == idx) {
-				early_hits = cpu_data->states[i].early_hits;
-				continue;
-			}
-
-			/*
-			 * The current candidate state is closer to the disabled
-			 * one than the current maximum "early hits" state, so
-			 * replace the latter with it, but in case the maximum
-			 * "early hits" state index has not been set so far,
-			 * check if the current candidate state is not too
-			 * shallow for that role.
-			 */
-			if (teo_time_ok(drv->states[idx].target_residency_ns)) {
-				prev_max_early_idx = max_early_idx;
-				early_hits = cpu_data->states[i].early_hits;
-				max_early_idx = idx;
-			}
+		/*
+		 * Update the sums of idle state metrics for all of the states
+		 * shallower than the current one.
+		 */
+		intercept_sum += prev_bin->intercepts;
+		hit_sum += prev_bin->hits;
+		recent_sum += prev_bin->recent;
 
+		if (dev->states_usage[i].disable)
 			continue;
-		}
 
 		if (idx < 0) {
 			idx = i; /* first enabled state */
-			hits = cpu_data->states[i].hits;
-			misses = cpu_data->states[i].misses;
+			idx0 = i;
 		}
 
 		if (s->target_residency_ns > duration_ns)
 			break;
 
-		if (s->exit_latency_ns > latency_req && constraint_idx > i)
+		idx = i;
+
+		if (s->exit_latency_ns <= latency_req)
 			constraint_idx = i;
 
-		idx = i;
-		hits = cpu_data->states[i].hits;
-		misses = cpu_data->states[i].misses;
-
-		if (early_hits < cpu_data->states[i].early_hits &&
-		    teo_time_ok(drv->states[i].target_residency_ns)) {
-			prev_max_early_idx = max_early_idx;
-			early_hits = cpu_data->states[i].early_hits;
-			max_early_idx = i;
-		}
+		idx_intercept_sum = intercept_sum;
+		idx_hit_sum = hit_sum;
+		idx_recent_sum = recent_sum;
 	}
 
-	/*
-	 * If the "hits" metric of the idle state matching the sleep length is
-	 * greater than its "misses" metric, that is the one to use.  Otherwise,
-	 * it is more likely that one of the shallower states will match the
-	 * idle duration observed after wakeup, so take the one with the maximum
-	 * "early hits" metric, but if that cannot be determined, just use the
-	 * state selected so far.
-	 */
-	if (hits <= misses) {
-		/*
-		 * The current candidate state is not suitable, so take the one
-		 * whose "early hits" metric is the maximum for the range of
-		 * shallower states.
-		 */
-		if (idx == max_early_idx)
-			max_early_idx = prev_max_early_idx;
-
-		if (max_early_idx >= 0) {
-			idx = max_early_idx;
-			duration_ns = drv->states[idx].target_residency_ns;
-		}
+	/* Avoid unnecessary overhead. */
+	if (idx < 0) {
+		idx = 0; /* No states enabled, must use 0. */
+		goto end;
+	} else if (idx == idx0) {
+		goto end;
 	}
 
 	/*
-	 * If there is a latency constraint, it may be necessary to use a
-	 * shallower idle state than the one selected so far.
+	 * If the sum of the intercepts metric for all of the idle states
+	 * shallower than the current candidate one (idx) is greater than the
+	 * sum of the intercepts and hits metrics for the candidate state and
+	 * all of the deeper states, or the sum of the numbers of recent
+	 * intercepts over all of the states shallower than the candidate one
+	 * is greater than a half of the number of recent events taken into
+	 * account, the CPU is likely to wake up early, so find an alternative
+	 * idle state to select.
 	 */
-	if (constraint_idx < idx)
-		idx = constraint_idx;
-
-	if (idx < 0) {
-		idx = 0; /* No states enabled. Must use 0. */
-	} else if (idx > 0) {
-		unsigned int count = 0;
-		u64 sum = 0;
+	alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum;
+	alt_recent = idx_recent_sum > NR_RECENT / 2;
+	if (alt_recent || alt_intercepts) {
+		s64 first_suitable_span_ns = duration_ns;
+		int first_suitable_idx = idx;
 
 		/*
-		 * Count and sum the most recent idle duration values less than
-		 * the current expected idle duration value.
+		 * Look for the deepest idle state whose target residency had
+		 * not exceeded the idle duration in over a half of the relevant
+		 * cases (both with respect to intercepts overall and with
+		 * respect to the recent intercepts only) in the past.
+		 *
+		 * Take into account the possible latency constraint and the
+		 * duration limitation present if the tick is already stopped.
 		 */
-		for (i = 0; i < INTERVALS; i++) {
-			u64 val = cpu_data->intervals[i];
+		intercept_sum = 0;
+		recent_sum = 0;
+
+		for (i = idx - 1; i >= 0; i--) {
+			struct teo_bin *bin = &cpu_data->state_bins[i];
+			s64 span_ns;
+
+			intercept_sum += bin->intercepts;
+			recent_sum += bin->recent;
+
+			span_ns = teo_middle_of_bin(i, drv);
+
+			if ((!alt_recent || 2 * recent_sum > idx_recent_sum) &&
+			    (!alt_intercepts ||
+			     2 * intercept_sum > idx_intercept_sum)) {
+				if (teo_time_ok(span_ns) &&
+				    !dev->states_usage[i].disable) {
+					idx = i;
+					duration_ns = span_ns;
+				} else {
+					/*
+					 * The current state is too shallow or
+					 * disabled, so take the first enabled
+					 * deeper state with suitable time span.
+					 */
+					idx = first_suitable_idx;
+					duration_ns = first_suitable_span_ns;
+				}
+				break;
+			}
 
-			if (val >= duration_ns)
+			if (dev->states_usage[i].disable)
 				continue;
 
-			count++;
-			sum += val;
-		}
+			if (!teo_time_ok(span_ns)) {
+				/*
+				 * The current state is too shallow, but if an
+				 * alternative candidate state has been found,
+				 * it may still turn out to be a better choice.
+				 */
+				if (first_suitable_idx != idx)
+					continue;
 
-		/*
-		 * Give up unless the majority of the most recent idle duration
-		 * values are in the interesting range.
-		 */
-		if (count > INTERVALS / 2) {
-			u64 avg_ns = div64_u64(sum, count);
-
-			/*
-			 * Avoid spending too much time in an idle state that
-			 * would be too shallow.
-			 */
-			if (teo_time_ok(avg_ns)) {
-				duration_ns = avg_ns;
-				if (drv->states[idx].target_residency_ns > avg_ns)
-					idx = teo_find_shallower_state(drv, dev,
-								       idx, avg_ns);
+				break;
 			}
+
+			first_suitable_span_ns = span_ns;
+			first_suitable_idx = i;
 		}
 	}
 
 	/*
+	 * If there is a latency constraint, it may be necessary to select an
+	 * idle state shallower than the current candidate one.
+	 */
+	if (idx > constraint_idx)
+		idx = constraint_idx;
+
+end:
+	/*
 	 * Don't stop the tick if the selected state is a polling one or if the
 	 * expected idle duration is shorter than the tick period length.
 	 */
@@ -428,7 +467,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 * till the closest timer including the tick, try to correct
 		 * that.
 		 */
-		if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick)
+		if (idx > idx0 &&
+		    drv->states[idx].target_residency_ns > delta_tick)
 			idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
 	}
 
@@ -472,8 +512,8 @@ static int teo_enable_device(struct cpuidle_driver *drv,
 
 	memset(cpu_data, 0, sizeof(*cpu_data));
 
-	for (i = 0; i < INTERVALS; i++)
-		cpu_data->intervals[i] = U64_MAX;
+	for (i = 0; i < NR_RECENT; i++)
+		cpu_data->recent_idx[i] = -1;
 
 	return 0;
 }
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index cdeedbf02646..2b496a53cbca 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -18,14 +18,6 @@
 
 #include "cpuidle.h"
 
-static unsigned int sysfs_switch;
-static int __init cpuidle_sysfs_setup(char *unused)
-{
-	sysfs_switch = 1;
-	return 1;
-}
-__setup("cpuidle_sysfs_switch", cpuidle_sysfs_setup);
-
 static ssize_t show_available_governors(struct device *dev,
 					struct device_attribute *attr,
 					char *buf)
@@ -35,10 +27,10 @@ static ssize_t show_available_governors(struct device *dev,
 
 	mutex_lock(&cpuidle_lock);
 	list_for_each_entry(tmp, &cpuidle_governors, governor_list) {
-		if (i >= (ssize_t) ((PAGE_SIZE/sizeof(char)) -
-				    CPUIDLE_NAME_LEN - 2))
+		if (i >= (ssize_t) (PAGE_SIZE - (CPUIDLE_NAME_LEN + 2)))
 			goto out;
-		i += scnprintf(&buf[i], CPUIDLE_NAME_LEN, "%s ", tmp->name);
+
+		i += scnprintf(&buf[i], CPUIDLE_NAME_LEN + 1, "%s ", tmp->name);
 	}
 
 out:
@@ -85,58 +77,43 @@ static ssize_t store_current_governor(struct device *dev,
 				      struct device_attribute *attr,
 				      const char *buf, size_t count)
 {
-	char gov_name[CPUIDLE_NAME_LEN];
-	int ret = -EINVAL;
-	size_t len = count;
+	char gov_name[CPUIDLE_NAME_LEN + 1];
+	int ret;
 	struct cpuidle_governor *gov;
 
-	if (!len || len >= sizeof(gov_name))
+	ret = sscanf(buf, "%" __stringify(CPUIDLE_NAME_LEN) "s", gov_name);
+	if (ret != 1)
 		return -EINVAL;
 
-	memcpy(gov_name, buf, len);
-	gov_name[len] = '\0';
-	if (gov_name[len - 1] == '\n')
-		gov_name[--len] = '\0';
-
 	mutex_lock(&cpuidle_lock);
-
+	ret = -EINVAL;
 	list_for_each_entry(gov, &cpuidle_governors, governor_list) {
-		if (strlen(gov->name) == len && !strcmp(gov->name, gov_name)) {
+		if (!strncmp(gov->name, gov_name, CPUIDLE_NAME_LEN)) {
 			ret = cpuidle_switch_governor(gov);
 			break;
 		}
 	}
-
 	mutex_unlock(&cpuidle_lock);
 
-	if (ret)
-		return ret;
-	else
-		return count;
+	return ret ? ret : count;
 }
 
-static DEVICE_ATTR(current_driver, 0444, show_current_driver, NULL);
-static DEVICE_ATTR(current_governor_ro, 0444, show_current_governor, NULL);
-
-static struct attribute *cpuidle_default_attrs[] = {
-	&dev_attr_current_driver.attr,
-	&dev_attr_current_governor_ro.attr,
-	NULL
-};
-
 static DEVICE_ATTR(available_governors, 0444, show_available_governors, NULL);
+static DEVICE_ATTR(current_driver, 0444, show_current_driver, NULL);
 static DEVICE_ATTR(current_governor, 0644, show_current_governor,
-		   store_current_governor);
+				   store_current_governor);
+static DEVICE_ATTR(current_governor_ro, 0444, show_current_governor, NULL);
 
-static struct attribute *cpuidle_switch_attrs[] = {
+static struct attribute *cpuidle_attrs[] = {
 	&dev_attr_available_governors.attr,
 	&dev_attr_current_driver.attr,
 	&dev_attr_current_governor.attr,
+	&dev_attr_current_governor_ro.attr,
 	NULL
 };
 
 static struct attribute_group cpuidle_attr_group = {
-	.attrs = cpuidle_default_attrs,
+	.attrs = cpuidle_attrs,
 	.name = "cpuidle",
 };
 
@@ -146,9 +123,6 @@ static struct attribute_group cpuidle_attr_group = {
  */
 int cpuidle_add_interface(struct device *dev)
 {
-	if (sysfs_switch)
-		cpuidle_attr_group.attrs = cpuidle_switch_attrs;
-
 	return sysfs_create_group(&dev->kobj, &cpuidle_attr_group);
 }
 
@@ -167,11 +141,6 @@ struct cpuidle_attr {
 	ssize_t (*store)(struct cpuidle_device *, const char *, size_t count);
 };
 
-#define define_one_ro(_name, show) \
-	static struct cpuidle_attr attr_##_name = __ATTR(_name, 0444, show, NULL)
-#define define_one_rw(_name, show, store) \
-	static struct cpuidle_attr attr_##_name = __ATTR(_name, 0644, show, store)
-
 #define attr_to_cpuidleattr(a) container_of(a, struct cpuidle_attr, attr)
 
 struct cpuidle_device_kobj {
@@ -287,6 +256,7 @@ define_show_state_time_function(exit_latency)
 define_show_state_time_function(target_residency)
 define_show_state_function(power_usage)
 define_show_state_ull_function(usage)
+define_show_state_ull_function(rejected)
 define_show_state_str_function(name)
 define_show_state_str_function(desc)
 define_show_state_ull_function(above)
@@ -343,6 +313,7 @@ define_one_state_ro(latency, show_state_exit_latency);
 define_one_state_ro(residency, show_state_target_residency);
 define_one_state_ro(power, show_state_power_usage);
 define_one_state_ro(usage, show_state_usage);
+define_one_state_ro(rejected, show_state_rejected);
 define_one_state_ro(time, show_state_time);
 define_one_state_rw(disable, show_state_disable, store_state_disable);
 define_one_state_ro(above, show_state_above);
@@ -356,6 +327,7 @@ static struct attribute *cpuidle_state_default_attrs[] = {
 	&attr_residency.attr,
 	&attr_power.attr,
 	&attr_usage.attr,
+	&attr_rejected.attr,
 	&attr_time.attr,
 	&attr_disable.attr,
 	&attr_above.attr,
@@ -363,6 +335,7 @@ static struct attribute *cpuidle_state_default_attrs[] = {
 	&attr_default_status.attr,
 	NULL
 };
+ATTRIBUTE_GROUPS(cpuidle_state_default);
 
 struct cpuidle_state_kobj {
 	struct cpuidle_state *state;
@@ -431,12 +404,12 @@ static inline void cpuidle_remove_s2idle_attr_group(struct cpuidle_state_kobj *k
 #define attr_to_stateattr(a) container_of(a, struct cpuidle_state_attr, attr)
 
 static ssize_t cpuidle_state_show(struct kobject *kobj, struct attribute *attr,
-				  char * buf)
+				  char *buf)
 {
 	int ret = -EIO;
 	struct cpuidle_state *state = kobj_to_state(kobj);
 	struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj);
-	struct cpuidle_state_attr * cattr = attr_to_stateattr(attr);
+	struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
 
 	if (cattr->show)
 		ret = cattr->show(state, state_usage, buf);
@@ -476,7 +449,7 @@ static void cpuidle_state_sysfs_release(struct kobject *kobj)
 
 static struct kobj_type ktype_state_cpuidle = {
 	.sysfs_ops = &cpuidle_state_sysfs_ops,
-	.default_attrs = cpuidle_state_default_attrs,
+	.default_groups = cpuidle_state_default_groups,
 	.release = cpuidle_state_sysfs_release,
 };
 
@@ -515,6 +488,7 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device)
 		ret = kobject_init_and_add(&kobj->kobj, &ktype_state_cpuidle,
 					   &kdev->kobj, "state%d", i);
 		if (ret) {
+			kobject_put(&kobj->kobj);
 			kfree(kobj);
 			goto error_state;
 		}
@@ -532,7 +506,7 @@ error_state:
 }
 
 /**
- * cpuidle_remove_driver_sysfs - removes the cpuidle states sysfs attributes
+ * cpuidle_remove_state_sysfs - removes the cpuidle states sysfs attributes
  * @device: the target device
  */
 static void cpuidle_remove_state_sysfs(struct cpuidle_device *device)
@@ -618,10 +592,11 @@ static struct attribute *cpuidle_driver_default_attrs[] = {
 	&attr_driver_name.attr,
 	NULL
 };
+ATTRIBUTE_GROUPS(cpuidle_driver_default);
 
 static struct kobj_type ktype_driver_cpuidle = {
 	.sysfs_ops = &cpuidle_driver_sysfs_ops,
-	.default_attrs = cpuidle_driver_default_attrs,
+	.default_groups = cpuidle_driver_default_groups,
 	.release = cpuidle_driver_sysfs_release,
 };
 
@@ -646,6 +621,7 @@ static int cpuidle_add_driver_sysfs(struct cpuidle_device *dev)
 	ret = kobject_init_and_add(&kdrv->kobj, &ktype_driver_cpuidle,
 				   &kdev->kobj, "driver");
 	if (ret) {
+		kobject_put(&kdrv->kobj);
 		kfree(kdrv);
 		return ret;
 	}
@@ -733,17 +709,18 @@ int cpuidle_add_sysfs(struct cpuidle_device *dev)
 	if (!kdev)
 		return -ENOMEM;
 	kdev->dev = dev;
-	dev->kobj_dev = kdev;
 
 	init_completion(&kdev->kobj_unregister);
 
 	error = kobject_init_and_add(&kdev->kobj, &ktype_cpuidle, &cpu_dev->kobj,
 				   "cpuidle");
 	if (error) {
+		kobject_put(&kdev->kobj);
 		kfree(kdev);
 		return error;
 	}
 
+	dev->kobj_dev = kdev;
 	kobject_uevent(&kdev->kobj, KOBJ_ADD);
 
 	return 0;