From af3a1b6a18258168f09e4ebc014b368bb0e845b0 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 17 Mar 2025 15:55:40 +0200 Subject: Documentation: admin-guide: pm: Document intel_idle C1 demotion Document the intel_idle driver sysfs file for enabling/disabling C1 demotion. Signed-off-by: Artem Bityutskiy Link: https://patch.msgid.link/20250317135541.1471754-3-dedekind1@gmail.com Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/intel_idle.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst index 5940528146eb..ed6f055d4b14 100644 --- a/Documentation/admin-guide/pm/intel_idle.rst +++ b/Documentation/admin-guide/pm/intel_idle.rst @@ -38,6 +38,27 @@ instruction at all. only way to pass early-configuration-time parameters to it is via the kernel command line. +Sysfs Interface +=============== + +The ``intel_idle`` driver exposes the following ``sysfs`` attributes in +``/sys/devices/system/cpu/cpuidle/``: + +``intel_c1_demotion`` + Enable or disable C1 demotion for all CPUs in the system. This file is + only exposed on platforms that support the C1 demotion feature and where + it was tested. Value 0 means that C1 demotion is disabled, value 1 means + that it is enabled. Write 0 or 1 to disable or enable C1 demotion for + all CPUs. + + The C1 demotion feature involves the platform firmware demoting deep + C-state requests from the OS (e.g., C6 requests) to C1. The idea is that + firmware monitors CPU wake-up rate, and if it is higher than a + platform-specific threshold, the firmware demotes deep C-state requests + to C1. For example, Linux requests C6, but firmware noticed too many + wake-ups per second, and it keeps the CPU in C1. When the CPU stays in + C1 long enough, the platform promotes it back to C6. This may improve + some workloads' performance, but it may also increase power consumption. .. _intel-idle-enumeration-of-states: -- cgit v1.2.3-59-g8ed1b From e80e1342eabaac86da02f2c74fffc010e4f68c55 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 17 Mar 2025 15:55:41 +0200 Subject: Documentation: ABI: testing: document the new cpuidle sysfs file Mention the new 'intel_c1_demotion' sysfs file in the "cpuidle" section and refer to "Documentation/admin-guide/pm/intel_idle.rst" for more information. Signed-off-by: Artem Bityutskiy Link: https://patch.msgid.link/20250317135541.1471754-4-dedekind1@gmail.com Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-devices-system-cpu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 206079d3bd5b..6c4934d3f4db 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -111,6 +111,7 @@ What: /sys/devices/system/cpu/cpuidle/available_governors /sys/devices/system/cpu/cpuidle/current_driver /sys/devices/system/cpu/cpuidle/current_governor /sys/devices/system/cpu/cpuidle/current_governer_ro + /sys/devices/system/cpu/cpuidle/intel_c1_demotion Date: September 2007 Contact: Linux kernel mailing list Description: Discover cpuidle policy and mechanism @@ -132,7 +133,11 @@ Description: Discover cpuidle policy and mechanism current_governor_ro: (RO) displays current idle policy. - See Documentation/admin-guide/pm/cpuidle.rst and + intel_c1_demotion: (RW) enables/disables the C1 demotion + feature on Intel CPUs. + + See Documentation/admin-guide/pm/cpuidle.rst, + Documentation/admin-guide/pm/intel_idle.rst, and Documentation/driver-api/pm/cpuidle.rst for more information. -- cgit v1.2.3-59-g8ed1b From 50c9bb30dc1f9731995a191deafbc49be717053e Mon Sep 17 00:00:00 2001 From: Zihuan Zhang Date: Wed, 7 May 2025 14:35:20 +0800 Subject: PM: hibernate: add configurable delay for pm_test Turn the default 5 second test delay for hibernation into a configurable module parameter, so users can determine how long to wait in this pseudo-hibernate state before resuming the system. The configurable delay parameter has been added for suspend, so add an analogous one for hibernation. Example (wait 30 seconds); # echo 30 > /sys/module/hibernate/parameters/pm_test_delay # echo core > /sys/power/pm_test Signed-off-by: Zihuan Zhang Reviewed-by: Randy Dunlap Link: https://patch.msgid.link/20250507063520.419635-1-zhangzihuan@kylinos.cn [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ kernel/power/hibernate.c | 9 +++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 76e538c77e31..ccd61e22403f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1831,6 +1831,13 @@ lz4: Select LZ4 compression algorithm to compress/decompress hibernation image. + hibernate.pm_test_delay= + [HIBERNATION] + Sets the number of seconds to remain in a hibernation test + mode before resuming the system (see + /sys/power/pm_test). Only available when CONFIG_PM_DEBUG + is set. Default value is 5. + highmem=nn[KMG] [KNL,BOOT,EARLY] forces the highmem zone to have an exact size of . This works even on boxes that have no highmem otherwise. This also works to reduce highmem diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f0db9d1896e8..f4db2e82fd87 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -133,10 +133,15 @@ bool system_entering_hibernation(void) EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from hibernation test"); static void hibernation_debug_sleep(void) { - pr_info("debug: Waiting for 5 seconds.\n"); - mdelay(5000); + pr_info("hibernation debug: Waiting for %d second(s).\n", + pm_test_delay); + mdelay(pm_test_delay * 1000); } static int hibernation_test(int level) -- cgit v1.2.3-59-g8ed1b From 7330e002c050eea6a47dfa57c7f128af2e8dc592 Mon Sep 17 00:00:00 2001 From: Moon Hee Lee Date: Tue, 6 May 2025 15:00:57 -0700 Subject: PM: EM: Documentation: fix typo in energy-model.rst Fixes a grammar issue ("than" -> "then") and changes "re-use" to "reuse" for consistency with modern spelling. Signed-off-by: Moon Hee Lee Link: https://patch.msgid.link/20250506220057.5589-1-moonhee.lee.ca@gmail.com [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- Documentation/power/energy-model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst index ada4938c37e5..490ddd483f46 100644 --- a/Documentation/power/energy-model.rst +++ b/Documentation/power/energy-model.rst @@ -230,7 +230,7 @@ Drivers must provide a pointer to the allocated and initialized new EM and will be visible to other sub-systems in the kernel (thermal, powercap). The main design goal for this API is to be fast and avoid extra calculations or memory allocations at runtime. When pre-computed EMs are available in the -device driver, than it should be possible to simply re-use them with low +device driver, then it should be possible to simply reuse them with low performance overhead. In order to free the EM, provided earlier by the driver (e.g. when the module -- cgit v1.2.3-59-g8ed1b From c9b83cbe46c645712988a53ce513aaf56728dcd3 Mon Sep 17 00:00:00 2001 From: Atul Kumar Pant Date: Sun, 11 May 2025 12:41:41 +0530 Subject: PM: EM: Documentation: Fix typos in example driver code Fix the API name to free the allocated table in the example driver code that modifies the EM. Also fix the passing of correct table when updating the cost. Signed-off-by: Atul Kumar Pant Link: https://patch.msgid.link/20250511071141.13237-1-atulpant.linux@gmail.com Signed-off-by: Rafael J. Wysocki --- Documentation/power/energy-model.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst index 490ddd483f46..cbdf7520aaa6 100644 --- a/Documentation/power/energy-model.rst +++ b/Documentation/power/energy-model.rst @@ -381,17 +381,17 @@ up periodically to check the temperature and modify the EM data:: 26 rcu_read_unlock(); 27 28 /* Calculate 'cost' values for EAS */ - 29 ret = em_dev_compute_costs(dev, table, pd->nr_perf_states); + 29 ret = em_dev_compute_costs(dev, new_table, pd->nr_perf_states); 30 if (ret) { 31 dev_warn(dev, "EM: compute costs failed %d\n", ret); - 32 em_free_table(em_table); + 32 em_table_free(em_table); 33 return; 34 } 35 36 ret = em_dev_update_perf_domain(dev, em_table); 37 if (ret) { 38 dev_warn(dev, "EM: update failed %d\n", ret); - 39 em_free_table(em_table); + 39 em_table_free(em_table); 40 return; 41 } 42 -- cgit v1.2.3-59-g8ed1b From f20af84c29b222f68df53bfa72dcfbd4a3491603 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 May 2025 22:48:59 +0200 Subject: cpufreq: intel_pstate: Document hybrid processor support Describe the support for hybrid processors in intel_pstate, including the CAS and EAS support, in the admin-guide documentation. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/1935040.CQOukoFCf9@rjwysocki.net --- Documentation/admin-guide/pm/intel_pstate.rst | 104 +++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index 78fc83ed2a7e..26e702c7016e 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -329,6 +329,106 @@ information listed above is the same for all of the processors supporting the HWP feature, which is why ``intel_pstate`` works with all of them.] +Support for Hybrid Processors +============================= + +Some processors supported by ``intel_pstate`` contain two or more types of CPU +cores differing by the maximum turbo P-state, performance vs power characteristics, +cache sizes, and possibly other properties. They are commonly referred to as +hybrid processors. To support them, ``intel_pstate`` requires HWP to be enabled +and it assumes the HWP performance units to be the same for all CPUs in the +system, so a given HWP performance level always represents approximately the +same physical performance regardless of the core (CPU) type. + +Hybrid Processors with SMT +-------------------------- + +On systems where SMT (Simultaneous Multithreading), also referred to as +HyperThreading (HT) in the context of Intel processors, is enabled on at least +one core, ``intel_pstate`` assigns performance-based priorities to CPUs. Namely, +the priority of a given CPU reflects its highest HWP performance level which +causes the CPU scheduler to generally prefer more performant CPUs, so the less +performant CPUs are used when the other ones are fully loaded. However, SMT +siblings (that is, logical CPUs sharing one physical core) are treated in a +special way such that if one of them is in use, the effective priority of the +other ones is lowered below the priorities of the CPUs located in the other +physical cores. + +This approach maximizes performance in the majority of cases, but unfortunately +it also leads to excessive energy usage in some important scenarios, like video +playback, which is not generally desirable. While there is no other viable +choice with SMT enabled because the effective capacity and utilization of SMT +siblings are hard to determine, hybrid processors without SMT can be handled in +more energy-efficient ways. + +.. _CAS: + +Capacity-Aware Scheduling Support +--------------------------------- + +The capacity-aware scheduling (CAS) support in the CPU scheduler is enabled by +``intel_pstate`` by default on hybrid processors without SMT. CAS generally +causes the scheduler to put tasks on a CPU so long as there is a sufficient +amount of spare capacity on it, and if the utilization of a given task is too +high for it, the task will need to go somewhere else. + +Since CAS takes CPU capacities into account, it does not require CPU +prioritization and it allows tasks to be distributed more symmetrically among +the more performant and less performant CPUs. Once placed on a CPU with enough +capacity to accommodate it, a task may just continue to run there regardless of +whether or not the other CPUs are fully loaded, so on average CAS reduces the +utilization of the more performant CPUs which causes the energy usage to be more +balanced because the more performant CPUs are generally less energy-efficient +than the less performant ones. + +In order to use CAS, the scheduler needs to know the capacity of each CPU in +the system and it needs to be able to compute scale-invariant utilization of +CPUs, so ``intel_pstate`` provides it with the requisite information. + +First of all, the capacity of each CPU is represented by the ratio of its highest +HWP performance level, multiplied by 1024, to the highest HWP performance level +of the most performant CPU in the system, which works because the HWP performance +units are the same for all CPUs. Second, the frequency-invariance computations, +carried out by the scheduler to always express CPU utilization in the same units +regardless of the frequency it is currently running at, are adjusted to take the +CPU capacity into account. All of this happens when ``intel_pstate`` has +registered itself with the ``CPUFreq`` core and it has figured out that it is +running on a hybrid processor without SMT. + +Energy-Aware Scheduling Support +------------------------------- + +If ``CONFIG_ENERGY_MODEL`` has been set during kernel configuration and +``intel_pstate`` runs on a hybrid processor without SMT, in addition to enabling +`CAS `_ it registers an Energy Model for the processor. This allows the +Energy-Aware Scheduling (EAS) support to be enabled in the CPU scheduler if +``schedutil`` is used as the ``CPUFreq`` governor which requires ``intel_pstate`` +to operate in the `passive mode `_. + +The Energy Model registered by ``intel_pstate`` is artificial (that is, it is +based on abstract cost values and it does not include any real power numbers) +and it is relatively simple to avoid unnecessary computations in the scheduler. +There is a performance domain in it for every CPU in the system and the cost +values for these performance domains have been chosen so that running a task on +a less performant (small) CPU appears to be always cheaper than running that +task on a more performant (big) CPU. However, for two CPUs of the same type, +the cost difference depends on their current utilization, and the CPU whose +current utilization is higher generally appears to be a more expensive +destination for a given task. This helps to balance the load among CPUs of the +same type. + +Since EAS works on top of CAS, high-utilization tasks are always migrated to +CPUs with enough capacity to accommodate them, but thanks to EAS, low-utilization +tasks tend to be placed on the CPUs that look less expensive to the scheduler. +Effectively, this causes the less performant and less loaded CPUs to be +preferred as long as they have enough spare capacity to run the given task +which generally leads to reduced energy usage. + +The Energy Model created by ``intel_pstate`` can be inspected by looking at +the ``energy_model`` directory in ``debugfs`` (typlically mounted on +``/sys/kernel/debug/``). + + User Space Interface in ``sysfs`` ================================= @@ -697,8 +797,8 @@ of them have to be prepended with the ``intel_pstate=`` prefix. Limits`_ for details). ``no_cas`` - Do not enable capacity-aware scheduling (CAS) which is enabled by - default on hybrid systems. + Do not enable `capacity-aware scheduling `_ which is enabled by + default on hybrid systems without SMT. Diagnostics and Tuning ====================== -- cgit v1.2.3-59-g8ed1b From 922607a2b462b813c4b461feca04aed0c97d4cfe Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Wed, 7 May 2025 11:19:41 +0800 Subject: cpufreq: CPPC: Add support for autonomous selection Add sysfs interfaces for CPPC autonomous selection in the cppc_cpufreq driver. Signed-off-by: Lifeng Zheng Reviewed-by: Sumit Gupta Acked-by: Viresh Kumar Link: https://patch.msgid.link/20250507031941.2812701-1-zhenglifeng1@huawei.com [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-devices-system-cpu | 54 ++++++++++ drivers/cpufreq/cppc_cpufreq.c | 109 +++++++++++++++++++++ 2 files changed, 163 insertions(+) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 206079d3bd5b..37065e1b8ebc 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -268,6 +268,60 @@ Description: Discover CPUs in the same CPU frequency coordination domain This file is only present if the acpi-cpufreq or the cppc-cpufreq drivers are in use. +What: /sys/devices/system/cpu/cpuX/cpufreq/auto_select +Date: May 2025 +Contact: linux-pm@vger.kernel.org +Description: Autonomous selection enable + + Read/write interface to control autonomous selection enable + Read returns autonomous selection status: + 0: autonomous selection is disabled + 1: autonomous selection is enabled + + Write 'y' or '1' or 'on' to enable autonomous selection. + Write 'n' or '0' or 'off' to disable autonomous selection. + + This file is only present if the cppc-cpufreq driver is in use. + +What: /sys/devices/system/cpu/cpuX/cpufreq/auto_act_window +Date: May 2025 +Contact: linux-pm@vger.kernel.org +Description: Autonomous activity window + + This file indicates a moving utilization sensitivity window to + the platform's autonomous selection policy. + + Read/write an integer represents autonomous activity window (in + microseconds) from/to this file. The max value to write is + 1270000000 but the max significand is 127. This means that if 128 + is written to this file, 127 will be stored. If the value is + greater than 130, only the first two digits will be saved as + significand. + + Writing a zero value to this file enable the platform to + determine an appropriate Activity Window depending on the workload. + + Writing to this file only has meaning when Autonomous Selection is + enabled. + + This file is only present if the cppc-cpufreq driver is in use. + +What: /sys/devices/system/cpu/cpuX/cpufreq/energy_performance_preference_val +Date: May 2025 +Contact: linux-pm@vger.kernel.org +Description: Energy performance preference + + Read/write an 8-bit integer from/to this file. This file + represents a range of values from 0 (performance preference) to + 0xFF (energy efficiency preference) that influences the rate of + performance increase/decrease and the result of the hardware's + energy efficiency and performance optimization policies. + + Writing to this file only has meaning when Autonomous Selection is + enabled. + + This file is only present if the cppc-cpufreq driver is in use. + What: /sys/devices/system/cpu/cpu*/cache/index3/cache_disable_{0,1} Date: August 2008 diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index cb93f00bafdb..b7c688a5659c 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -808,10 +808,119 @@ static ssize_t show_freqdomain_cpus(struct cpufreq_policy *policy, char *buf) return cpufreq_show_cpus(cpu_data->shared_cpu_map, buf); } + +static ssize_t show_auto_select(struct cpufreq_policy *policy, char *buf) +{ + bool val; + int ret; + + ret = cppc_get_auto_sel(policy->cpu, &val); + + /* show "" when this register is not supported by cpc */ + if (ret == -EOPNOTSUPP) + return sysfs_emit(buf, "\n"); + + if (ret) + return ret; + + return sysfs_emit(buf, "%d\n", val); +} + +static ssize_t store_auto_select(struct cpufreq_policy *policy, + const char *buf, size_t count) +{ + bool val; + int ret; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + ret = cppc_set_auto_sel(policy->cpu, val); + if (ret) + return ret; + + return count; +} + +static ssize_t show_auto_act_window(struct cpufreq_policy *policy, char *buf) +{ + u64 val; + int ret; + + ret = cppc_get_auto_act_window(policy->cpu, &val); + + /* show "" when this register is not supported by cpc */ + if (ret == -EOPNOTSUPP) + return sysfs_emit(buf, "\n"); + + if (ret) + return ret; + + return sysfs_emit(buf, "%llu\n", val); +} + +static ssize_t store_auto_act_window(struct cpufreq_policy *policy, + const char *buf, size_t count) +{ + u64 usec; + int ret; + + ret = kstrtou64(buf, 0, &usec); + if (ret) + return ret; + + ret = cppc_set_auto_act_window(policy->cpu, usec); + if (ret) + return ret; + + return count; +} + +static ssize_t show_energy_performance_preference_val(struct cpufreq_policy *policy, char *buf) +{ + u64 val; + int ret; + + ret = cppc_get_epp_perf(policy->cpu, &val); + + /* show "" when this register is not supported by cpc */ + if (ret == -EOPNOTSUPP) + return sysfs_emit(buf, "\n"); + + if (ret) + return ret; + + return sysfs_emit(buf, "%llu\n", val); +} + +static ssize_t store_energy_performance_preference_val(struct cpufreq_policy *policy, + const char *buf, size_t count) +{ + u64 val; + int ret; + + ret = kstrtou64(buf, 0, &val); + if (ret) + return ret; + + ret = cppc_set_epp(policy->cpu, val); + if (ret) + return ret; + + return count; +} + cpufreq_freq_attr_ro(freqdomain_cpus); +cpufreq_freq_attr_rw(auto_select); +cpufreq_freq_attr_rw(auto_act_window); +cpufreq_freq_attr_rw(energy_performance_preference_val); static struct freq_attr *cppc_cpufreq_attr[] = { &freqdomain_cpus, + &auto_select, + &auto_act_window, + &energy_performance_preference_val, NULL, }; -- cgit v1.2.3-59-g8ed1b