From 2bc756e7dde20972300bb8e2e46dd430d6d2d5db Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 2 Mar 2017 23:22:29 +0100 Subject: cpufreq: intel_pstate: Do not use performance_limits in passive mode Using performance_limits in the passive mode doesn't make sense, because in that mode the global limits are applied to the frequency selected by the scaling governor. The maximum and minimum P-state limits in performance_limits are both set to 100 percent which will put all CPUs into the turbo range regardless of what governor is used and what frequencies are selected by it (that is particularly undesirable on CPUs with the generic powersave governor attached). For this reason, make intel_pstate_register_driver() always point limits to powersave_limits in the passive mode. Fixes: 001c76f05b01 (cpufreq: intel_pstate: Generic governors support) Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9ee13f195c37..7fc1f8a0ef44 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2442,8 +2442,11 @@ static int intel_pstate_register_driver(void) intel_pstate_init_limits(&powersave_limits); intel_pstate_set_performance_limits(&performance_limits); - limits = IS_ENABLED(CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE) ? - &performance_limits : &powersave_limits; + if (IS_ENABLED(CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE) && + intel_pstate_driver == &intel_pstate) + limits = &performance_limits; + else + limits = &powersave_limits; ret = cpufreq_register_driver(intel_pstate_driver); if (ret) { -- cgit v1.2.3-59-g8ed1b From 7f17326fc0b69afbda7aed6c4ce8e2328b74203b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 2 Mar 2017 23:24:36 +0100 Subject: cpufreq: intel_pstate: Fix intel_cpufreq_verify_policy() The intel_pstate_update_perf_limits() called from intel_cpufreq_verify_policy() may cause global P-state limits to change which is generally confusing and unnecessary. In the passive mode the global limits are only applied to the frequency selected by the scaling governor (they are not taken into account by governors when making decisions anyway), so making them follow the per-policy limits serves no purpose and may go against user expectations (as it generally causes the global attributes in sysfs to change even though they have not been written to in some cases). Fix that by dropping the intel_pstate_update_perf_limits() invocation from intel_cpufreq_verify_policy() (which also reduces the code size by a few lines). This change does not affect the per-CPU limits case, because those limits allow any P-state to be set by default in the passive mode and it removes the only piece of code updating them in that mode, so the per-policy settings will be the only ones taken into account in that case as expected. Fixes: 001c76f05b01 (cpufreq: intel_pstate: Generic governors support) Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 7fc1f8a0ef44..04b2aa162276 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2306,7 +2306,6 @@ static struct cpufreq_driver intel_pstate = { static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy) { struct cpudata *cpu = all_cpu_data[policy->cpu]; - struct perf_limits *perf_limits = limits; update_turbo_state(); policy->cpuinfo.max_freq = limits->turbo_disabled ? @@ -2314,15 +2313,6 @@ static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy) cpufreq_verify_within_cpu_limits(policy); - if (per_cpu_limits) - perf_limits = cpu->perf_limits; - - mutex_lock(&intel_pstate_limits_lock); - - intel_pstate_update_perf_limits(policy, perf_limits); - - mutex_unlock(&intel_pstate_limits_lock); - return 0; } -- cgit v1.2.3-59-g8ed1b From 6407829901f074cf6beef566d6af9a0b0e238f0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Mar 2017 23:51:31 +0100 Subject: cpufreq: intel_pstate: Avoid triggering cpu_frequency tracepoint unnecessarily In the passive mode the cpu_frequency trace event is already triggered by the cpufreq core or by scaling governors, so intel_pstate should not trigger it once again for the same P-state updates. In addition to that, the frequency returned by intel_cpufreq_fast_switch() and passed via freqs.new from intel_cpufreq_target() to cpufreq_freq_transition_end() should reflect the P-state actually set, so make that happen. Fixes: 001c76f05b01 (cpufreq: intel_pstate: Generic governors support) Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 04b2aa162276..5e066b332598 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1879,13 +1879,11 @@ static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate) intel_pstate_get_min_max(cpu, &min_perf, &max_perf); pstate = clamp_t(int, pstate, min_perf, max_perf); - trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); return pstate; } static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) { - pstate = intel_pstate_prepare_request(cpu, pstate); if (pstate == cpu->pstate.current_pstate) return; @@ -1905,6 +1903,8 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) update_turbo_state(); + target_pstate = intel_pstate_prepare_request(cpu, target_pstate); + trace_cpu_frequency(target_pstate * cpu->pstate.scaling, cpu->cpu); intel_pstate_update_pstate(cpu, target_pstate); sample = &cpu->sample; @@ -2365,6 +2365,7 @@ static int intel_cpufreq_target(struct cpufreq_policy *policy, wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, target_pstate)); } + freqs.new = target_pstate * cpu->pstate.scaling; cpufreq_freq_transition_end(policy, &freqs, false); return 0; @@ -2378,8 +2379,9 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, target_freq = intel_cpufreq_turbo_update(cpu, policy, target_freq); target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); + target_pstate = intel_pstate_prepare_request(cpu, target_pstate); intel_pstate_update_pstate(cpu, target_pstate); - return target_freq; + return target_pstate * cpu->pstate.scaling; } static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) -- cgit v1.2.3-59-g8ed1b From d82f26925599cae83c38d17d07ae982356e81318 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 28 Feb 2017 16:44:16 -0500 Subject: cpufreq: Add the "cpufreq.off=1" cmdline option Add the "cpufreq.off=1" cmdline option. At boot-time, this allows a user to request CONFIG_CPU_FREQ=n behavior from a kernel built with CONFIG_CPU_FREQ=y. This is analogous to the existing "cpuidle.off=1" option and CONFIG_CPU_IDLE=y This capability is valuable when we need to debug end-user issues in the BIOS or in Linux. It is also convenient for enabling comparisons, which may otherwise require a new kernel, or help from BIOS SETUP, which may be buggy or unavailable. Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ drivers/cpufreq/cpufreq.c | 1 + 2 files changed, 4 insertions(+) (limited to 'drivers/cpufreq') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index be7c0d9506b1..3988d2311f97 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -662,6 +662,9 @@ cpuidle.off=1 [CPU_IDLE] disable the cpuidle sub-system + cpufreq.off=1 [CPU_FREQ] + disable the cpufreq sub-system + cpu_init_udelay=N [X86] Delay for N microsec between assert and de-assert of APIC INIT to start processors. This delay occurs diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 80a785ad17e8..7790db2645d7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2532,4 +2532,5 @@ static int __init cpufreq_core_init(void) return 0; } +module_param(off, int, 0444); core_initcall(cpufreq_core_init); -- cgit v1.2.3-59-g8ed1b From cd59b4bed9d11a2aefc4bb44eed9de0e6c1eea06 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 1 Mar 2017 00:07:36 +0100 Subject: cpufreq: intel_pstate: Fix global settings in active mode Commit 111b8b3fe4fa (cpufreq: intel_pstate: Always keep all limits settings in sync) changed intel_pstate to invoke cpufreq_update_policy() for every registered CPU on global sysfs attributes updates, but that led to undesirable effects in the active mode if the "performance" P-state selection algorithm is configufred for one CPU and the "powersave" one is chosen for all of the other CPUs. Namely, in that case, the following is possible: # cd /sys/devices/system/cpu/ # cat intel_pstate/max_perf_pct 100 # cat intel_pstate/min_perf_pct 26 # echo performance > cpufreq/policy0/scaling_governor # cat intel_pstate/max_perf_pct 100 # cat intel_pstate/min_perf_pct 100 # echo 94 > intel_pstate/min_perf_pct # cat intel_pstate/min_perf_pct 26 The reason why this happens is because intel_pstate attempts to maintain two sets of global limits in the active mode, one for the "performance" P-state selection algorithm and one for the "powersave" P-state selection algorithm, but the P-state selection algorithms are set per policy, so the global limits cannot reflect all of them at the same time if they are different for different policies. In the particular situation above, the attempt to change min_perf_pct to 94 caused cpufreq_update_policy() to be run for a CPU with the "powersave" P-state selection algorithm and intel_pstate_set_policy() called by it silently switched the global limits to the "powersave" set which finally was reflected by the sysfs interface. To prevent that from happening, modify intel_pstate_update_policies() to always switch back to the set of limits that was used right before it has been invoked. Fixes: 111b8b3fe4fa (cpufreq: intel_pstate: Always keep all limits settings in sync) Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 5e066b332598..436a4c5ba70d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -973,11 +973,20 @@ static int intel_pstate_resume(struct cpufreq_policy *policy) } static void intel_pstate_update_policies(void) + __releases(&intel_pstate_limits_lock) + __acquires(&intel_pstate_limits_lock) { + struct perf_limits *saved_limits = limits; int cpu; + mutex_unlock(&intel_pstate_limits_lock); + for_each_possible_cpu(cpu) cpufreq_update_policy(cpu); + + mutex_lock(&intel_pstate_limits_lock); + + limits = saved_limits; } /************************** debugfs begin ************************/ @@ -1185,10 +1194,10 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, limits->no_turbo = clamp_t(int, input, 0, 1); - mutex_unlock(&intel_pstate_limits_lock); - intel_pstate_update_policies(); + mutex_unlock(&intel_pstate_limits_lock); + mutex_unlock(&intel_pstate_driver_lock); return count; @@ -1222,10 +1231,10 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, limits->max_perf_pct); limits->max_perf = div_ext_fp(limits->max_perf_pct, 100); - mutex_unlock(&intel_pstate_limits_lock); - intel_pstate_update_policies(); + mutex_unlock(&intel_pstate_limits_lock); + mutex_unlock(&intel_pstate_driver_lock); return count; @@ -1259,10 +1268,10 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, limits->min_perf_pct); limits->min_perf = div_ext_fp(limits->min_perf_pct, 100); - mutex_unlock(&intel_pstate_limits_lock); - intel_pstate_update_policies(); + mutex_unlock(&intel_pstate_limits_lock); + mutex_unlock(&intel_pstate_driver_lock); return count; -- cgit v1.2.3-59-g8ed1b From d74b19929130c9b5395a497030dcf161353cd526 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 1 Mar 2017 00:11:05 +0100 Subject: cpufreq: intel_pstate: Fix intel_pstate_verify_policy() The code added to intel_pstate_verify_policy() by commit 1443ebbacfd7 (cpufreq: intel_pstate: Fix sysfs limits enforcement for performance policy) should use perf_limits instead of limits, because otherwise setting global limits via sysfs may affect policies inconsistently. For example, in the sequence of shell commands below, the scaling_min_freq attribute for policy1 and policy2 should be affected in the same way, because scaling_governor is set in the same way for both of them: # cat cpufreq/policy1/scaling_governor powersave # cat cpufreq/policy2/scaling_governor powersave # echo performance > cpufreq/policy0/scaling_governor # echo 94 > intel_pstate/min_perf_pct # cat cpufreq/policy0/scaling_min_freq 2914000 # cat cpufreq/policy1/scaling_min_freq 2914000 # cat cpufreq/policy2/scaling_min_freq 800000 The are affected differently, because intel_pstate_verify_policy() is invoked with limits set to &performance_limits (left behind by policy0) for policy1 and with limits set to &powersave_limits (left behind by policy1) for policy2. Since perf_limits is set to the set of limits matching the policy being updated, using it instead of limits fixes the inconsistency. Fixes: 1443ebbacfd7 (cpufreq: intel_pstate: Fix sysfs limits enforcement for performance policy) Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 436a4c5ba70d..3dc546601c88 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2212,9 +2212,9 @@ static int intel_pstate_verify_policy(struct cpufreq_policy *policy) unsigned int max_freq, min_freq; max_freq = policy->cpuinfo.max_freq * - limits->max_sysfs_pct / 100; + perf_limits->max_sysfs_pct / 100; min_freq = policy->cpuinfo.max_freq * - limits->min_sysfs_pct / 100; + perf_limits->min_sysfs_pct / 100; cpufreq_verify_within_limits(policy, min_freq, max_freq); } -- cgit v1.2.3-59-g8ed1b From a240c4aa5d0f9c8bb0db380ab9c1ec1f68b923ad Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 2 Mar 2017 23:29:12 +0100 Subject: cpufreq: intel_pstate: Do not reinit performance limits in ->setpolicy If the current P-state selection algorithm is set to "performance" in intel_pstate_set_policy(), the limits may be initialized from scratch, but only if no_turbo is not set and the maximum frequency allowed for the given CPU (i.e. the policy object representing it) is at least equal to the max frequency supported by the CPU. In all of the other cases, the limits will not be updated. For example, the following can happen: # cat intel_pstate/status active # echo performance > cpufreq/policy0/scaling_governor # cat intel_pstate/min_perf_pct 100 # echo 94 > intel_pstate/min_perf_pct # cat intel_pstate/min_perf_pct 100 # cat cpufreq/policy0/scaling_max_freq 3100000 echo 3000000 > cpufreq/policy0/scaling_max_freq # cat intel_pstate/min_perf_pct 94 # echo 95 > intel_pstate/min_perf_pct # cat intel_pstate/min_perf_pct 95 That is confusing for two reasons. First, the initial attempt to change min_perf_pct to 94 seems to have no effect, even though setting the global limits should always work. Second, after changing scaling_max_freq for policy0 the global min_perf_pct attribute shows 94, even though it should have not been affected by that operation in principle. Moreover, the final attempt to change min_perf_pct to 95 worked as expected, because scaling_max_freq for the only policy with scaling_governor equal to "performance" was different from the maximum at that time. To make all that confusion go away, modify intel_pstate_set_policy() so that it doesn't reinitialize the limits at all. At the same time, change intel_pstate_set_performance_limits() to set min_sysfs_pct to 100 in the "performance" limits set so that switching the P-state selection algorithm to "performance" causes intel_pstate/min_perf_pct in sysfs to go to 100 (or whatever value min_sysfs_pct in the "performance" limits is set to later). That requires per-CPU limits to be initialized explicitly rather than by copying the global limits to avoid setting min_sysfs_pct in the per-CPU limits to 100. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 3dc546601c88..f1f8fbe0b4c4 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -382,6 +382,7 @@ static void intel_pstate_set_performance_limits(struct perf_limits *limits) intel_pstate_init_limits(limits); limits->min_perf_pct = 100; limits->min_perf = int_ext_tofp(1); + limits->min_sysfs_pct = 100; } static DEFINE_MUTEX(intel_pstate_driver_lock); @@ -2146,16 +2147,11 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) mutex_lock(&intel_pstate_limits_lock); if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) { + pr_debug("set performance\n"); if (!perf_limits) { limits = &performance_limits; perf_limits = limits; } - if (policy->max >= policy->cpuinfo.max_freq && - !limits->no_turbo) { - pr_debug("set performance\n"); - intel_pstate_set_performance_limits(perf_limits); - goto out; - } } else { pr_debug("set powersave\n"); if (!perf_limits) { @@ -2166,7 +2162,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) } intel_pstate_update_perf_limits(policy, perf_limits); - out: + if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { /* * NOHZ_FULL CPUs need this as the governor callback may not @@ -2257,13 +2253,8 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) cpu = all_cpu_data[policy->cpu]; - /* - * We need sane value in the cpu->perf_limits, so inherit from global - * perf_limits limits, which are seeded with values based on the - * CONFIG_CPU_FREQ_DEFAULT_GOV_*, during boot up. - */ if (per_cpu_limits) - memcpy(cpu->perf_limits, limits, sizeof(struct perf_limits)); + intel_pstate_init_limits(cpu->perf_limits); policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; -- cgit v1.2.3-59-g8ed1b