From f9f4872df6e1801572949f8a370c886122d4b6da Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada
Date: Sat, 8 Oct 2016 12:42:38 -0700
Subject: cpufreq: intel_pstate: Fix unsafe HWP MSR access

It is a requirement that the MSR_PM_ENABLE MSR be set to 0x01 before
MSR_HWP_CAPABILITIES is read on a given CPU. If cpufreq init() is scheduled
on a CPU which is not the same as policy->cpu, or migrates to a different
CPU before the MSR_HWP_CAPABILITIES read, it is possible that MSR_PM_ENABLE
was not set to 0x01 on that CPU, which will cause a GP fault. So, like the
other places on this path, rdmsrl_on_cpu should be used instead of rdmsrl.

Moreover, the scope of MSR_HWP_CAPABILITIES is per thread, so it should be
read on the same CPU for which MSR_HWP_REQUEST is being set.

dmesg dump or warning:

[ 22.014488] WARNING: CPU: 139 PID: 1 at arch/x86/mm/extable.c:50 ex_handler_rdmsr_unsafe+0x68/0x70
[ 22.014492] unchecked MSR access error: RDMSR from 0x771
[ 22.014493] Modules linked in:
[ 22.014507] CPU: 139 PID: 1 Comm: swapper/0 Not tainted 4.7.5+ #1
...
...
[ 22.014516] Call Trace:
[ 22.014542] [] dump_stack+0x63/0x82
[ 22.014558] [] __warn+0xcb/0xf0
[ 22.014561] [] warn_slowpath_fmt+0x4f/0x60
[ 22.014563] [] ex_handler_rdmsr_unsafe+0x68/0x70
[ 22.014564] [] fixup_exception+0x39/0x50
[ 22.014604] [] do_general_protection+0x80/0x150
[ 22.014610] [] general_protection+0x28/0x30
[ 22.014635] [] ? get_target_pstate_use_performance+0xb0/0xb0
[ 22.014642] [] ? native_read_msr+0x7/0x40
[ 22.014657] [] intel_pstate_hwp_set+0x23/0x130
[ 22.014660] [] intel_pstate_set_policy+0x1b6/0x340
[ 22.014662] [] cpufreq_set_policy+0xeb/0x2c0
[ 22.014664] [] cpufreq_init_policy+0x79/0xe0
[ 22.014666] [] ? cpufreq_update_policy+0x120/0x120
[ 22.014669] [] cpufreq_online+0x406/0x820
[ 22.014671] [] cpufreq_add_dev+0x5f/0x90
[ 22.014717] [] subsys_interface_register+0xb8/0x100
[ 22.014719] [] cpufreq_register_driver+0x14c/0x210
[ 22.014749] [] intel_pstate_init+0x39d/0x4d5
[ 22.014751] [] ? cpufreq_gov_dbs_init+0x12/0x12

Cc: 4.3+ # 4.3+
Signed-off-by: Srinivas Pandruvada
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/intel_pstate.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 806f2039571e..bc976a3db253 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -562,12 +562,12 @@ static void intel_pstate_hwp_set(const struct cpumask *cpumask)
 	int min, hw_min, max, hw_max, cpu, range, adj_range;
 	u64 value, cap;
 
-	rdmsrl(MSR_HWP_CAPABILITIES, cap);
-	hw_min = HWP_LOWEST_PERF(cap);
-	hw_max = HWP_HIGHEST_PERF(cap);
-	range = hw_max - hw_min;
-
 	for_each_cpu(cpu, cpumask) {
+		rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap);
+		hw_min = HWP_LOWEST_PERF(cap);
+		hw_max = HWP_HIGHEST_PERF(cap);
+		range = hw_max - hw_min;
+
 		rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
 		adj_range = limits->min_perf_pct * range / 100;
 		min = hw_min + adj_range;
-- 
cgit v1.2.3-59-g8ed1b

From f00593a4bdd22e0885db89df8ee8afcc6867994f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Fri, 30 Sep 2016 03:13:34 +0200
Subject: cpufreq: intel_pstate: Clarify comment in get_target_pstate_use_performance()

Make the comment explaining the meaning of the perf_scaled variable in
get_target_pstate_use_performance() more straightforward.

Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/intel_pstate.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index bc976a3db253..1c7b91c5805b 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1251,10 +1251,11 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 	u64 duration_ns;
 
 	/*
-	 * perf_scaled is the average performance during the last sampling
-	 * period scaled by the ratio of the maximum P-state to the P-state
-	 * requested last time (in percent). That measures the system's
-	 * response to the previous P-state selection.
+	 * perf_scaled is the ratio of the average P-state during the last
+	 * sampling period to the P-state requested last time (in percent).
+	 *
+	 * That measures the system's response to the previous P-state
+	 * selection.
 	 */
 	max_pstate = cpu->pstate.max_pstate_physical;
 	current_pstate = cpu->pstate.current_pstate;
-- 
cgit v1.2.3-59-g8ed1b

From 69e67a0626e503676ea2933b448f7aca8dd544f5 Mon Sep 17 00:00:00 2001
From: Axel Lin
Date: Sun, 25 Sep 2016 20:13:57 +0800
Subject: PM / devfreq: exynos-nocp: Select REGMAP_MMIO

This driver uses devm_regmap_init_mmio(), so select REGMAP_MMIO to avoid
a build failure.

Signed-off-by: Axel Lin
Acked-by: Chanwoo Choi
Signed-off-by: Rafael J. Wysocki
---
 drivers/devfreq/event/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/devfreq/event/Kconfig b/drivers/devfreq/event/Kconfig
index 0fdae8608961..cd949800eed9 100644
--- a/drivers/devfreq/event/Kconfig
+++ b/drivers/devfreq/event/Kconfig
@@ -17,6 +17,7 @@ config DEVFREQ_EVENT_EXYNOS_NOCP
 	tristate "EXYNOS NoC (Network On Chip) Probe DEVFREQ event Driver"
 	depends on ARCH_EXYNOS || COMPILE_TEST
 	select PM_OPP
+	select REGMAP_MMIO
 	help
 	  This add the devfreq-event driver for Exynos SoC. It provides NoC
 	  (Network on Chip) Probe counters to measure the bandwidth of AXI bus.
-- 
cgit v1.2.3-59-g8ed1b

From 3b91f4b361daad1f3e1c60ca5a7861e8b36f9728 Mon Sep 17 00:00:00 2001
From: Axel Lin
Date: Sun, 25 Sep 2016 20:13:58 +0800
Subject: PM / devfreq: exynos-nocp: Remove redundant code

load_count/total_count are reset by devfreq_event_get_event(), so remove
the redundant code in exynos_nocp_get_event().

Signed-off-by: Axel Lin
Acked-by: Chanwoo Choi
[ rjw: Subject/changelog ]
Signed-off-by: Rafael J. Wysocki
---
 drivers/devfreq/event/exynos-nocp.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/devfreq/event/exynos-nocp.c b/drivers/devfreq/event/exynos-nocp.c
index a5841403bde8..49e712aca0c1 100644
--- a/drivers/devfreq/event/exynos-nocp.c
+++ b/drivers/devfreq/event/exynos-nocp.c
@@ -176,9 +176,6 @@ static int exynos_nocp_get_event(struct devfreq_event_dev *edev,
 	return 0;
 
 out:
-	edata->load_count = 0;
-	edata->total_count = 0;
-
 	dev_err(nocp->dev, "Failed to read the counter of NoC probe device\n");
 
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b

From 0f376c9cd86c23f37312d37748b233660ef9d9af Mon Sep 17 00:00:00 2001
From: Axel Lin
Date: Thu, 29 Sep 2016 10:13:28 +0800
Subject: PM / devfreq: Add proper locking around list_del()

Use devfreq_list_lock around list_del() to prevent list corruption.

Signed-off-by: Axel Lin
Acked-by: MyungJoo Ham
Signed-off-by: Rafael J. Wysocki
---
 drivers/devfreq/devfreq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 478006b7764a..66d3c7184a0f 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -594,17 +594,19 @@ struct devfreq *devfreq_add_device(struct device *dev,
 	if (devfreq->governor)
 		err = devfreq->governor->event_handler(devfreq,
 					DEVFREQ_GOV_START, NULL);
-	mutex_unlock(&devfreq_list_lock);
 	if (err) {
 		dev_err(dev, "%s: Unable to start governor for the device\n",
 			__func__);
 		goto err_init;
 	}
+	mutex_unlock(&devfreq_list_lock);
 
 	return devfreq;
 
 err_init:
 	list_del(&devfreq->node);
+	mutex_unlock(&devfreq_list_lock);
+
 	device_unregister(&devfreq->dev);
 err_out:
 	return ERR_PTR(err);
-- 
cgit v1.2.3-59-g8ed1b

From d0563a039c9d35305f4cb2405665130de596d86a Mon Sep 17 00:00:00 2001
From: Tobias Jakobi
Date: Thu, 29 Sep 2016 14:36:36 +0200
Subject: PM / devfreq: Skip status update on uninitialized previous_freq

In case devfreq->previous_freq is still uninitialized in
devfreq_update_status(), i.e. it has the value '0', the lookups in that
function fail, eventually leading to an error message like:

[ 3.041292] devfreq bus_dmc: Couldn't update frequency transition information.

Just skip the status update in this situation.

Signed-off-by: Tobias Jakobi
Acked-by: MyungJoo Ham
Signed-off-by: Rafael J. Wysocki
---
 drivers/devfreq/devfreq.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers')

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 66d3c7184a0f..bf3ea7603a58 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -137,6 +137,10 @@ static int devfreq_update_status(struct devfreq *devfreq, unsigned long freq)
 
 	cur_time = jiffies;
 
+	/* Immediately exit if previous_freq is not initialized yet. */
+	if (!devfreq->previous_freq)
+		goto out;
+
 	prev_lev = devfreq_get_freq_level(devfreq, devfreq->previous_freq);
 	if (prev_lev < 0) {
 		ret = prev_lev;
-- 
cgit v1.2.3-59-g8ed1b

From 0843e83c1a4aa67e08f424430526c948d591d5f1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Thu, 6 Oct 2016 14:07:51 +0200
Subject: cpufreq: intel_pstate: Proportional algorithm for Atom

The PID algorithm used by the intel_pstate driver tends to drive
performance to the minimum for workloads with utilization below the
setpoint, which is undesirable, so replace it with a modified
"proportional" algorithm on Atom.

The new algorithm will set the new P-state to be 1.25 times the available
maximum times the (frequency-invariant) utilization during the previous
sampling period, except when the target P-state computed this way is lower
than the average P-state during the previous sampling period. In the
latter case, it will increase the target by 50% of the difference between
it and the average P-state to prevent performance from dropping down too
fast in some cases.

Signed-off-by: Rafael J. Wysocki
Tested-by: Srinivas Pandruvada
---
 drivers/cpufreq/intel_pstate.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 1c7b91c5805b..6c8c897d0a2d 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1232,6 +1232,7 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
 {
 	struct sample *sample = &cpu->sample;
 	int32_t busy_frac, boost;
+	int target, avg_pstate;
 
 	busy_frac = div_fp(sample->mperf, sample->tsc);
 
@@ -1242,7 +1243,26 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
 		busy_frac = boost;
 
 	sample->busy_scaled = busy_frac * 100;
-	return get_avg_pstate(cpu) - pid_calc(&cpu->pid, sample->busy_scaled);
+
+	target = limits->no_turbo || limits->turbo_disabled ?
+			cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
+	target += target >> 2;
+	target = mul_fp(target, busy_frac);
+	if (target < cpu->pstate.min_pstate)
+		target = cpu->pstate.min_pstate;
+
+	/*
+	 * If the average P-state during the previous cycle was higher than the
+	 * current target, add 50% of the difference to the target to reduce
+	 * possible performance oscillations and offset possible performance
+	 * loss related to moving the workload from one CPU to another within
+	 * a package/module.
+	 */
+	avg_pstate = get_avg_pstate(cpu);
+	if (avg_pstate > target)
+		target += (avg_pstate - target) >> 1;
+
+	return target;
 }
 
 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
-- 
cgit v1.2.3-59-g8ed1b

From 3954517e2f083c8aeb52bfd467b7b2c164232ffc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Tue, 11 Oct 2016 23:07:38 +0200
Subject: cpufreq: intel_pstate: Fix struct pstate_adjust_policy kerneldoc

It looks like the name of struct pstate_adjust_policy was updated without
updating its kerneldoc comment accordingly, so fix that mistake.

Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/intel_pstate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 6c8c897d0a2d..f535f8123258 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -225,7 +225,7 @@ struct cpudata {
 static struct cpudata **all_cpu_data;
 
 /**
- * struct pid_adjust_policy - Stores static PID configuration data
+ * struct pstate_adjust_policy - Stores static PID configuration data
  * @sample_rate_ms:	PID calculation sample rate in ms
  * @sample_rate_ns:	Sample rate calculation in ns
  * @deadband:		PID deadband
-- 
cgit v1.2.3-59-g8ed1b

From abb6627910a1e783c8e034b35b7c80e5e7f98f41 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Wed, 12 Oct 2016 21:47:03 +0200
Subject: cpufreq: conservative: Fix next frequency selection

Commit d352cf47d93e (cpufreq: conservative: Do not use transition
notifications) overlooked the case when the "frequency step" used by the
conservative governor is small relative to the distances between the
available frequencies, and broke the algorithm by using policy->cur
instead of the previously requested frequency when computing the next one.

As a result, the governor may not be able to go outside of a narrow range
between two consecutive available frequencies.

Fix the problem by making the governor save the previously requested
frequency and select the next one relative to that value (unless it is out
of range, in which case policy->cur will be used instead).
Fixes: d352cf47d93e (cpufreq: conservative: Do not use transition notifications)
Link: https://bugzilla.kernel.org/show_bug.cgi?id=177171
Reported-and-tested-by: Aleksey Rybalkin
Acked-by: Viresh Kumar
Cc: 4.8+ # 4.8+
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/cpufreq_conservative.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 18da4f8051d3..13475890d792 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -17,6 +17,7 @@
 struct cs_policy_dbs_info {
 	struct policy_dbs_info policy_dbs;
 	unsigned int down_skip;
+	unsigned int requested_freq;
 };
 
 static inline struct cs_policy_dbs_info *to_dbs_info(struct policy_dbs_info *policy_dbs)
@@ -61,6 +62,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
 {
 	struct policy_dbs_info *policy_dbs = policy->governor_data;
 	struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
+	unsigned int requested_freq = dbs_info->requested_freq;
 	struct dbs_data *dbs_data = policy_dbs->dbs_data;
 	struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
 	unsigned int load = dbs_update(policy);
@@ -72,10 +74,16 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
 	if (cs_tuners->freq_step == 0)
 		goto out;
 
+	/*
+	 * If requested_freq is out of range, it is likely that the limits
+	 * changed in the meantime, so fall back to current frequency in that
+	 * case.
+	 */
+	if (requested_freq > policy->max || requested_freq < policy->min)
+		requested_freq = policy->cur;
+
 	/* Check for frequency increase */
 	if (load > dbs_data->up_threshold) {
-		unsigned int requested_freq = policy->cur;
-
 		dbs_info->down_skip = 0;
 
 		/* if we are already at full speed then break out early */
@@ -83,8 +91,11 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
 			goto out;
 
 		requested_freq += get_freq_target(cs_tuners, policy);
+		if (requested_freq > policy->max)
+			requested_freq = policy->max;
 
 		__cpufreq_driver_target(policy, requested_freq, CPUFREQ_RELATION_H);
+		dbs_info->requested_freq = requested_freq;
 		goto out;
 	}
 
@@ -95,7 +106,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
 
 	/* Check for frequency decrease */
 	if (load < cs_tuners->down_threshold) {
-		unsigned int freq_target, requested_freq = policy->cur;
+		unsigned int freq_target;
 		/*
 		 * if we cannot reduce the frequency anymore, break out early
 		 */
@@ -109,6 +120,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
 			requested_freq = policy->min;
 
 		__cpufreq_driver_target(policy, requested_freq, CPUFREQ_RELATION_L);
+		dbs_info->requested_freq = requested_freq;
 	}
 
 out:
@@ -287,6 +299,7 @@ static void cs_start(struct cpufreq_policy *policy)
 	struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy->governor_data);
 
 	dbs_info->down_skip = 0;
+	dbs_info->requested_freq = policy->cur;
 }
 
 static struct dbs_governor cs_governor = {
-- 
cgit v1.2.3-59-g8ed1b

From c197d758036d8c77923ae3f88571cf198283107e Mon Sep 17 00:00:00 2001
From: Hoan Tran
Date: Thu, 13 Oct 2016 10:33:35 -0700
Subject: cpufreq: CPPC: Correct desired_perf calculation

The desired_perf is an abstract performance number. Its value should be in
the range [lowest perf, highest perf] of CPPC, so the correct calculation
is:

    desired_perf = freq * cppc_highest_perf / cppc_dmi_max_khz

Also make cppc_cpufreq_set_target() return early if desired_perf is
exactly the same as the old perf.
Signed-off-by: Hoan Tran
Reviewed-by: Prashanth Prakash
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/cppc_cpufreq.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 99db4227ae38..f7e98fc077c1 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -80,11 +80,17 @@ static int cppc_cpufreq_set_target(struct cpufreq_policy *policy,
 {
 	struct cpudata *cpu;
 	struct cpufreq_freqs freqs;
+	u32 desired_perf;
 	int ret = 0;
 
 	cpu = all_cpu_data[policy->cpu];
 
-	cpu->perf_ctrls.desired_perf = (u64)target_freq * policy->max / cppc_dmi_max_khz;
+	desired_perf = (u64)target_freq * cpu->perf_caps.highest_perf / cppc_dmi_max_khz;
+	/* Return if it is exactly the same perf */
+	if (desired_perf == cpu->perf_ctrls.desired_perf)
+		return ret;
+
+	cpu->perf_ctrls.desired_perf = desired_perf;
 	freqs.old = policy->cur;
 	freqs.new = target_freq;
 
-- 
cgit v1.2.3-59-g8ed1b
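For readers following the last changelog above, here is a small standalone C
sketch of the corrected CPPC scaling and the "same perf" early return it
describes. It is not part of any patch in this series; the highest-perf and
DMI-max constants are invented example values and desired_perf_from_freq()
is a hypothetical helper, used only to show that the target frequency is
scaled by highest_perf (not policy->max) into the abstract performance range.

#include <stdint.h>
#include <stdio.h>

/* Example values only; real numbers come from the CPPC tables and DMI. */
#define EXAMPLE_HIGHEST_PERF	300ULL		/* assumed highest abstract perf */
#define EXAMPLE_DMI_MAX_KHZ	2400000ULL	/* assumed DMI max frequency, kHz */

/* Mirrors the corrected scaling: highest_perf as the scale factor. */
static uint32_t desired_perf_from_freq(uint64_t target_freq_khz)
{
	return (uint32_t)(target_freq_khz * EXAMPLE_HIGHEST_PERF / EXAMPLE_DMI_MAX_KHZ);
}

int main(void)
{
	uint64_t requests[] = { 1200000, 1200000, 2400000 };	/* kHz */
	uint32_t prev_perf = 0;

	for (size_t i = 0; i < sizeof(requests) / sizeof(requests[0]); i++) {
		uint32_t perf = desired_perf_from_freq(requests[i]);

		if (perf == prev_perf) {
			/* Same abstract perf as last request: skip, like the early return. */
			printf("%llu kHz -> perf %u (unchanged, skipped)\n",
			       (unsigned long long)requests[i], perf);
			continue;
		}
		prev_perf = perf;
		printf("%llu kHz -> perf %u (written)\n",
		       (unsigned long long)requests[i], perf);
	}
	return 0;
}

With the example values, both 1200000 kHz requests map to perf 150, so the
second one is skipped, and 2400000 kHz maps to perf 300, the assumed maximum.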