aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/x86/kernel/smpboot.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/smpboot.c')
-rw-r--r--arch/x86/kernel/smpboot.c313
1 files changed, 310 insertions, 3 deletions
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 69881b2d446c..8c89e4d9ad28 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -147,6 +147,8 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
+static void init_freq_invariance(bool secondary);
+
/*
* Report back to the Boot Processor during boot time or to the caller processor
* during CPU online.
@@ -183,6 +185,8 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
+ init_freq_invariance(true);
+
/*
* Get our bogomips.
* Update loops_per_jiffy in cpu_data. Previous call to
@@ -466,7 +470,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
*/
static const struct x86_cpu_id snc_cpu[] = {
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
+ X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL),
{}
};
@@ -1337,7 +1341,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
-
+ init_freq_invariance(false);
smp_sanity_check();
switch (apic_intr_mode) {
@@ -1434,7 +1438,7 @@ early_param("possible_cpus", _setup_possible_cpus);
/*
* cpu_possible_mask should be static, it cannot change as cpu's
* are onlined, or offlined. The reason is per-cpu data-structures
- * are allocated by some modules at init time, and dont expect to
+ * are allocated by some modules at init time, and don't expect to
* do this dynamically on cpu arrival/departure.
* cpu_present_mask on the other hand can change dynamically.
* In case when cpu_hotplug is not compiled, then we resort to current
@@ -1764,3 +1768,306 @@ void native_play_dead(void)
}
#endif
+
+/*
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ * BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
+ */
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static DEFINE_PER_CPU(u64, arch_prev_aperf);
+static DEFINE_PER_CPU(u64, arch_prev_mperf);
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
+{
+ arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+ arch_turbo_freq_ratio;
+}
+
+static bool turbo_disabled(void)
+{
+ u64 misc_en;
+ int err;
+
+ err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ if (err)
+ return false;
+
+ return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ int err;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
+ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
+
+ return true;
+}
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define ICPU(model) \
+ {X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0}
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
+ ICPU(INTEL_FAM6_XEON_PHI_KNL),
+ ICPU(INTEL_FAM6_XEON_PHI_KNM),
+ {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
+ ICPU(INTEL_FAM6_SKYLAKE_X),
+ {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
+ ICPU(INTEL_FAM6_ATOM_GOLDMONT),
+ ICPU(INTEL_FAM6_ATOM_GOLDMONT_D),
+ ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS),
+ {}
+};
+
+static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+ int num_delta_fratio)
+{
+ int fratio, delta_fratio, found;
+ int err, i;
+ u64 msr;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ fratio = (msr >> 8) & 0xFF;
+ i = 16;
+ found = 0;
+ do {
+ if (found >= num_delta_fratio) {
+ *turbo_freq = fratio;
+ return true;
+ }
+
+ delta_fratio = (msr >> (i + 5)) & 0x7;
+
+ if (delta_fratio) {
+ found += 1;
+ fratio -= delta_fratio;
+ }
+
+ i += 8;
+ } while (i < 64);
+
+ return true;
+}
+
+static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+ u64 ratios, counts;
+ u32 group_size;
+ int err, i;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+ if (err)
+ return false;
+
+ for (i = 0; i < 64; i += 8) {
+ group_size = (counts >> i) & 0xFF;
+ if (group_size >= size) {
+ *turbo_freq = (ratios >> i) & 0xFF;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ u64 msr;
+ int err;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
+
+ /* The CPU may have less than 4 cores */
+ if (!*turbo_freq)
+ *turbo_freq = msr & 0xFF; /* 1C turbo */
+
+ return true;
+}
+
+static bool intel_set_max_freq_ratio(void)
+{
+ u64 base_freq, turbo_freq;
+
+ if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+ knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+ goto out;
+
+ if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ return false;
+
+out:
+ /*
+ * Some hypervisors advertise X86_FEATURE_APERFMPERF
+ * but then fill all MSR's with zeroes.
+ */
+ if (!base_freq) {
+ pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
+ base_freq);
+ arch_set_max_freq_ratio(turbo_disabled());
+ return true;
+}
+
+static void init_counter_refs(void)
+{
+ u64 aperf, mperf;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+
+ this_cpu_write(arch_prev_aperf, aperf);
+ this_cpu_write(arch_prev_mperf, mperf);
+}
+
+static void init_freq_invariance(bool secondary)
+{
+ bool ret = false;
+
+ if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
+ return;
+
+ if (secondary) {
+ if (static_branch_likely(&arch_scale_freq_key)) {
+ init_counter_refs();
+ }
+ return;
+ }
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ ret = intel_set_max_freq_ratio();
+
+ if (ret) {
+ init_counter_refs();
+ static_branch_enable(&arch_scale_freq_key);
+ } else {
+ pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
+ }
+}
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+
+void arch_scale_freq_tick(void)
+{
+ u64 freq_scale;
+ u64 aperf, mperf;
+ u64 acnt, mcnt;
+
+ if (!arch_scale_freq_invariant())
+ return;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+
+ acnt = aperf - this_cpu_read(arch_prev_aperf);
+ mcnt = mperf - this_cpu_read(arch_prev_mperf);
+ if (!mcnt)
+ return;
+
+ this_cpu_write(arch_prev_aperf, aperf);
+ this_cpu_write(arch_prev_mperf, mperf);
+
+ acnt <<= 2*SCHED_CAPACITY_SHIFT;
+ mcnt *= arch_max_freq_ratio;
+
+ freq_scale = div64_u64(acnt, mcnt);
+
+ if (freq_scale > SCHED_CAPACITY_SCALE)
+ freq_scale = SCHED_CAPACITY_SCALE;
+
+ this_cpu_write(arch_freq_scale, freq_scale);
+}